Matplotlib / Seaborn Notebook

Matplotlib is Python's foundational plotting library. Seaborn is built on top of it and handles statistical charts with less boilerplate. In practice, use Seaborn for most plots and drop into Matplotlib when you need fine-grained control.

All examples use the same parking dataset as the Pandas Notebook: parking_df, payment_df, and station_df, plus aggregated results like monthly and station_summary that you'd produce from those workflows. For choosing the right chart type for your data, see the Visualization Selection Guide.

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import pandas as pd
import numpy as np

Setup & Style

# One-time notebook setup: run this cell before any plotting cell.
# Seaborn theme: white background with grid, 'muted' categorical colors.
sns.set_theme(palette='muted', style='whitegrid')

# Default canvas for every new figure, given as (width, height) in inches.
plt.rcParams.update({'figure.figsize': (10, 5)})

style='whitegrid' gives a white background with subtle grid lines and little visual clutter. Set the figure size globally so you don't have to repeat it on every chart.

Line Chart — Trend Over Time

# Monthly revenue trend: total `amount` per calendar month of `entry_time`.
monthly = (
    parking_df['amount']
    .groupby(parking_df['entry_time'].dt.to_period('M'))
    .sum()
    .reset_index()
    .set_axis(['month', 'revenue'], axis=1)
    # Period values are turned into text (e.g. '2024-01') so they can be
    # used directly as x-axis labels.
    .assign(month_str=lambda frame: frame['month'].astype(str))
)

Basic Line

# Basic line chart of monthly revenue, drawn entirely through the explicit
# Figure/Axes objects returned by plt.subplots().
fig, ax = plt.subplots()
ax.plot(monthly['month_str'], monthly['revenue'], marker='o')
ax.set_title('Monthly Revenue')
ax.set_xlabel('Month')
ax.set_ylabel('Revenue (NTD)')
# Rotate labels on this Axes (not via plt.xticks) so the call does not depend
# on which Axes happens to be "current".
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

With Rolling Average Overlay

# 3-month centered rolling mean; the window is centered on each month.
monthly['rolling_3m'] = monthly['revenue'].rolling(3, center=True).mean()

# Raw monthly series plus the smoothed series on the same Axes.
fig, ax = plt.subplots()
ax.plot(monthly['month_str'], monthly['revenue'], marker='o', label='Monthly')
ax.plot(monthly['month_str'], monthly['rolling_3m'], linestyle='--', label='3M Avg')
ax.set_title('Monthly Revenue with 3-Month Rolling Average')
ax.legend()
# Rotate labels on this Axes (not via plt.xticks) so the call does not depend
# on which Axes happens to be "current".
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

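center=True on the rolling mean centers the window on each point, so the smoothed line doesn't lag behind the actual values. With a 3-month window, the first and last months have no full window and come out as NaN, so the dashed line is one point shorter at each end.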

Bar Chart — Categorical Comparison

Vertical Bar (Seaborn)

# Total revenue per station, largest first.
station_rev = (
    parking_df
    .groupby('station_code')['amount']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

# Vertical bars: one bar per station, in the order of station_rev.
fig, ax = plt.subplots()
sns.barplot(x='station_code', y='amount', data=station_rev, ax=ax)
ax.set(
    title='Total Revenue by Station',
    xlabel='Station',
    ylabel='Revenue (NTD)',
)
fig.tight_layout()
plt.show()

Horizontal Bar (better when category labels are long)

# Same data as the vertical chart, with x and y swapped so the station labels
# run down the y-axis.
fig, ax = plt.subplots()
sns.barplot(
    data=station_rev,
    x='amount',
    y='station_code',
    orient='h',
    ax=ax,
)
ax.set(title='Total Revenue by Station')
fig.tight_layout()
plt.show()

Stacked Bar — Composition by Group

# Station x payment-method revenue table; combinations with no rows become 0.
pivot = parking_df.pivot_table(
    values='amount', index='station_code',
    columns='payment_method', aggfunc='sum', fill_value=0
)

# DataFrame.plot returns the Axes it drew on; keep it so that styling is
# applied to that exact chart instead of to pyplot's "current" Axes.
# rot=45 sets the x tick-label rotation at draw time.
ax = pivot.plot(kind='bar', stacked=True, figsize=(10, 5), rot=45)
ax.set_title('Revenue by Station and Payment Method')
ax.set_ylabel('Revenue (NTD)')
ax.figure.tight_layout()
plt.show()

Use .plot() on a DataFrame directly for stacked bar charts — Seaborn's barplot has no stacked option.

Histogram — Single Variable Distribution

# Distribution of a single numeric column: 30 bins plus a KDE curve.
fig, ax = plt.subplots()
sns.histplot(parking_df['amount'], ax=ax, bins=30, kde=True)
ax.set(
    title='Distribution of Parking Amounts',
    xlabel='Amount (NTD)',
)
fig.tight_layout()
plt.show()

kde=True overlays a smooth density curve. Useful for seeing whether the distribution is skewed.

Compare Two Groups

# One histogram per parking_type, drawn on shared bins in the same Axes.
fig, ax = plt.subplots()
sns.histplot(
    data=parking_df,
    x='amount',
    hue='parking_type',
    bins=30,
    kde=True,
    ax=ax,
)
ax.set(title='Amount Distribution by Parking Type')
fig.tight_layout()
plt.show()

hue splits into overlapping histograms — good for 2–3 groups. More than that gets hard to read.

Box Plot — Distribution by Group

# One box per parking_type summarising the spread of `amount`.
fig, ax = plt.subplots()
sns.boxplot(x='parking_type', y='amount', data=parking_df, ax=ax)
ax.set(title='Amount Distribution by Parking Type')
fig.tight_layout()
plt.show()

Box = IQR (25th–75th percentile), line = median, whiskers = the most extreme data points within 1.5×IQR of the box edges, dots = points beyond the whiskers (outliers).

With Data Points (Strip)

# Box plot with the raw observations drawn on top as small translucent dots.
fig, ax = plt.subplots()

# Both layers share the same data mapping and target Axes.
by_type = dict(data=parking_df, x='parking_type', y='amount', ax=ax)
sns.boxplot(**by_type)
sns.stripplot(color='black', alpha=0.3, size=2, **by_type)

fig.tight_layout()
plt.show()

Overlay stripplot on top of boxplot to show individual points alongside the summary — helpful when n is small.

Scatter Plot — Relationship Between Two Numeric Variables

# Duration vs. amount, colored by parking_type; alpha softens overplotting.
fig, ax = plt.subplots()
sns.scatterplot(
    data=parking_df,
    x='duration_mins',
    y='amount',
    hue='parking_type',
    alpha=0.5,
    ax=ax,
)
ax.set(title='Duration vs Amount')
fig.tight_layout()
plt.show()

With Regression Line

# Scatter plus a fitted regression line; the points are made translucent via
# scatter_kws so the line stays readable.
fig, ax = plt.subplots()
sns.regplot(
    data=parking_df,
    x='duration_mins',
    y='amount',
    scatter_kws={'alpha': 0.3},
    ax=ax,
)
ax.set(title='Duration vs Amount (with trend line)')
fig.tight_layout()
plt.show()

regplot fits a linear regression and shows a confidence interval band. Use when you want to visualize correlation direction at a glance.

Heatmap — Correlation Matrix or Pivot Table

Correlation Matrix

# Pairwise correlations between the numeric columns only.
corr = parking_df.select_dtypes(include='number').corr()

# Annotated heatmap with the color scale pinned to the full [-1, 1] range.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    ax=ax,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
)
ax.set(title='Correlation Matrix')
fig.tight_layout()
plt.show()

annot=True prints the value in each cell. fmt='.2f' formats it to 2 decimal places. vmin/vmax fixes the color scale so -1 is always blue and 1 is always red.

Pivot Heatmap (Revenue by Station × Month)

# Revenue table: one row per station, one column per month number (1-12).
# NOTE(review): .dt.month merges the same month of different years into one
# column — confirm the data covers a single year.
pivot = parking_df.pivot_table(
    index='station_code',
    columns=parking_df['entry_time'].dt.month,
    values='amount',
    aggfunc='sum',
)

# Wide figure so twelve annotated columns fit; values shown without decimals.
fig, ax = plt.subplots(figsize=(12, 5))
sns.heatmap(pivot, ax=ax, cmap='YlOrRd', annot=True, fmt='.0f')
ax.set(title='Revenue by Station × Month', xlabel='Month')
fig.tight_layout()
plt.show()

Subplots — Multiple Charts in One Figure

# Two panels side by side in one figure.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
left_ax, right_ax = axes

# Left panel: revenue by station.
sns.barplot(x='station_code', y='amount', data=station_rev, ax=left_ax)
left_ax.set_title('Revenue by Station')

# Right panel: amount distribution.
sns.histplot(parking_df['amount'], ax=right_ax, bins=30, kde=True)
right_ax.set_title('Amount Distribution')

# Figure-level title, placed slightly above the panels (y=1.02).
fig.suptitle('Parking System Overview', fontsize=14, y=1.02)
fig.tight_layout()
plt.show()

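Always pass ax=axes[i] to Seaborn's axes-level functions when using subplots — otherwise they draw on the current Axes (normally the last subplot created), so every chart ends up layered in the same panel.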

2×2 Grid

# Four panels in a 2x2 grid; unpack the 2-D axes array into named panels.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(top_left, top_right), (bottom_left, bottom_right) = axes

# Top-left: revenue by station.
sns.barplot(x='station_code', y='amount', data=station_rev, ax=top_left)
top_left.set_title('Revenue by Station')

# Top-right: amount distribution.
sns.histplot(parking_df['amount'], ax=top_right, bins=30, kde=True)
top_right.set_title('Amount Distribution')

# Bottom-left: amount by parking type.
sns.boxplot(x='parking_type', y='amount', data=parking_df, ax=bottom_left)
bottom_left.set_title('Amount by Type')

# Bottom-right: duration vs. amount.
sns.scatterplot(
    x='duration_mins', y='amount', data=parking_df,
    alpha=0.3, ax=bottom_right,
)
bottom_right.set_title('Duration vs Amount')

fig.tight_layout()
plt.show()

Access subplots by axes[row, col] for 2D grids, axes[i] for 1D rows.

Styling & Formatting

Axis Labels & Title

# Template: title (with explicit font size), axis labels, rotated x ticks.
# `ax` is whichever Axes you are currently styling.
ax.set_title('Chart Title', fontsize=14)
ax.set(xlabel='X Label', ylabel='Y Label')
ax.tick_params(axis='x', rotation=45)

Format Y-Axis as Currency

def _format_currency(value, _pos):
    """Render a tick value as '$' + thousands-separated integer, e.g. $12,345."""
    return f'${value:,.0f}'


# Apply the formatter to every major tick on the y-axis.
ax.yaxis.set_major_formatter(mticker.FuncFormatter(_format_currency))

Format Y-Axis as Percentage

# xmax=1 means the plotted values are fractions (1.0 is shown as 100%);
# decimals=0 drops the fractional part of the label.
percent_fmt = mticker.PercentFormatter(xmax=1, decimals=0)
ax.yaxis.set_major_formatter(percent_fmt)

Add Value Labels on Bar Chart

# Label every bar with its height (no decimals), 3 points clear of the bar end.
# ax.containers holds one container per bar series drawn on the Axes.
for bars in ax.containers:
    ax.bar_label(bars, padding=3, fmt='%.0f')

Annotate a Specific Point

# Red "Peak" label with an arrow pointing at (peak_x, peak_y); the text sits
# 5000 y-units above the point.
# NOTE(review): peak_x / peak_y are not defined in this notebook — set them
# to the coordinates of the point you want to highlight first.
peak_point = (peak_x, peak_y)
label_position = (peak_x, peak_y + 5000)
ax.annotate(
    'Peak',
    xy=peak_point,
    xytext=label_position,
    color='red',
    arrowprops={'arrowstyle': '->', 'color': 'red'},
)

Common Color Palettes

Palette    | Use case
'muted'    | Default categorical, low contrast
'Set2'     | Categorical, colorblind-friendly
'coolwarm' | Diverging (positive / negative)
'YlOrRd'   | Sequential (low → high intensity)
'Blues'    | Sequential single color
# Two scopes for choosing a palette: notebook-wide, or a single chart.
# NOTE(review): the `...` below is a placeholder for the usual data/x/y
# arguments — the second line is illustrative, not runnable as written.
# NOTE(review): recent seaborn releases appear to warn when `palette` is
# passed without `hue` — confirm against the installed version.
sns.set_palette('Set2')                          # apply globally
sns.barplot(..., palette='Set2')                 # apply to one chart

Save Figure

# Write the figure to output.png at 150 dots per inch, with the bounding box
# tightened around the drawn content.
fig.savefig(
    'output.png',
    bbox_inches='tight',
    dpi=150,
)

bbox_inches='tight' prevents titles and labels from being clipped at the edges.

Common DA Workflows

1. Monthly Trend Report

# --- Data: revenue per calendar month plus derived trend columns ---
monthly = (
    parking_df
    .groupby(parking_df['entry_time'].dt.to_period('M'))['amount']
    .sum()
    .reset_index()
)
monthly.columns = ['month', 'revenue']
monthly = monthly.assign(
    month_str=monthly['month'].astype(str),                         # x-axis labels
    rolling_3m=monthly['revenue'].rolling(3, center=True).mean(),   # centered 3-month mean
    mom_pct=monthly['revenue'].pct_change() * 100,                  # month-over-month %
)

# --- Figure: trend on the left, month-over-month change on the right ---
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
trend_ax, mom_ax = axes

# Left panel: raw revenue with the rolling average overlaid.
trend_ax.plot(monthly['month_str'], monthly['revenue'], marker='o', label='Monthly')
trend_ax.plot(monthly['month_str'], monthly['rolling_3m'], linestyle='--', label='3M Avg')
trend_ax.set_title('Monthly Revenue')
trend_ax.legend()

# Right panel: MoM % change. The first month has no predecessor, so its NaN
# is plotted as 0. Bars are green for values >= 0 and red otherwise.
mom_change = monthly['mom_pct'].fillna(0)
bar_colors = np.where(mom_change >= 0, 'green', 'red').tolist()
mom_ax.bar(monthly['month_str'], mom_change, color=bar_colors)
mom_ax.axhline(0, color='black', linewidth=0.8)
mom_ax.set_title('Month-over-Month Change (%)')

# Month labels are long, so tilt them on both panels.
for panel in axes:
    panel.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

2. Station Performance Dashboard

# Per-station KPIs: total revenue, number of visits, average amount.
# Rows are ordered by revenue, highest first.
station_summary = (
    parking_df
    .groupby('station_code')
    .agg(
        revenue=('amount', 'sum'),
        visits=('parking_id', 'count'),
        avg_amt=('amount', 'mean'),
    )
    .reset_index()
    .sort_values('revenue', ascending=False)
)

# One bar chart per KPI, left to right.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
panels = [
    ('revenue', 'Revenue by Station'),
    ('visits', 'Visits by Station'),
    ('avg_amt', 'Avg Amount by Station'),
]
for panel_ax, (metric, panel_title) in zip(axes, panels):
    sns.barplot(data=station_summary, x='station_code', y=metric, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Only the revenue panel gets value labels on its bars.
revenue_ax = axes[0]
for bars in revenue_ax.containers:
    revenue_ax.bar_label(bars, fmt='%.0f', padding=3, fontsize=8)

fig.suptitle('Station Performance Overview', fontsize=14, y=1.02)
fig.tight_layout()
plt.show()

3. EDA Distribution Overview

# First-pass EDA: one histogram (with KDE) per numeric column, in a single row.
numeric_cols = parking_df.select_dtypes(include='number').columns.tolist()

n = len(numeric_cols)
if n == 0:
    # plt.subplots(1, 0) would raise, so bail out with a clear message.
    print('No numeric columns to plot.')
else:
    # squeeze=False always returns a 2-D array of Axes, even when n == 1
    # (the default would return a bare Axes that cannot be indexed).
    fig, axes = plt.subplots(1, n, figsize=(6 * n, 4), squeeze=False)
    axes = axes[0]  # the single row, as a 1-D array of length n

    for col_ax, col in zip(axes, numeric_cols):
        # Missing values are dropped before binning.
        sns.histplot(parking_df[col].dropna(), bins=30, kde=True, ax=col_ax)
        col_ax.set_title(f'Distribution: {col}')

    fig.tight_layout()
    plt.show()

One histogram per numeric column — useful as a first-pass EDA step to spot skew and outliers before modeling.

4. Correlation Heatmap + Scatter Matrix

# Both charts work on the numeric columns only; select them once.
numeric_df = parking_df.select_dtypes(include='number')

# 1) Correlation heatmap, color scale pinned to [-1, 1].
corr = numeric_df.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    ax=ax,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
)
ax.set(title='Correlation Matrix')
fig.tight_layout()
plt.show()

# 2) Scatter matrix of every numeric pair, with KDE curves on the diagonal.
pd.plotting.scatter_matrix(
    numeric_df,
    alpha=0.3,
    diagonal='kde',
    figsize=(10, 10),
)
plt.suptitle('Scatter Matrix', y=1.01)
plt.tight_layout()
plt.show()

Run these together at the start of any analysis — correlation heatmap for a numeric overview, scatter matrix to visually confirm which correlations are real vs. driven by a few outliers.