Matplotlib / Seaborn Notebook
Matplotlib is Python's foundational plotting library. Seaborn is built on top of it and handles statistical charts with less boilerplate. In practice, use Seaborn for most plots and drop into Matplotlib when you need fine-grained control.
All examples use the same parking dataset as the Pandas Notebook: parking_df, payment_df, and station_df, plus aggregated results such as monthly and station_summary that you'd produce from those workflows. For help choosing the right chart type for your data, see the Visualization Selection Guide.
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import pandas as pd
import numpy as np
Setup & Style
# Notebook-wide defaults: run once, before drawing any chart.
# Theme first (white background with grid, muted categorical colors) ...
sns.set_theme(palette='muted', style='whitegrid')
# ... then the default figure size, given as (width, height) in inches.
plt.rcParams.update({'figure.figsize': (10, 5)})
style='whitegrid' gives horizontal grid lines without visual clutter. Set figure size globally so you don't repeat it on every chart.
Line Chart — Trend Over Time
# Monthly revenue trend: bucket every row by the calendar month of its
# entry_time, then total the amounts per bucket.
month_key = parking_df['entry_time'].dt.to_period('M')
monthly = parking_df.groupby(month_key)['amount'].sum().reset_index()
monthly.columns = ['month', 'revenue']
# Period values are turned into plain strings so they can be used on the x-axis.
monthly['month_str'] = monthly['month'].astype(str)
Basic Line
# One line with a marker at every month.
fig, ax = plt.subplots()
ax.plot(monthly['month_str'], monthly['revenue'], marker='o')
ax.set(
    title='Monthly Revenue',
    xlabel='Month',
    ylabel='Revenue (NTD)',
)
# Tilt the month labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
With Rolling Average Overlay
# 3-month moving average, centered on each month.
monthly['rolling_3m'] = monthly['revenue'].rolling(window=3, center=True).mean()

fig, ax = plt.subplots()
# Draw the raw series first, then the smoothed series on the same axes.
for column, line_style in (
    ('revenue', {'marker': 'o', 'label': 'Monthly'}),
    ('rolling_3m', {'linestyle': '--', 'label': '3M Avg'}),
):
    ax.plot(monthly['month_str'], monthly[column], **line_style)
ax.set_title('Monthly Revenue with 3-Month Rolling Average')
ax.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
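center=True on the rolling mean centers the window on each point, so the smoothed line doesn't lag behind the actual values. The trade-off is that the first and last months have no full window and come out as NaN, so the dashed line is one point shorter at each end.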
Bar Chart — Categorical Comparison
Vertical Bar (Seaborn)
# Total revenue per station, largest first, as a two-column frame.
station_totals = parking_df.groupby('station_code')['amount'].sum()
station_rev = station_totals.sort_values(ascending=False).reset_index()

fig, ax = plt.subplots()
sns.barplot(x='station_code', y='amount', data=station_rev, ax=ax)
# Replace the column-name axis labels seaborn fills in with readable ones.
ax.set(
    title='Total Revenue by Station',
    xlabel='Station',
    ylabel='Revenue (NTD)',
)
plt.tight_layout()
plt.show()
Horizontal Bar (better when category labels are long)
# Same data as the vertical chart, with the category on the y-axis so the
# bars run left-to-right.
fig, ax = plt.subplots()
sns.barplot(
    x='amount',
    y='station_code',
    orient='h',
    data=station_rev,
    ax=ax,
)
ax.set_title('Total Revenue by Station')
plt.tight_layout()
plt.show()
Stacked Bar — Composition by Group
# Wide table: one row per station, one column per payment method, cells hold
# summed revenue (0 where a station/method combination has no rows).
pivot = parking_df.pivot_table(
    index='station_code',
    columns='payment_method',
    values='amount',
    aggfunc='sum',
    fill_value=0,
)
# Each payment-method column becomes one segment of the station's bar.
pivot.plot.bar(stacked=True, figsize=(10, 5))
plt.title('Revenue by Station and Payment Method')
plt.ylabel('Revenue (NTD)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Use .plot() on a DataFrame directly for stacked bar charts — Seaborn's barplot has no stacked option.
Histogram — Single Variable Distribution
# Histogram of a single numeric column with a density curve on top.
fig, ax = plt.subplots()
sns.histplot(data=parking_df['amount'], bins=30, kde=True, ax=ax)
ax.set(
    title='Distribution of Parking Amounts',
    xlabel='Amount (NTD)',
)
plt.tight_layout()
plt.show()
kde=True overlays a smooth density curve. Useful for seeing whether the distribution is skewed.
Compare Two Groups
# One histogram per parking_type, drawn on shared axes for comparison.
fig, ax = plt.subplots()
sns.histplot(
    x='amount',
    hue='parking_type',
    data=parking_df,
    bins=30,
    kde=True,
    ax=ax,
)
ax.set_title('Amount Distribution by Parking Type')
plt.tight_layout()
plt.show()
hue splits into overlapping histograms — good for 2–3 groups. More than that gets hard to read.
Box Plot — Distribution by Group
# One box per parking_type summarising the spread of `amount`.
fig, ax = plt.subplots()
sns.boxplot(
    x='parking_type',
    y='amount',
    data=parking_df,
    ax=ax,
)
ax.set_title('Amount Distribution by Parking Type')
plt.tight_layout()
plt.show()
Box = IQR (25th–75th percentile), line = median, whiskers = the most extreme data points within 1.5×IQR of the box edges, dots = outliers beyond the whiskers.
With Data Points (Strip)
# Box plot for the summary, strip plot for the individual observations.
fig, ax = plt.subplots()
# showfliers=False: the strip layer below already draws every observation,
# so the boxplot's own outlier markers would plot those points a second time.
sns.boxplot(data=parking_df, x='parking_type', y='amount',
            showfliers=False, ax=ax)
# Small, semi-transparent black dots keep the boxes readable underneath.
sns.stripplot(data=parking_df, x='parking_type', y='amount',
              color='black', alpha=0.3, size=2, ax=ax)
plt.tight_layout()
plt.show()
Overlay stripplot on top of boxplot to show individual points alongside the summary — helpful when n is small.
Scatter Plot — Relationship Between Two Numeric Variables
# Duration against amount, one color per parking_type; alpha softens
# overplotting where points pile up.
fig, ax = plt.subplots()
sns.scatterplot(
    x='duration_mins',
    y='amount',
    hue='parking_type',
    data=parking_df,
    alpha=0.5,
    ax=ax,
)
ax.set_title('Duration vs Amount')
plt.tight_layout()
plt.show()
With Regression Line
# Scatter plus fitted trend line; scatter_kws styles only the points.
fig, ax = plt.subplots()
point_style = {'alpha': 0.3}
sns.regplot(
    x='duration_mins',
    y='amount',
    data=parking_df,
    scatter_kws=point_style,
    ax=ax,
)
ax.set_title('Duration vs Amount (with trend line)')
plt.tight_layout()
plt.show()
regplot fits a linear regression and shows a confidence interval band. Use when you want to visualize correlation direction at a glance.
Heatmap — Correlation Matrix or Pivot Table
Correlation Matrix
# Pairwise correlations between the numeric columns only.
numeric_df = parking_df.select_dtypes(include='number')
corr = numeric_df.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.tight_layout()
plt.show()
annot=True prints the value in each cell. fmt='.2f' formats it to 2 decimal places. vmin/vmax fix the color scale so that -1 is always blue and 1 is always red, regardless of the range actually present in the data.
Pivot Heatmap (Revenue by Station × Month)
# Station x month revenue grid.
# NOTE: .dt.month yields the month number only, so rows from the same
# calendar month in different years are summed into the same column.
entry_month = parking_df['entry_time'].dt.month
pivot = parking_df.pivot_table(
    index='station_code',
    columns=entry_month,
    values='amount',
    aggfunc='sum',
)

fig, ax = plt.subplots(figsize=(12, 5))
sns.heatmap(pivot, cmap='YlOrRd', annot=True, fmt='.0f', ax=ax)
ax.set(title='Revenue by Station × Month', xlabel='Month')
plt.tight_layout()
plt.show()
Subplots — Multiple Charts in One Figure
# Two panels side by side in one figure.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
left_ax, right_ax = axes

# Left panel: revenue by station.
sns.barplot(x='station_code', y='amount', data=station_rev, ax=left_ax)
left_ax.set_title('Revenue by Station')

# Right panel: amount distribution.
sns.histplot(data=parking_df['amount'], bins=30, kde=True, ax=right_ax)
right_ax.set_title('Amount Distribution')

# Figure-level title, placed slightly above the panels.
fig.suptitle('Parking System Overview', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()
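Always pass ax=axes[i] to Seaborn's axes-level functions (barplot, histplot, boxplot, …) when using subplots — otherwise they draw on whichever axes is currently active (usually the last one created) rather than the panel you intended. Figure-level functions such as relplot, catplot, and displot always create their own figure and do not accept ax.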
2×2 Grid
# Four panels in a 2x2 grid; unpack the grid into one name per cell.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(top_left, top_right), (bottom_left, bottom_right) = axes

# Top-left: revenue by station.
sns.barplot(x='station_code', y='amount', data=station_rev, ax=top_left)
top_left.set_title('Revenue by Station')

# Top-right: amount distribution.
sns.histplot(data=parking_df['amount'], bins=30, kde=True, ax=top_right)
top_right.set_title('Amount Distribution')

# Bottom-left: amount by parking type.
sns.boxplot(x='parking_type', y='amount', data=parking_df, ax=bottom_left)
bottom_left.set_title('Amount by Type')

# Bottom-right: duration against amount.
sns.scatterplot(
    x='duration_mins',
    y='amount',
    data=parking_df,
    alpha=0.3,
    ax=bottom_right,
)
bottom_right.set_title('Duration vs Amount')

plt.tight_layout()
plt.show()
Access subplots by axes[row, col] for 2D grids, axes[i] for 1D rows.
Styling & Formatting
Axis Labels & Title
# Title (with an explicit font size), both axis labels, and tilted x tick labels.
ax.set_title('Chart Title', fontsize=14)
ax.set(xlabel='X Label', ylabel='Y Label')
ax.tick_params(axis='x', rotation=45)
Format Y-Axis as Currency
def _as_currency(x, _pos):
    """Render a tick value as whole dollars with thousands separators."""
    return f'${x:,.0f}'


# FuncFormatter calls the function with (value, position) for every y tick.
ax.yaxis.set_major_formatter(mticker.FuncFormatter(_as_currency))
Format Y-Axis as Percentage
# xmax=1: a data value of 1 is shown as 100%; decimals=0: whole percentages.
percent_fmt = mticker.PercentFormatter(xmax=1, decimals=0)
ax.yaxis.set_major_formatter(percent_fmt)
Add Value Labels on Bar Chart
# Write each bar's value at its end, rounded to a whole number.
# `padding` is the gap between the bar end and the text, in points.
for container in ax.containers:
    ax.bar_label(container, fmt='%.0f', padding=3)
Annotate a Specific Point
# Red 'Peak' label with an arrow pointing at (peak_x, peak_y); the text sits
# 5000 data units above the point it marks.
label_position = (peak_x, peak_y + 5000)
ax.annotate(
    'Peak',
    xy=(peak_x, peak_y),
    xytext=label_position,
    color='red',
    arrowprops={'arrowstyle': '->', 'color': 'red'},
)
Common Color Palettes
| Palette | Use case |
|---|---|
| 'muted' | Default categorical, low contrast |
| 'Set2' | Categorical, colorblind-friendly |
| 'coolwarm' | Diverging (positive / negative) |
| 'YlOrRd' | Sequential (low → high intensity) |
| 'Blues' | Sequential single color |
# Two scopes for choosing a palette: notebook-wide, or a single call.
sns.set_palette('Set2') # apply globally
# The `...` stands in for the chart's usual data/x/y arguments.
# NOTE(review): recent seaborn releases appear to warn when `palette` is given
# without `hue` — confirm against the installed version.
sns.barplot(..., palette='Set2') # apply to one chart
Save Figure
# Write the figure to disk at 150 dots per inch, cropped to its contents.
fig.savefig(
    'output.png',
    bbox_inches='tight',
    dpi=150,
)
bbox_inches='tight' prevents titles and labels from being clipped at the edges.
Common DA Workflows
1. Monthly Trend Report
# --- Aggregate: revenue per calendar month plus two derived series ---
month_key = parking_df['entry_time'].dt.to_period('M')
monthly = parking_df.groupby(month_key)['amount'].sum().reset_index()
monthly.columns = ['month', 'revenue']
monthly['month_str'] = monthly['month'].astype(str)
monthly['rolling_3m'] = monthly['revenue'].rolling(window=3, center=True).mean()
monthly['mom_pct'] = monthly['revenue'].pct_change() * 100

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
trend_ax, change_ax = axes

# --- Left panel: monthly revenue with the rolling average on top ---
trend_ax.plot(monthly['month_str'], monthly['revenue'], marker='o', label='Monthly')
trend_ax.plot(monthly['month_str'], monthly['rolling_3m'], linestyle='--', label='3M Avg')
trend_ax.set_title('Monthly Revenue')
trend_ax.legend()
trend_ax.tick_params(axis='x', rotation=45)

# --- Right panel: month-over-month % change ---
# The first month has no predecessor, so its change is NaN; plot it as 0.
mom_filled = monthly['mom_pct'].fillna(0)
# Red for a decline, green otherwise (a flat month counts as green).
colors = ['red' if v < 0 else 'green' for v in mom_filled]
change_ax.bar(monthly['month_str'], mom_filled, color=colors)
change_ax.axhline(0, color='black', linewidth=0.8)
change_ax.set_title('Month-over-Month Change (%)')
change_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()
2. Station Performance Dashboard
# Per-station summary: total revenue, number of rows, and mean amount,
# ordered by revenue (highest first).
station_summary = (
    parking_df.groupby('station_code')
    .agg(
        revenue=('amount', 'sum'),
        visits=('parking_id', 'count'),
        avg_amt=('amount', 'mean'),
    )
    .reset_index()
    .sort_values('revenue', ascending=False)
)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One bar chart per metric, left to right; every panel shares the station axis.
panels = [
    ('revenue', 'Revenue by Station'),
    ('visits', 'Visits by Station'),
    ('avg_amt', 'Avg Amount by Station'),
]
for panel_ax, (metric, title) in zip(axes, panels):
    sns.barplot(data=station_summary, x='station_code', y=metric, ax=panel_ax)
    panel_ax.set_title(title)

# Value labels on the revenue panel only.
for c in axes[0].containers:
    axes[0].bar_label(c, fmt='%.0f', padding=3, fontsize=8)

plt.suptitle('Station Performance Overview', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()
3. EDA Distribution Overview
# One histogram (with density curve) per numeric column, laid out in a single row.
numeric_cols = parking_df.select_dtypes(include='number').columns.tolist()
n = len(numeric_cols)

if n == 0:
    # plt.subplots(1, 0) raises, so skip the figure when there is nothing to plot.
    print('No numeric columns to plot.')
else:
    # squeeze=False always returns a 2-D array of axes. Without it a single
    # numeric column yields a bare Axes object and axes[i] below would fail.
    fig, axes = plt.subplots(1, n, figsize=(6 * n, 4), squeeze=False)
    axes = axes.ravel()  # flatten to 1-D so axes[i] works for any n
    for i, col in enumerate(numeric_cols):
        # Missing values are dropped per column before binning.
        sns.histplot(parking_df[col].dropna(), bins=30, kde=True, ax=axes[i])
        axes[i].set_title(f'Distribution: {col}')
    plt.tight_layout()
    plt.show()
One histogram per numeric column — useful as a first-pass EDA step to spot skew and outliers before modeling.
4. Correlation Heatmap + Scatter Matrix
# Both views use the same numeric-only slice of the data.
numeric_df = parking_df.select_dtypes(include='number')

# --- Correlation heatmap ---
corr = numeric_df.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.tight_layout()
plt.show()

# --- Scatter matrix: every numeric pair, density curves on the diagonal ---
pd.plotting.scatter_matrix(
    numeric_df,
    alpha=0.3,
    diagonal='kde',
    figsize=(10, 10),
)
plt.suptitle('Scatter Matrix', y=1.01)
plt.tight_layout()
plt.show()
Run these together at the start of any analysis — correlation heatmap for a numeric overview, scatter matrix to visually confirm which correlations are real vs. driven by a few outliers.