Matplotlib / Seaborn Notebook
Matplotlib is Python's base plotting library. Seaborn is built on top of it and handles statistical charts with less code. In real work, use Seaborn for most plots and drop into Matplotlib when you need fine-grained control.
All examples use the same parking dataset from Pandas Notebook: parking_df, payment_df, and station_df, plus aggregated results like monthly and station_summary that you would get from those workflows. For picking the right chart for your data, see Visualization Selection Guide.
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import pandas as pd
import numpy as np
Setup & Style
# Notebook-wide look: set the Seaborn theme once, before any chart is drawn.
theme_options = {'style': 'whitegrid', 'palette': 'muted'}
sns.set_theme(**theme_options)
# Default size for every new figure, as (width, height) in inches.
plt.rcParams.update({'figure.figsize': (10, 5)})
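style='whitegrid' gives a white background with light grid lines, which keeps values easy to read without adding much visual noise. Set the default figure size once so you don't have to repeat it on every chart; pass figsize=... to plt.subplots() when a single chart needs a different size.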
Line Chart — Trend Over Time
# Monthly revenue trend: total `amount` per calendar month of `entry_time`.
entry_month = parking_df['entry_time'].dt.to_period('M')
monthly = parking_df.groupby(entry_month)['amount'].sum().reset_index()
monthly.columns = ['month', 'revenue']
# String copy of the Period column, used as the x-axis values when plotting.
monthly['month_str'] = monthly['month'].astype(str)
Basic Line
# Single line chart of monthly revenue, one marker per month.
fig, ax = plt.subplots()
ax.plot(monthly['month_str'], monthly['revenue'], marker='o')
ax.set(title='Monthly Revenue', xlabel='Month', ylabel='Revenue (NTD)')
plt.xticks(rotation=45)  # slant the month labels
fig.tight_layout()
plt.show()
With Rolling Average Overlay
# 3-month centred moving average, drawn over the raw monthly series.
monthly['rolling_3m'] = monthly['revenue'].rolling(window=3, center=True).mean()

fig, ax = plt.subplots()
month_labels = monthly['month_str']
ax.plot(month_labels, monthly['revenue'], marker='o', label='Monthly')
ax.plot(month_labels, monthly['rolling_3m'], linestyle='--', label='3M Avg')
ax.set_title('Monthly Revenue with 3-Month Rolling Average')
ax.legend()
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()
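center=True on the rolling mean centers the window on each month, so the smoothed line does not lag behind the real values. The first and last months have no complete 3-month window, so the average is NaN there and the dashed line starts and ends one point short.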
Bar Chart — Category Comparison
Vertical Bar (Seaborn)
# Total revenue per station, largest first, as a two-column DataFrame.
revenue_by_station = parking_df.groupby('station_code')['amount'].sum()
station_rev = revenue_by_station.sort_values(ascending=False).reset_index()

# One vertical bar per station.
fig, ax = plt.subplots()
sns.barplot(data=station_rev, x='station_code', y='amount', ax=ax)
ax.set(title='Total Revenue by Station', xlabel='Station', ylabel='Revenue (NTD)')
fig.tight_layout()
plt.show()
Horizontal Bar (better when category labels are long)
# Same station totals with the axes swapped: stations on y, revenue on x.
fig, ax = plt.subplots()
bar_kwargs = {'y': 'station_code', 'x': 'amount', 'orient': 'h'}
sns.barplot(data=station_rev, ax=ax, **bar_kwargs)
ax.set_title('Total Revenue by Station')
fig.tight_layout()
plt.show()
Stacked Bar — Composition by Group
# Revenue per station (rows) split by payment method (columns); combinations
# with no rows are filled with 0.
pivot = parking_df.pivot_table(
    index='station_code',
    columns='payment_method',
    values='amount',
    aggfunc='sum',
    fill_value=0,
)
# One bar per station, with the payment-method columns stacked inside it.
pivot.plot.bar(stacked=True, figsize=(10, 5))
plt.title('Revenue by Station and Payment Method')
plt.ylabel('Revenue (NTD)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Use .plot() on a DataFrame directly for stacked bar charts. Seaborn's barplot has no stacking option.
Histogram — One Variable's Distribution
# Histogram of parking amounts (30 bins) with a KDE curve on top.
amounts = parking_df['amount']
fig, ax = plt.subplots()
sns.histplot(amounts, bins=30, kde=True, ax=ax)
ax.set(title='Distribution of Parking Amounts', xlabel='Amount (NTD)')
fig.tight_layout()
plt.show()
kde=True adds a smooth density curve on top. It is helpful for seeing whether the distribution is skewed.
Compare Two Groups
# One histogram of `amount` per parking type, drawn on the same Axes.
fig, ax = plt.subplots()
hist_kwargs = {'x': 'amount', 'hue': 'parking_type', 'bins': 30, 'kde': True}
sns.histplot(data=parking_df, ax=ax, **hist_kwargs)
ax.set_title('Amount Distribution by Parking Type')
fig.tight_layout()
plt.show()
hue splits the chart into overlapping histograms — good for 2–3 groups. More than that becomes hard to read.
Box Plot — Distribution by Group
# Box plot of `amount`, one box per parking type.
fig, ax = plt.subplots()
box_kwargs = {'x': 'parking_type', 'y': 'amount'}
sns.boxplot(data=parking_df, ax=ax, **box_kwargs)
ax.set_title('Amount Distribution by Parking Type')
fig.tight_layout()
plt.show()
Box = IQR (25th–75th percentile), line = median, whiskers = the most extreme data points that lie within 1.5× IQR of the box edges, dots = outliers beyond the whiskers.
With Data Points (Strip)
# Box plot with the individual observations drawn over it as small,
# semi-transparent black dots.
fig, ax = plt.subplots()
shared = {'data': parking_df, 'x': 'parking_type', 'y': 'amount', 'ax': ax}
sns.boxplot(**shared)
sns.stripplot(color='black', alpha=0.3, size=2, **shared)
fig.tight_layout()
plt.show()
Add stripplot on top of boxplot to show the points along with the summary. Helpful when n is small.
Scatter Plot — Link Between Two Numeric Variables
# Duration against amount, coloured by parking type; alpha=0.5 keeps
# overlapping points visible.
fig, ax = plt.subplots()
scatter_kwargs = {
    'x': 'duration_mins',
    'y': 'amount',
    'hue': 'parking_type',
    'alpha': 0.5,
}
sns.scatterplot(data=parking_df, ax=ax, **scatter_kwargs)
ax.set_title('Duration vs Amount')
fig.tight_layout()
plt.show()
With Regression Line
# Scatter of duration vs amount with a fitted regression line; the points
# are faded (alpha=0.3) so the line stays readable.
fig, ax = plt.subplots()
point_style = {'alpha': 0.3}
sns.regplot(
    data=parking_df,
    x='duration_mins',
    y='amount',
    scatter_kws=point_style,
    ax=ax,
)
ax.set_title('Duration vs Amount (with trend line)')
fig.tight_layout()
plt.show()
regplot fits a straight line through the points and shows a confidence band. Use it when you want to see the direction of the link at a glance.
Heatmap — Correlation Matrix or Pivot Table
Correlation Matrix
# Pairwise correlations between the numeric columns of parking_df.
numeric_only = parking_df.select_dtypes(include='number')
corr = numeric_only.corr()

fig, ax = plt.subplots(figsize=(8, 6))
heatmap_kwargs = {
    'annot': True,       # print the value inside each cell
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'vmin': -1,          # pin the colour scale to the full [-1, 1] range
    'vmax': 1,
}
sns.heatmap(corr, ax=ax, **heatmap_kwargs)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()
annot=True prints the value in each cell. fmt='.2f' shows 2 decimal places. vmin/vmax lock the color scale, so -1 is always blue and 1 is always red.
Pivot Heatmap (Revenue by Station × Month)
# Revenue per station (rows) and month number of entry_time (columns).
# NOTE(review): dt.month drops the year, so the same calendar month from
# different years is summed into one column — confirm the data covers a
# single year.
entry_month_number = parking_df['entry_time'].dt.month
pivot = parking_df.pivot_table(
    index='station_code',
    columns=entry_month_number,
    values='amount',
    aggfunc='sum',
)

fig, ax = plt.subplots(figsize=(12, 5))
sns.heatmap(pivot, annot=True, fmt='.0f', cmap='YlOrRd', ax=ax)
ax.set(title='Revenue by Station × Month', xlabel='Month')
fig.tight_layout()
plt.show()
Subplots — Several Charts in One Figure
# Two charts side by side in one figure.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
left_ax, right_ax = axes

# left: revenue by station
sns.barplot(data=station_rev, x='station_code', y='amount', ax=left_ax)
left_ax.set_title('Revenue by Station')

# right: amount distribution
sns.histplot(parking_df['amount'], bins=30, kde=True, ax=right_ax)
right_ax.set_title('Amount Distribution')

# y=1.02 places the figure-level title slightly above the panels
fig.suptitle('Parking System Overview', fontsize=14, y=1.02)
fig.tight_layout()
plt.show()
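Always pass ax=axes[i] to Seaborn's axes-level functions (barplot, histplot, boxplot, scatterplot, ...) when you use subplots. If you don't, they draw on the current Axes (normally the last subplot created), so every chart lands on the same panel. Figure-level functions such as sns.catplot or sns.displot are the exception: they always build their own figure and cannot be placed in a subplot grid.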
2×2 Grid
# Four charts in a 2x2 grid; unpack the 2-D axes array into named panels.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
(top_left, top_right), (bottom_left, bottom_right) = axes

# top-left
sns.barplot(data=station_rev, x='station_code', y='amount', ax=top_left)
top_left.set_title('Revenue by Station')

# top-right
sns.histplot(parking_df['amount'], bins=30, kde=True, ax=top_right)
top_right.set_title('Amount Distribution')

# bottom-left
sns.boxplot(data=parking_df, x='parking_type', y='amount', ax=bottom_left)
bottom_left.set_title('Amount by Type')

# bottom-right
sns.scatterplot(
    data=parking_df, x='duration_mins', y='amount', alpha=0.3, ax=bottom_right
)
bottom_right.set_title('Duration vs Amount')

fig.tight_layout()
plt.show()
Reach subplots with axes[row, col] for 2D grids, and axes[i] for 1D rows.
Styling & Formatting
Axis Labels & Title
# Title, axis labels and slanted x tick labels for an existing Axes `ax`.
ax.set_title('Chart Title', fontsize=14)
ax.set(xlabel='X Label', ylabel='Y Label')
ax.tick_params(axis='x', rotation=45)
Format Y-Axis as Currency
def _as_currency(value, _pos):
    """Render a tick value as '$' plus a thousands-separated whole number."""
    return f'${value:,.0f}'


# Apply the formatter to the y-axis major ticks of the existing Axes `ax`.
ax.yaxis.set_major_formatter(mticker.FuncFormatter(_as_currency))
Format Y-Axis as Percentage
# Show y tick values as percentages with no decimals. xmax=1 tells the
# formatter that the data value 1 corresponds to 100%.
ax.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1, decimals=0))
Add Value Labels on a Bar Chart
# Label every bar on the existing Axes `ax` with its value, as a whole
# number. padding=3 leaves a small gap between the bar and its label.
for container in ax.containers:
    ax.bar_label(container, fmt='%.0f', padding=3)
Annotate a Specific Point
# Mark the point (peak_x, peak_y) with a red 'Peak' label. The text sits
# 5000 y-units above the point and an arrow points from the text to it.
# peak_x and peak_y must already be defined by the caller.
arrow_style = {'arrowstyle': '->', 'color': 'red'}
label_position = (peak_x, peak_y + 5000)
ax.annotate(
    'Peak',
    xy=(peak_x, peak_y),
    xytext=label_position,
    arrowprops=arrow_style,
    color='red',
)
Common Color Palettes
| Palette | Use case |
|---|---|
| 'muted' | Default category palette, low contrast |
| 'Set2' | Category, colorblind-friendly |
| 'coolwarm' | Diverging (positive / negative) |
| 'YlOrRd' | One direction (low → high intensity) |
| 'Blues' | One color, single direction |
# Two ways to choose a palette. The `...` below is a placeholder for the
# chart's own data/x/y arguments.
# NOTE(review): recent seaborn releases are reported to warn when `palette`
# is passed without `hue` — confirm against the installed version.
sns.set_palette('Set2') # apply globally
sns.barplot(..., palette='Set2') # apply to one chart
Save Figure
# Write the existing figure `fig` to output.png at 150 dots per inch.
fig.savefig('output.png', dpi=150, bbox_inches='tight')
bbox_inches='tight' makes sure titles and labels are not cut off at the edges.
Common DA Workflows
1. Monthly Trend Report
# --- data: revenue per month, smoothed series and month-over-month change ---
entry_month = parking_df['entry_time'].dt.to_period('M')
monthly = parking_df.groupby(entry_month)['amount'].sum().reset_index()
monthly.columns = ['month', 'revenue']
monthly['month_str'] = monthly['month'].astype(str)
monthly['rolling_3m'] = monthly['revenue'].rolling(window=3, center=True).mean()
monthly['mom_pct'] = monthly['revenue'].pct_change() * 100

# --- figure: trend on the left, MoM % change on the right ---
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
trend_ax, change_ax = axes
month_labels = monthly['month_str']

# trend + rolling avg
trend_ax.plot(month_labels, monthly['revenue'], marker='o', label='Monthly')
trend_ax.plot(month_labels, monthly['rolling_3m'], linestyle='--', label='3M Avg')
trend_ax.set_title('Monthly Revenue')
trend_ax.legend()
trend_ax.tick_params(axis='x', rotation=45)

# MoM % change: the first month has no predecessor, so its NaN is shown as 0
mom_filled = monthly['mom_pct'].fillna(0)
colors = ['green' if v >= 0 else 'red' for v in mom_filled]
change_ax.bar(month_labels, mom_filled, color=colors)
change_ax.axhline(0, color='black', linewidth=0.8)
change_ax.set_title('Month-over-Month Change (%)')
change_ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()
2. Station Performance Dashboard
# Per-station totals: revenue, number of visits and average amount,
# ordered by revenue (largest first).
per_station = parking_df.groupby('station_code').agg(
    revenue=('amount', 'sum'),
    visits=('parking_id', 'count'),
    avg_amt=('amount', 'mean'),
)
station_summary = per_station.reset_index().sort_values('revenue', ascending=False)

# One bar chart per metric, left to right.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
panels = [
    ('revenue', 'Revenue by Station'),
    ('visits', 'Visits by Station'),
    ('avg_amt', 'Avg Amount by Station'),
]
for panel_ax, (metric, panel_title) in zip(axes, panels):
    sns.barplot(data=station_summary, x='station_code', y=metric, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Only the revenue panel gets value labels on its bars.
for c in axes[0].containers:
    axes[0].bar_label(c, fmt='%.0f', padding=3, fontsize=8)

fig.suptitle('Station Performance Overview', fontsize=14, y=1.02)
fig.tight_layout()
plt.show()
3. EDA Distribution Overview
# First-pass EDA: one histogram (with KDE curve) per numeric column of
# parking_df, laid out in a single row.
numeric_cols = parking_df.select_dtypes(include='number').columns.tolist()
n = len(numeric_cols)
# For a 1x1 grid plt.subplots returns a bare Axes rather than an array, so
# axes[i] would fail when there is exactly one numeric column.
# squeeze=False always returns a 2-D array; ravel() flattens it back to the
# 1-D shape the loop below indexes into.
fig, axes = plt.subplots(1, n, figsize=(6 * n, 4), squeeze=False)
axes = axes.ravel()
for i, col in enumerate(numeric_cols):
    # dropna() keeps missing values out of the histogram
    sns.histplot(parking_df[col].dropna(), bins=30, kde=True, ax=axes[i])
    axes[i].set_title(f'Distribution: {col}')
plt.tight_layout()
plt.show()
One histogram per numeric column. Useful as a first-pass EDA (exploratory data analysis) step to spot skew and outliers before you build a model.
4. Correlation Heatmap + Scatter Matrix
# Both charts use the numeric columns of parking_df only.
numeric_df = parking_df.select_dtypes(include='number')

# correlation heatmap, colour scale pinned to [-1, 1]
corr = numeric_df.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

# scatter matrix for all numeric pairs, with a KDE curve on the diagonal
matrix_kwargs = {'figsize': (10, 10), 'alpha': 0.3, 'diagonal': 'kde'}
pd.plotting.scatter_matrix(numeric_df, **matrix_kwargs)
plt.suptitle('Scatter Matrix', y=1.01)
plt.tight_layout()
plt.show()
Run these together at the start of any analysis. The correlation heatmap gives you a numeric overview, and the scatter matrix lets you check which links are real vs. driven by a few outliers.