pandas pipe:
calm
Pandas code can get quite nasty inside of your Jupyter notebook. It's not just the syntax; it's the endless scrolling too. In this series of videos we're going to explore a way to clean this up. The series is inspired by the "Modern Pandas" blog posts originally written by Tom Augspurger.
Notes
The `common.py` file that we have looks like this:
import datetime as dt
from functools import wraps
import pandas as pd
import altair as alt
def log_step(func):
    """Decorate a pipeline step so each call prints its name, result shape and runtime.

    The wrapped function receives the arguments unchanged and its result is
    returned as-is. The result must expose a ``shape`` attribute, because it
    is read when building the log line.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        started_at = dt.datetime.now()
        output = func(*args, **kwargs)
        # The elapsed timedelta is rendered with str(); the message appends "s" to it.
        duration = str(dt.datetime.now() - started_at)
        print(f"just ran step {func.__name__} shape={output.shape} took {duration}s")
        return output

    return wrapper
@log_step
def start_pipeline(dataf):
    """Return a copy of ``dataf`` so later steps do not receive the caller's object."""
    working_copy = dataf.copy()
    return working_copy
@log_step
def set_dtypes(dataf):
    """Convert the ``date`` column with ``pd.to_datetime`` and sort the rows.

    Rows are ordered by ``currency_code`` first and ``date`` second.
    """
    with_dates = dataf.assign(date=lambda frame: pd.to_datetime(frame['date']))
    sort_keys = ['currency_code', 'date']
    return with_dates.sort_values(sort_keys)
@log_step
def add_inflation_features(dataf):
    """Add ``local_inflation`` and ``dollar_inflation`` columns.

    Each column is the row-to-row difference of the matching price column
    within its ``name`` group, divided by that row's own price. The
    difference follows the frame's current row order inside each group.
    """
    def relative_step(frame, price_col):
        # Change versus the previous row of the same `name`, scaled by the current price.
        step = frame.groupby('name')[price_col].diff()
        return step / frame[price_col]

    return dataf.assign(
        local_inflation=lambda frame: relative_step(frame, 'local_price'),
        dollar_inflation=lambda frame: relative_step(frame, 'dollar_price'),
    )
@log_step
def remove_outliers(dataf, min_row_country=32):
    """Drop sparsely covered currencies and rows with extreme negative local inflation.

    A row is kept when its ``currency_code`` has at least ``min_row_country``
    counted ``name`` entries and its ``local_inflation`` is greater than -20.
    Rows whose ``local_inflation`` is NaN fail that comparison and are
    therefore removed as well.
    """
    rows_per_currency = dataf.groupby('currency_code').agg(n=('name', 'count'))
    well_covered = rows_per_currency['n'] >= min_row_country
    countries = rows_per_currency.loc[well_covered].index

    # Both conditions are evaluated row-wise, so one combined mask selects
    # the same rows, in the same order, as filtering twice in sequence.
    has_enough_rows = dataf['currency_code'].isin(countries)
    above_floor = dataf['local_inflation'] > -20
    return dataf.loc[has_enough_rows & above_floor]
def plot_bigmac(dataf):
    """Build an interactive Altair point chart of local versus dollar inflation.

    ``local_inflation`` is on the x axis and ``dollar_inflation`` on the y
    axis; points are coloured by ``currency_code`` and the tooltip lists all
    three fields. The chart is 600 wide and 150 high.
    """
    hover_fields = ["currency_code", "local_inflation", "dollar_inflation"]
    points = alt.Chart(dataf).mark_point()
    encoded = points.encode(
        x='local_inflation',
        y='dollar_inflation',
        color=alt.Color('currency_code'),
        tooltip=hover_fields,
    )
    sized = encoded.properties(width=600, height=150)
    return sized.interactive()
These functions are imported and used in the notebook as follows:
from common import start_pipeline, set_dtypes, add_inflation_features, remove_outliers, plot_bigmac
# Load the raw Big Mac dataset.
df = pd.read_csv('https://calmcode.io/datasets/bigmac.csv')

# Prepare the data: copy, fix dtypes, then derive the inflation columns.
clean_df = (
    df
    .pipe(start_pipeline)
    .pipe(set_dtypes)
    .pipe(add_inflation_features)
)
# Filter last, overriding the default row threshold per currency.
clean_df = clean_df.pipe(remove_outliers, min_row_country=30)

# Last expression of the cell: the chart object returned by plot_bigmac.
plot_bigmac(clean_df)
Feedback? See an issue? Something unclear? Feel free to mention it here.
If you want to be kept up to date, consider signing up for the newsletter.