The common.py file that we have looks like this:
import datetime as dt
from functools import wraps
import pandas as pd
import altair as alt
def log_step(func):
    """Decorate a pipeline step so that every call is logged.

    After ``func`` returns, one line is printed containing the step name,
    the ``shape`` attribute of the result and the elapsed time in seconds.

    Args:
        func: The pipeline step to wrap. Positional and keyword arguments
            are forwarded to it untouched.

    Returns:
        A wrapper that carries the metadata of ``func`` (through
        ``functools.wraps``) and returns whatever ``func`` returns.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        # Report plain seconds: str(timedelta) renders as "H:MM:SS.ffffff",
        # which made the trailing "s" in the message misleading.
        time_taken = (dt.datetime.now() - tic).total_seconds()
        # Fall back to None rather than raising AttributeError after the step
        # has already done its work on a result that has no ``shape``.
        shape = getattr(result, "shape", None)
        print(f"just ran step {func.__name__} shape={shape} took {time_taken:.6f}s")
        return result
    return wrapper
@log_step
def start_pipeline(dataf):
    """Return a copy of ``dataf`` to use as the first step of a pipeline.

    Subsequent steps then operate on the copy rather than on the object
    that was passed in.
    """
    working_copy = dataf.copy()
    return working_copy
@log_step
def set_dtypes(dataf):
    """Convert the ``date`` column to datetimes and sort the rows.

    The returned frame is ordered by ``currency_code`` first and by
    ``date`` second.
    """
    parsed_dates = pd.to_datetime(dataf['date'])
    typed = dataf.assign(date=parsed_dates)
    return typed.sort_values(['currency_code', 'date'])
@log_step
def add_inflation_features(dataf):
    """Add the ``local_inflation`` and ``dollar_inflation`` columns.

    For each price column the row-to-row difference is taken within every
    ``name`` group and divided by the price on the current row. The first
    row of a group has no predecessor, so its value is missing.
    """
    def relative_change(price_col):
        # Build the callable that ``assign`` evaluates against the frame.
        return lambda d: d.groupby('name')[price_col].diff() / d[price_col]

    return dataf.assign(
        local_inflation=relative_change('local_price'),
        dollar_inflation=relative_change('dollar_price'),
    )
@log_step
def remove_outliers(dataf, min_row_country=32, min_local_inflation=-20):
    """Drop sparsely covered currencies and extreme local inflation values.

    Args:
        dataf: Frame with ``currency_code``, ``name`` and
            ``local_inflation`` columns.
        min_row_country: Minimum number of non-null ``name`` entries a
            ``currency_code`` group needs in order to be kept.
        min_local_inflation: Exclusive lower bound on ``local_inflation``.
            This used to be a hard-coded -20; the default keeps that
            behaviour for existing callers.

    Returns:
        The rows of ``dataf`` that belong to a kept currency and whose
        ``local_inflation`` exceeds ``min_local_inflation``. Rows with a
        missing ``local_inflation`` are dropped as well, because a
        comparison against a missing value is false.
    """
    countries = (dataf
                 .groupby('currency_code')
                 .agg(n=('name', 'count'))
                 .loc[lambda d: d['n'] >= min_row_country]
                 .index)
    return (dataf
            .loc[lambda d: d['currency_code'].isin(countries)]
            .loc[lambda d: d['local_inflation'] > min_local_inflation])
def plot_bigmac(dataf):
    """Build an interactive scatter chart of local vs. dollar inflation.

    Points are placed at (``local_inflation``, ``dollar_inflation``),
    coloured by ``currency_code``, and show those three fields in the
    tooltip. The chart is 600 wide and 150 high.
    """
    hover_fields = ["currency_code", "local_inflation", "dollar_inflation"]
    scatter = alt.Chart(dataf).mark_point()
    scatter = scatter.encode(
        x='local_inflation',
        y='dollar_inflation',
        color=alt.Color('currency_code'),
        tooltip=hover_fields,
    )
    return scatter.properties(width=600, height=150).interactive()
These functions are imported into the notebook via:
from common import (
    start_pipeline,
    set_dtypes,
    add_inflation_features,
    remove_outliers,
    plot_bigmac,
)

# Fetch the raw Big Mac dataset (needs network access).
df = pd.read_csv('https://calmcode.io/datasets/bigmac.csv')

# Run the cleaning steps in order; each .pipe() hands the frame to the
# next step.
clean_df = (
    df.pipe(start_pipeline)
    .pipe(set_dtypes)
    .pipe(add_inflation_features)
    .pipe(remove_outliers, min_row_country=30)
)

# Presumably left as the final expression so the notebook displays the chart.
plot_bigmac(clean_df)