... pandas pipe: calm


The common.py file that we have looks like this:

import datetime as dt
from functools import wraps

import pandas as pd 
import altair as alt

def log_step(func):
    """Decorate a pipeline step so that every call prints a short log line.

    The log line contains the step's name, the ``shape`` of the value it
    returned and the wall-clock time the call took.

    Args:
        func: The pipeline step to wrap. Its return value must expose a
            ``shape`` attribute, because the log line reads ``result.shape``.

    Returns:
        A wrapper that calls ``func`` with the same arguments, prints the log
        line and returns ``func``'s result unchanged.
    """
    # wraps() copies __name__, __doc__ etc. onto the wrapper so the decorated
    # step still identifies as itself (e.g. when decorators are stacked).
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        print(f"just ran step {func.__name__} shape={result.shape} took {time_taken}s")
        return result
    return wrapper

def start_pipeline(dataf):
    """Return a copy of ``dataf`` rather than the object that was passed in."""
    fresh_frame = dataf.copy()
    return fresh_frame

def set_dtypes(dataf):
    """Parse the ``date`` column to datetimes and order the rows.

    Returns a new frame whose ``date`` column has been converted with
    ``pd.to_datetime`` and whose rows are sorted by ``currency_code`` and
    then by ``date``.
    """
    with_dates = dataf.assign(date=pd.to_datetime(dataf['date']))
    return with_dates.sort_values(['currency_code', 'date'])

def add_inflation_features(dataf):
    """Append ``local_inflation`` and ``dollar_inflation`` columns.

    For each price column the row-to-row difference is taken within every
    ``name`` group and divided by the price on the current row. The first
    row of each group has no predecessor, so its value is NaN.
    """
    by_name = dataf.groupby('name')
    local_change = by_name['local_price'].diff() / dataf['local_price']
    dollar_change = by_name['dollar_price'].diff() / dataf['dollar_price']
    return dataf.assign(local_inflation=local_change,
                        dollar_inflation=dollar_change)

def remove_outliers(dataf, min_row_country=32):
    """Drop sparsely covered currencies and extreme negative inflation rows.

    Args:
        dataf: Frame with ``name``, ``currency_code`` and ``local_inflation``
            columns.
        min_row_country: Minimum number of non-null ``name`` entries a
            ``currency_code`` needs in order to be kept.

    Returns:
        The rows of ``dataf`` whose ``currency_code`` meets the row-count
        threshold and whose ``local_inflation`` is greater than -20. Rows
        with a missing ``local_inflation`` fail that comparison and are
        dropped as well.
    """
    # Count rows per currency and keep the codes (the index of the grouped
    # frame) that have enough observations; isin() below needs those codes.
    countries = (dataf
                .groupby('currency_code')
                .agg(n=('name', 'count'))
                .loc[lambda d: d['n'] >= min_row_country]
                .index)
    return (dataf
            .loc[lambda d: d['currency_code'].isin(countries)]
            .loc[lambda d: d['local_inflation'] > -20])

def plot_bigmac(dataf):
    """Build an Altair point chart of local versus dollar inflation.

    Args:
        dataf: Frame with ``currency_code``, ``local_inflation`` and
            ``dollar_inflation`` columns.

    Returns:
        An Altair chart, 600 by 150, with a tooltip showing the currency code
        and both inflation values.
    """
    # NOTE(review): the mark and the x/y/color channels were missing from the
    # listing and are reconstructed from the tooltip fields; confirm them
    # against the intended chart.
    return (alt.Chart(dataf)
      .mark_point()
      .encode(x='local_inflation',
              y='dollar_inflation',
              color='currency_code',
              tooltip=["currency_code", "local_inflation", "dollar_inflation"])
      .properties(width=600, height=150))

These functions are imported in the notebook via:

from common import start_pipeline, set_dtypes, add_inflation_features, remove_outliers, plot_bigmac

df = pd.read_csv('https://calmcode.io/datasets/bigmac.csv')

# Run every imported step in order: remove_outliers filters on the
# local_inflation column, which only exists after add_inflation_features.
clean_df = (df
  .pipe(start_pipeline)
  .pipe(set_dtypes)
  .pipe(add_inflation_features)
  .pipe(remove_outliers, min_row_country=30))


Feedback? See an issue? Something unclear? Feel free to mention it here.

If you want to be kept up to date, consider signing up for the newsletter.