logo

... pandas pipe: calm



Notes

The common.py file that we have looks like this:

import datetime as dt
from functools import wraps

import pandas as pd 
import altair as alt

def log_step(func):
    """Decorate a pipeline step so every call prints a one-line summary.

    The summary contains the step's name, the ``shape`` attribute of the
    value it returned, and how long the call took.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns its result unchanged.
    """
    # functools.partial objects and some other callables have no __name__;
    # fall back to their repr so logging never raises.
    step_name = getattr(func, "__name__", repr(func))

    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        # str() of a timedelta, formatted as H:MM:SS.ffffff.
        time_taken = str(dt.datetime.now() - tic)
        # A step may return something without a ``shape`` attribute; logging
        # must not raise after the step itself has already succeeded.
        shape = getattr(result, "shape", None)
        print(f"just ran step {step_name} shape={shape} took {time_taken}s")
        return result
    return wrapper

@log_step
def start_pipeline(dataf):
    """Return a copy of ``dataf`` to use as the first step of a pipeline.

    Later steps then operate on the copy rather than on the object the
    caller passed in.
    """
    working_copy = dataf.copy()
    return working_copy

@log_step
def set_dtypes(dataf):
    """Parse the ``date`` column to datetimes and sort the rows.

    Returns a new frame whose ``date`` column has been passed through
    ``pd.to_datetime``, ordered by ``currency_code`` and then ``date``.
    """
    parsed_dates = pd.to_datetime(dataf['date'])
    with_dates = dataf.assign(date=parsed_dates)
    return with_dates.sort_values(by=['currency_code', 'date'])

@log_step
def add_inflation_features(dataf):
    """Add ``local_inflation`` and ``dollar_inflation`` columns.

    Each new column is the row-to-row difference of the matching price
    column within its ``name`` group, divided by the price on the current
    row. The first row of every group has no previous row, so its value
    is NaN.
    """
    def change_relative_to_current(frame, price_col):
        # Difference to the previous row of the same 'name' group,
        # expressed as a fraction of the current row's price.
        return frame.groupby('name')[price_col].diff() / frame[price_col]

    return dataf.assign(
        local_inflation=lambda frame: change_relative_to_current(frame, 'local_price'),
        dollar_inflation=lambda frame: change_relative_to_current(frame, 'dollar_price'),
    )

@log_step
def remove_outliers(dataf, min_row_country=32, min_local_inflation=-20):
    """Drop sparsely populated currencies and extreme local-inflation rows.

    Args:
        dataf: Frame with ``currency_code``, ``name`` and
            ``local_inflation`` columns.
        min_row_country: Minimum number of non-null ``name`` values a
            ``currency_code`` group must have for its rows to be kept.
        min_local_inflation: Exclusive lower bound on ``local_inflation``.
            Rows at or below this value are dropped. Defaults to ``-20``,
            the threshold that used to be hard-coded.

    Returns:
        The filtered frame. Rows whose ``local_inflation`` is NaN are
        dropped as well, because a comparison with NaN is always False.
    """
    rows_per_currency = (dataf
                         .groupby('currency_code')
                         .agg(n=('name', 'count')))
    # Index of the currency codes that have enough observations.
    countries = rows_per_currency.loc[lambda d: d['n'] >= min_row_country].index
    return (dataf
            .loc[lambda d: d['currency_code'].isin(countries)]
            .loc[lambda d: d['local_inflation'] > min_local_inflation])

def plot_bigmac(dataf):
    """Build an interactive Altair scatter plot of the two inflation columns.

    ``local_inflation`` is on the x axis and ``dollar_inflation`` on the
    y axis. Points are coloured by ``currency_code``, and the tooltip
    shows all three fields.
    """
    hover_fields = ["currency_code", "local_inflation", "dollar_inflation"]
    points = alt.Chart(dataf).mark_point()
    encoded = points.encode(
        x='local_inflation',
        y='dollar_inflation',
        color=alt.Color('currency_code'),
        tooltip=hover_fields,
    )
    sized = encoded.properties(width=600, height=150)
    return sized.interactive()

These functions are imported in the notebook via:

# pandas is needed for pd.read_csv below; the snippet used `pd` without importing it.
import pandas as pd

from common import (
    start_pipeline,
    set_dtypes,
    add_inflation_features,
    remove_outliers,
    plot_bigmac,
)

# Load the raw Big Mac dataset.
df = pd.read_csv('https://calmcode.io/datasets/bigmac.csv')

# Run the cleaning steps in order. Each .pipe() call passes the frame
# returned by the previous step as the first argument of the next one.
clean_df = (df
  .pipe(start_pipeline)
  .pipe(set_dtypes)
  .pipe(add_inflation_features)
  .pipe(remove_outliers, min_row_country=30))

# As the last expression of a notebook cell, the returned chart is displayed.
clean_df.pipe(plot_bigmac)

Feedback? See an issue? Something unclear? Feel free to mention it here.



If you want to be kept up to date, consider signing up for the newsletter.