Setting up a Pipeline
Again, we'll list some boilerplate imports first.
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklego.datasets import load_chicken
from sklego.preprocessing import ColumnSelector
df = load_chicken(as_frame=True)
def plot_model(model):
df = load_chicken(as_frame=True)
model.fit(df[['diet', 'time']], df['weight'])
metric_df = df[['diet', 'time', 'weight']].assign(pred=lambda d: model.predict(d[['diet', 'time']]))
metric = mean_absolute_error(metric_df['weight'], metric_df['pred'])
plt.figure(figsize=(12, 4))
# plt.scatter(df['time'], df['weight'])
for i in [1, 2, 3, 4]:
pltr = metric_df[['time', 'diet', 'pred']].drop_duplicates().loc[lambda d: d['diet'] == i]
plt.plot(pltr['time'], pltr['pred'], color='.rbgy'[i])
plt.title(f"linear model per group, MAE: {np.round(metric, 2)}");
To see the effect of using the "diet" as a one-hot encoded vector
you can run this pipeline. Note that the ColumnSelector
is a tool from scikit-lego.
feature_pipeline = Pipeline([
("datagrab", FeatureUnion([
("discrete", Pipeline([
("grab", ColumnSelector("diet")),
("encode", OneHotEncoder(categories="auto", sparse=False))
])),
("continuous", Pipeline([
("grab", ColumnSelector("time")),
("standardize", StandardScaler())
]))
]))
])
pipe = Pipeline([
("transform", feature_pipeline),
("model", LinearRegression())
])
plot_model(pipe)
Using a GroupedPredictor
to predict per group
To run a model for each diet instead, you can run:
from sklego.meta import GroupedPredictor
mod = GroupedPredictor(LinearRegression(), groups=["diet"])
plot_model(mod)
Note how each line has it's own intercept and slope.