dirty cat logo dirty cat: results

1 2 3 4 5
Notes

Here is a large chunk of the code used to build the grid-search.

from sklearn import set_config

set_config(display="diagram")

from sklearn.pipeline import Pipeline, FeatureUnion
from sklego.preprocessing import ColumnSelector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge

method = {
    'sim_enc100': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=100),
    'sim_enc300': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=300),
    'sim_enc_all': dirty_cat.SimilarityEncoder(),
    'one-hot': OneHotEncoder(handle_unknown='ignore')
}

results = []

for k, encoder in method.items():
    pipe = Pipeline([
        ('split', FeatureUnion([
            ('cat', Pipeline([
                ('grab', ColumnSelector(['employee_position_title'])),
                ('handle', encoder)
            ])),
            ('one-hot', Pipeline([
                ('grab', ColumnSelector('assignment_category')),
                ('handle', OneHotEncoder(handle_unknown='ignore'))
            ])),
            ('floats', Pipeline([
                ('grab', ColumnSelector('year_first_hired')),
                ('scale', StandardScaler())
            ])),
        ])),
        ('mod', Ridge())
    ])

    grid = GridSearchCV(pipe, cv=10, param_grid={}, scoring=['r2', 'neg_mean_absolute_error'], refit='r2', n_jobs=-1)
    res_df = pd.DataFrame(grid.fit(X, y).cv_results_)
    res_df['key'] = k
    results.append(res_df)

To understand why this pipeline does not include the CountVectorizer component we need to observe one difference between their implementation.

import dirty_cat
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(analyzer='char', ngram_range=(2, 4))
cv.fit_transform(ml_df['employee_position_title']).shape

mod = dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=200)
mod.fit_transform(data[['employee_position_title']]).shape

Notice that the CountVectorizer receives ml_df['employee_position_title'] while the SimilarityEncoder receives ml_df[['employee_position_title']]. The former is a single column from the dataframe while the latter is a dataframe with one column. It's a subtle difference but this difference in input means that they expect different inputs.

To fully explore this, we recommend exploring the original notebook.