... dirty cat: results


Here is a large chunk of the code used to build the grid-search.

from sklearn import set_config


from sklearn.pipeline import Pipeline, FeatureUnion
from sklego.preprocessing import ColumnSelector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge

method = {
    'sim_enc100': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=100),
    'sim_enc300': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=300),
    'sim_enc_all': dirty_cat.SimilarityEncoder(),
    'one-hot': OneHotEncoder(handle_unknown='ignore')

results = []

for k, encoder in method.items():
    pipe = Pipeline([
        ('split', FeatureUnion([
            ('cat', Pipeline([
                ('grab', ColumnSelector(['employee_position_title'])),
                ('handle', encoder)
            ('one-hot', Pipeline([
                ('grab', ColumnSelector('assignment_category')),
                ('handle', OneHotEncoder(handle_unknown='ignore'))
            ('floats', Pipeline([
                ('grab', ColumnSelector('year_first_hired')),
                ('scale', StandardScaler())
        ('mod', Ridge())

    grid = GridSearchCV(pipe, cv=10, param_grid={}, scoring=['r2', 'neg_mean_absolute_error'], refit='r2', n_jobs=-1)
    res_df = pd.DataFrame(grid.fit(X, y).cv_results_)
    res_df['key'] = k

To understand why this pipeline does not include the CountVectorizer component we need to observe one difference between their implementation.

import dirty_cat 
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(analyzer='char', ngram_range=(2, 4))

mod = dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=200)

Notice that the CountVectorizer receives ml_df['employee_position_title'] while the SimilarityEncoder receives ml_df[['employee_position_title']]. The former is a single column from the dataframe while the latter is a dataframe with one column. It's a subtle difference but this difference in input means that they expect different inputs.

To fully explore this, we recommend exploring the original notebook which can be found here.

Feedback? See an issue? Something unclear? Feel free to mention it here.

If you want to be kept up to date, consider signing up for the newsletter.