Calmcode - dirty cat: results


1 2 3 4 5

Here is a large chunk of the code used to build the grid-search.

from sklearn import set_config


from sklearn.pipeline import Pipeline, FeatureUnion
from sklego.preprocessing import ColumnSelector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge

# Candidate encoders for the high-cardinality text column, keyed by the label
# that is attached to each encoder's cross-validation results further down.
method = {
    'sim_enc100': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=100),
    'sim_enc300': dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=300),
    'sim_enc_all': dirty_cat.SimilarityEncoder(),
    'one-hot': OneHotEncoder(handle_unknown='ignore'),
}

# One cross-validation results frame per encoder is collected here.
results = []

for k, encoder in method.items():
    # Three feature branches are built side by side by the FeatureUnion and
    # their outputs are passed on to a Ridge regression. Only the 'cat'
    # branch changes between iterations: it uses the encoder under test.
    pipe = Pipeline([
        ('split', FeatureUnion([
            ('cat', Pipeline([
                ('grab', ColumnSelector(['employee_position_title'])),
                ('handle', encoder),
            ])),
            ('one-hot', Pipeline([
                ('grab', ColumnSelector('assignment_category')),
                ('handle', OneHotEncoder(handle_unknown='ignore')),
            ])),
            ('floats', Pipeline([
                ('grab', ColumnSelector('year_first_hired')),
                ('scale', StandardScaler()),
            ])),
        ])),
        ('mod', Ridge()),
    ])

    # param_grid is empty, so the search simply cross-validates the pipeline
    # as configured (10 folds), scoring both metrics and refitting on r2.
    grid = GridSearchCV(
        pipe,
        cv=10,
        param_grid={},
        scoring=['r2', 'neg_mean_absolute_error'],
        refit='r2',
        n_jobs=-1,
    )

    # NOTE(review): `pd`, `ml_df` and `y` are not defined in this snippet;
    # they are presumably set up earlier in the tutorial -- confirm.
    res_df = pd.DataFrame(grid.fit(ml_df, y).cv_results_)
    res_df['key'] = k  # remember which encoder produced these rows
    results.append(res_df)

To understand why this pipeline does not include the CountVectorizer component, we need to look at one difference between its implementation and that of the SimilarityEncoder.

import dirty_cat
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): the calls that pass data to each estimator were missing from
# this snippet; they are restored here as fit_transform calls on `ml_df`
# (assumed to be the dataframe loaded earlier in the tutorial -- confirm).

# CountVectorizer is given a single column, selected with single brackets.
cv = CountVectorizer(analyzer='char', ngram_range=(2, 4))
cv.fit_transform(ml_df['employee_position_title'])

# SimilarityEncoder is given a one-column dataframe, selected with double
# brackets.
mod = dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=200)
mod.fit_transform(ml_df[['employee_position_title']])

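Notice that the CountVectorizer receives ml_df['employee_position_title'] while the SimilarityEncoder receives ml_df[['employee_position_title']]. The former is a single column from the dataframe while the latter is a dataframe with one column. It's a subtle difference, but it matters: the CountVectorizer expects a one-dimensional sequence of strings, whereas the SimilarityEncoder expects two-dimensional, table-like input, so the two cannot simply be swapped for one another inside the same pipeline step.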