Here is a large chunk of the code used to build the grid-search.
from sklearn import set_config
# Ask scikit-learn to render estimators as diagrams when they are displayed
# (e.g. as the output of a notebook cell) instead of as plain text.
set_config(display="diagram")
from sklearn.pipeline import Pipeline, FeatureUnion
from sklego.preprocessing import ColumnSelector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge
# Candidate encoders to compare, keyed by the label that is later attached to
# each encoder's cross-validation results.
method = {}

# SimilarityEncoder variants limited to a fixed number of prototypes, selected
# with the 'most_frequent' strategy.
method['sim_enc100'] = dirty_cat.SimilarityEncoder(
    categories='most_frequent',
    n_prototypes=100,
)
method['sim_enc300'] = dirty_cat.SimilarityEncoder(
    categories='most_frequent',
    n_prototypes=300,
)

# SimilarityEncoder with its default settings.
method['sim_enc_all'] = dirty_cat.SimilarityEncoder()

# Plain one-hot encoding; categories unseen during fit are ignored at
# transform time rather than raising.
method['one-hot'] = OneHotEncoder(handle_unknown='ignore')
# Cross-validate one Ridge pipeline per candidate encoder and collect the
# resulting score tables (one dataframe per encoder) in `results`.
results = []
for k, encoder in method.items():
    # `employee_position_title` is handled by the encoder under test.
    title_steps = [
        ('grab', ColumnSelector(['employee_position_title'])),
        ('handle', encoder),
    ]
    # `assignment_category` is always one-hot encoded.
    category_steps = [
        ('grab', ColumnSelector('assignment_category')),
        ('handle', OneHotEncoder(handle_unknown='ignore')),
    ]
    # `year_first_hired` is standardised.
    year_steps = [
        ('grab', ColumnSelector('year_first_hired')),
        ('scale', StandardScaler()),
    ]

    # The three branches are computed side by side and concatenated before
    # being passed to the Ridge regression.
    features = FeatureUnion([
        ('cat', Pipeline(title_steps)),
        ('one-hot', Pipeline(category_steps)),
        ('floats', Pipeline(year_steps)),
    ])
    pipe = Pipeline([
        ('split', features),
        ('mod', Ridge()),
    ])

    # The parameter grid is empty, so there is nothing to tune here: the grid
    # search is used for its 10-fold cross-validation with two scoring metrics.
    grid = GridSearchCV(
        pipe,
        cv=10,
        param_grid={},
        scoring=['r2', 'neg_mean_absolute_error'],
        refit='r2',
        n_jobs=-1,
    )
    grid.fit(X, y)

    # Label the score table with the encoder's key so the tables can be told
    # apart once they are collected together.
    res_df = pd.DataFrame(grid.cv_results_)
    res_df['key'] = k
    results.append(res_df)
To understand why this pipeline does not include the `CountVectorizer`
component, we need to observe one difference between the implementations of the
`CountVectorizer` and the `SimilarityEncoder`.
import dirty_cat
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer is fitted on a single column, selected with single brackets.
cv = CountVectorizer(analyzer='char', ngram_range=(2, 4))
cv.fit_transform(ml_df['employee_position_title']).shape

# SimilarityEncoder is fitted on a one-column dataframe, selected with double
# brackets. Both calls read from `ml_df`, so the only difference between them
# is the kind of object that is passed in.
mod = dirty_cat.SimilarityEncoder(categories='most_frequent', n_prototypes=200)
mod.fit_transform(ml_df[['employee_position_title']]).shape
Notice that the `CountVectorizer` receives `ml_df['employee_position_title']`
while the `SimilarityEncoder` receives `ml_df[['employee_position_title']]`.
The former is a single column from the dataframe, while the latter is a
dataframe with one column. It's a subtle difference, but it means that the two
components expect differently shaped inputs: one takes a single column of
text, the other takes a table.