Using the Thresholder meta-model.
Again we'll need a few tools to be available.
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, make_scorer
from sklego.meta import Thresholder
We're taking a look at another dataset in this video.
X, y = make_blobs(1000, centers=[(0, 0), (1.5, 1.5)], cluster_std=[1, 0.5])
plt.scatter(X[:, 0], X[:, 1], c=y, s=5);
You can see the effect of using a threshold by running:
m1 = Thresholder(LogisticRegression(solver='lbfgs'), threshold=0.1).fit(X, y)
m2 = Thresholder(LogisticRegression(solver='lbfgs'), threshold=0.9).fit(X, y)
plt.figure(figsize=(12, 4))
plt.subplot(131)
plt.scatter(X[:, 0], X[:, 1], c=y, s=5)
plt.title("original data")
plt.subplot(132)
plt.scatter(X[:, 0], X[:, 1], c=m1.predict(X), s=5)
plt.title("threshold=0.1")
plt.subplot(133)
plt.scatter(X[:, 0], X[:, 1], c=m2.predict(X), s=5)
plt.title("threshold=0.9");
You can also run this meta model in a gridsearch.
pipe = Pipeline([
("model", Thresholder(LogisticRegression(solver='lbfgs'), threshold=0.1))
])
mod = GridSearchCV(estimator=pipe,
param_grid = {"model__threshold": np.linspace(0.1, 0.9, 50)},
scoring={"precision": make_scorer(precision_score),
"recall": make_scorer(recall_score),
"accuracy": make_scorer(accuracy_score)},
refit="precision",
cv=5)
mod.fit(X, y);