Here's a demo of `IsolationForest` for outlier detection.
from collections import Counter
from sklearn.ensemble import IsolationForest
# Fit an IsolationForest on the feature matrix X (assumes X is defined earlier in the document).
mod = IsolationForest().fit(X)
# predict() returns -1 for outliers and 1 for inliers; map that to 1 = outlier, 0 = inlier.
# NOTE(review): the result is not assigned — presumably just demonstrating the conversion.
np.where(mod.predict(X) == -1, 1, 0)
Next, we set up a grid search with our own custom metrics that judge the outlier model on precision and recall:
def outlier_precision(mod, X, y):
    """Precision of the outlier class for an outlier-detection estimator.

    Parameters
    ----------
    mod : fitted estimator whose ``predict`` returns -1 for outliers and
        1 for inliers (e.g. ``IsolationForest``).
    X : feature matrix passed to ``mod.predict``.
    y : ground-truth labels, where 1 marks an outlier and 0 an inlier.

    Returns
    -------
    float
        ``precision_score`` of the outlier (positive) class.
    """
    # predict() yields -1/1; remap to the 1/0 convention used by `y`
    # before handing the predictions to the sklearn metric.
    preds = mod.predict(X)
    return precision_score(y, np.where(preds == -1, 1, 0))
def outlier_recall(mod, X, y):
    """Recall of the outlier class for an outlier-detection estimator.

    Parameters
    ----------
    mod : fitted estimator whose ``predict`` returns -1 for outliers and
        1 for inliers (e.g. ``IsolationForest``).
    X : feature matrix passed to ``mod.predict``.
    y : ground-truth labels, where 1 marks an outlier and 0 an inlier.

    Returns
    -------
    float
        ``recall_score`` of the outlier (positive) class.
    """
    # predict() yields -1/1; remap to the 1/0 convention used by `y`
    # before handing the predictions to the sklearn metric.
    preds = mod.predict(X)
    return recall_score(y, np.where(preds == -1, 1, 0))
# Grid-search the contamination rate. The custom scorers above follow the
# scorer(estimator, X, y) signature GridSearchCV expects, so the model is
# judged on both precision and recall of the outlier class; with multiple
# metrics, `refit` names the one used to pick the final refitted model.
grid = GridSearchCV(
    estimator=IsolationForest(),
    param_grid={'contamination': np.linspace(0.001, 0.02, 10)},
    scoring={'precision': outlier_precision,
             'recall': outlier_recall},
    refit='precision',
    cv=5,
    n_jobs=-1,  # use all available cores
)
grid.fit(X, y)
# Plot mean test precision and recall across the contamination grid so the
# precision/recall trade-off is visible at each candidate value.
plt.figure(figsize=(12, 4))
df_results = pd.DataFrame(grid.cv_results_)
for score in ['mean_test_recall', 'mean_test_precision']:
    plt.plot(df_results['param_contamination'],
             df_results[score],
             label=score)
plt.legend()
Note that we're not using `sample_weight` in our custom scorers, even though including it could have a large effect on the metrics.