import csv
import datetime
import random
from operator import itemgetter
import lightgbm as lgb
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import (
AdaBoostClassifier,
AdaBoostRegressor,
BaggingClassifier,
BaggingRegressor,
RandomForestClassifier,
RandomForestRegressor,
)
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import (
ElasticNet,
LogisticRegression,
SGDClassifier,
SGDRegressor,
)
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor
class TrainModel:
    """
    Select & train models

    Holds hyper-parameter grids for a set of scikit-learn, XGBoost, LightGBM
    and CatBoost estimators, plus helpers for id-based train/test splitting,
    grid searching, feature selection and chunked (partial_fit) training.
    """

    def __init__(self):
        # Each entry is one GridSearchCV ``param_grid`` dict for a Pipeline
        # whose single step is named "model": the "model" key swaps the
        # estimator and the "model__*" keys tune that estimator.
        # The order matters: model_testing() slices the first three
        # classifier grids.
        self.classifier_param_list = [
            {
                "model": [DecisionTreeClassifier()],
                "model__min_samples_split": [0.25, 0.5, 1.0],
                "model__max_depth": [5, 10, 15],
            },
            {
                "model": [RandomForestClassifier()],
                "model__min_samples_split": [0.25, 0.5, 1.0],
                "model__max_depth": [5, 10, 15],
            },
            {
                "model": [MLPClassifier()],
                "model__activation": ["identity", "logistic", "tanh", "relu"],
                "model__alpha": [0.001, 0.01, 0.1],
            },
            {
                "model": [LogisticRegression(fit_intercept=False)],
                "model__C": [1, 5, 10],
            },
            {
                "model": [BaggingClassifier()],
                "model__n_estimators": [5, 10, 15],
                "model__max_features": [0.25, 0.5, 1.0],
            },
            {
                "model": [AdaBoostClassifier()],
                "model__n_estimators": [5, 10, 15],
                "model__learning_rate": [0.001, 0.01, 0.1],
            },
            {
                "model": [XGBClassifier()],
                "model__n_estimators": [5, 10, 15],
                "model__learning_rate": [0.001, 0.01, 0.1],
            },
            {
                "model": [lgb.LGBMClassifier()],
                "model__learning_rate": [0.01, 0.001],
            },
            {
                "model": [CatBoostClassifier()],
                "model__learning_rate": [0.01, 0.001],
            },
        ]
        self.regressor_param_list = [
            {
                "model": [DecisionTreeRegressor()],
                "model__min_samples_split": [0.25, 0.5, 1.0],
                "model__max_depth": [5, 10, 15],
            },
            {
                "model": [RandomForestRegressor()],
                "model__min_samples_split": [0.25, 0.5, 1.0],
                "model__max_depth": [5, 10, 15],
            },
            {
                "model": [MLPRegressor()],
                "model__activation": ["identity", "logistic", "tanh", "relu"],
                "model__alpha": [0.001, 0.01, 0.1],
            },
            {
                "model": [ElasticNet(fit_intercept=False)],
                "model__alpha": [0.001, 0.01, 0.1],
                "model__l1_ratio": [0.25, 0.5, 1.0],
            },
            {
                "model": [BaggingRegressor()],
                "model__n_estimators": [5, 10, 15],
                "model__max_features": [0.25, 0.5, 1.0],
            },
            {
                "model": [AdaBoostRegressor()],
                "model__n_estimators": [5, 10, 15],
                "model__learning_rate": [0.001, 0.01, 0.1],
            },
            {
                "model": [XGBRegressor()],
                "model__n_estimators": [5, 10, 15],
                "model__learning_rate": [0.001, 0.01, 0.1],
            },
            {
                "model": [lgb.LGBMRegressor()],
                "model__learning_rate": [0.01, 0.001],
            },
            {
                "model": [CatBoostRegressor()],
                "model__learning_rate": [0.01, 0.001],
            },
        ]

    def train_test_split_by_ids(self, df, id_col, target_col, prop_train):
        """
        Split rows into train/test sets so that every id lands in exactly one
        of the two sets.

        The unique ids are shuffled with the stdlib ``random`` module (seed it
        with ``random.seed`` for a reproducible split) and the first
        ``int(n_ids * prop_train)`` ids form the training set.
        Both returned feature frames still contain ``id_col`` and
        ``target_col``; nothing is dropped here.

        Parameters
        --------
        df: DataFrame
        id_col: str
        target_col: str
        prop_train: float
            Proportion of unique ids (not rows) assigned to the training set

        Returns
        --------
        X_train: DataFrame
        X_test: DataFrame
        y_train: Series
        y_test: Series
        """
        # unique() keeps first-appearance order, so the list handed to
        # shuffle() is the same on every run (set() ordering is not).
        ids = list(df[id_col].unique())
        random.shuffle(ids)
        number_to_select = int(len(ids) * prop_train)
        train_ids = ids[:number_to_select]
        test_ids = ids[number_to_select:]

        # Rows keep their original relative order; the index is renumbered.
        X_train = df[df[id_col].isin(train_ids)].reset_index(drop=True)
        X_test = df[df[id_col].isin(test_ids)].reset_index(drop=True)
        y_train = X_train[target_col]
        y_test = X_test[target_col]
        return X_train, X_test, y_train, y_test

    def model_testing(
        self,
        X_train,
        y_train,
        model_type,
        tie_breaker_scoring_method,
        save_to_csv=True,
        file_name="gridsearching_results",
        multiclass=False,
    ):
        """
        Gridsearches for the best models/params out of a list of commonly used

        Runs a 3-fold GridSearchCV over the stored parameter grids, prints the
        top 30 parameter sets (ranked by the first scoring method) and
        optionally appends them to ``<file_name>.csv``.

        Parameters
        --------
        X_train: DataFrame
        y_train: DataFrame
        model_type: str
            'classification' or 'regression'
        tie_breaker_scoring_method: str
            Passed to GridSearchCV as ``refit``.
            For classification: "precision", "recall", or "f1"
            (the "*_weighted" variants when multiclass is True)
            For regression: "neg_root_mean_squared_error",
            "neg_median_absolute_error", or "r2"
        save_to_csv: bool
        file_name: str
            CSV path without the ".csv" extension
        multiclass: bool

        Returns
        --------
        best_params: dict
        """
        if model_type == "classification":
            placeholder = LogisticRegression()
            # Only some models/scoring work with multiclass
            # NOTE(review): the binary case is limited to the same first three
            # grids as multiclass; confirm whether it was meant to search the
            # full classifier list.
            param_list = self.classifier_param_list[:3]
            if multiclass:
                lst_scoring_methods = [
                    "recall_weighted",
                    "precision_weighted",
                    "f1_weighted",
                ]
                score_labels = [
                    "Mean Recall Weighted",
                    "Mean Precision Weighted",
                    "Mean F1 Weighted",
                ]
            else:
                lst_scoring_methods = ["recall", "precision", "f1"]
                score_labels = ["Mean Recall", "Mean Precision", "Mean F1 Score"]
        else:
            placeholder = ElasticNet()
            param_list = self.regressor_param_list
            lst_scoring_methods = [
                "neg_root_mean_squared_error",
                "neg_median_absolute_error",
                "r2",
            ]
            score_labels = [
                "Mean Negative Root Mean Squared Error",
                "Mean Negative Median Absolute Error",
                "Mean R2",
            ]

        # The placeholder estimator is replaced by each grid's "model" entry.
        pipeline = Pipeline([("model", placeholder)])
        g = GridSearchCV(
            pipeline,
            param_list,
            cv=3,
            n_jobs=-2,
            verbose=2,
            scoring=lst_scoring_methods,
            refit=tie_breaker_scoring_method,
        )
        g.fit(X_train, y_train)

        # One tuple per candidate: (params, score_1, score_2, score_3), in the
        # same order as lst_scoring_methods; best first-metric score on top.
        all_scores = list(
            zip(
                g.cv_results_["params"],
                *(
                    g.cv_results_["mean_test_{}".format(method)]
                    for method in lst_scoring_methods
                ),
            )
        )
        all_scores.sort(key=itemgetter(1), reverse=True)
        top_scores = all_scores[:30]

        # Cleaner printing
        print("\n\n")
        print(
            "*** Best Parameters Using {} | Tie Breaker: {} | {} ***".format(
                lst_scoring_methods,
                tie_breaker_scoring_method,
                datetime.datetime.today().strftime("%Y-%m-%d %H:%M"),
            )
        )
        for params, *scores in top_scores:
            lines = ["Params: {}".format(params)]
            lines.extend(
                "{}: {:.4f}".format(label, score)
                for label, score in zip(score_labels, scores)
            )
            print("{}\n\n".format("\n".join(lines)))

        if save_to_csv:
            self._append_results_to_csv(top_scores, lst_scoring_methods, file_name)
        return g.best_params_

    @staticmethod
    def _append_results_to_csv(scored_params, scoring_methods, file_name):
        """Append (params, *scores) tuples to ``<file_name>.csv``, creating it if needed."""
        rows = []
        for params, *scores in scored_params:
            row = dict(params)
            row.update(zip(scoring_methods, scores))
            rows.append(row)
        new_results = pd.DataFrame(rows)
        new_results["timestamp"] = datetime.datetime.today().isoformat(
            "-", "seconds"
        )

        csv_path = "{}.csv".format(file_name)
        try:
            previous_results = pd.read_csv(csv_path)
        except (OSError, ValueError):
            # Missing, empty or unparsable file: start a fresh results file.
            # (pandas' EmptyDataError and ParserError subclass ValueError.)
            results = new_results
        else:
            results = pd.concat([previous_results, new_results], axis=0)
            results.reset_index(inplace=True, drop=True)

        # Alphabetical columns keep the layout stable across appended runs.
        results = results.reindex(sorted(results.columns), axis=1)
        results.to_csv(csv_path, index=False)

    def run_feature_selection(self, X_train, y_train, k, is_multiclass):
        """
        Run SelectKBest feature selection for given datasets.
        Implements a custom method of feature selection for multiclass targets:
        chi2 scores are computed per target column, averaged, and the k
        features with the highest mean score are kept (best first).
        Otherwise SelectKBest's default scoring is used and the kept columns
        are returned in their original column order.

        Parameters
        --------
        X_train: DataFrame
        y_train: DataFrame
            When is_multiclass is True, one column per target label
        k: int or "all"
        is_multiclass: bool

        Returns
        --------
        cols_to_keep: list
        """
        if is_multiclass:
            per_label_scores = []
            for label in y_train.columns:
                # Select from all first, limit to k after means
                selector = SelectKBest(chi2, k="all")
                selector.fit(X_train, y_train[label])
                per_label_scores.append(selector.scores_)
            # Get mean feature scoring across all target classes
            mean_feature_scores = np.mean(per_label_scores, axis=0)
            return self._top_scoring_columns(X_train.columns, mean_feature_scores, k)

        selector = SelectKBest(k=k)
        selector.fit(X_train, y_train)
        return list(X_train.columns[selector.get_support()].values)

    @staticmethod
    def _top_scoring_columns(columns, scores, k):
        """Return the k labels from ``columns`` with the highest scores, best first."""
        # Null scores count as 0 so they sort last instead of poisoning argsort
        cleaned_scores = np.nan_to_num(np.asarray(scores, dtype=float))
        ranked_idxs = np.argsort(cleaned_scores)[::-1]
        if k != "all":
            ranked_idxs = ranked_idxs[:k]
        # Index one label at a time: always yields a list, even for k == 1
        return [columns[idx] for idx in ranked_idxs]

    def train_in_chunks(
        self, X_train, y_train, model_type, is_multiclass, chunk_sizes=500000
    ):
        """
        For large datasets, train model in manageable chunk sizes

        The rows are fed to ``partial_fit`` in consecutive slices. The model
        is a MultiOutputClassifier(MultinomialNB()) when is_multiclass is
        True, otherwise SGDClassifier for 'classification' and SGDRegressor
        for anything else.

        Parameters
        --------
        X_train: DataFrame
        y_train: DataFrame
            When is_multiclass is True, one column per target label
        model_type: str
            'classification' or 'regression'
        is_multiclass: bool
        chunk_sizes: int
            Approximate number of rows per partial_fit call

        Returns
        --------
        model: sklearn model
        """
        # The full set of classes must be known up front because a single
        # chunk may not contain every class; compute it once, not per chunk.
        if is_multiclass:
            # Need this custom model to do partial_fit with multiclass data
            model = MultiOutputClassifier(MultinomialNB())
            fit_kwargs = {
                "classes": [np.unique(y_train[col]) for col in y_train.columns]
            }
        elif model_type == "classification":
            model = SGDClassifier()
            fit_kwargs = {"classes": np.unique(y_train)}
        else:
            model = SGDRegressor()
            fit_kwargs = {}

        for start, stop in self._chunk_bounds(y_train.shape[0], chunk_sizes):
            model.partial_fit(
                X_train.iloc[start:stop],
                y_train.iloc[start:stop],
                **fit_kwargs,
            )
        return model

    @staticmethod
    def _chunk_bounds(n_rows, chunk_sizes):
        """Return the (start, stop) row slices that cover ``n_rows`` in roughly equal chunks."""
        # Number of chunks needed to get manageable chunk sizes, never below 1
        num_chunks = max(1, round(n_rows / chunk_sizes))
        # A step of at least 1 guarantees the slices always advance
        step = max(1, round(n_rows / num_chunks))
        return [
            (start, min(start + step, n_rows)) for start in range(0, n_rows, step)
        ]