import logging
import warnings
import numpy as np
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import ComplementNB, GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet


class RegressionModel:
    """Load a regression model, adjust its hyperparameters, and get the best model.

    Attributes
    ----------
    estimator : sklearn model object
        Trained model. If ``optimize`` is True, it is the model chosen by the randomized
        search, i.e., the estimator that gave the best result on the validation data.
    hyperparams : dict
        Hyperparameters of the model. If ``optimize`` is True, these are the best
        hyperparameters found for fitting the machine learning model.
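
    Examples
    --------
    A minimal usage sketch; the arrays below are synthetic placeholders rather
    than data from this project:

    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> X_train = rng.random((50, 3))
    >>> y_train = X_train @ np.array([1.0, 2.0, 3.0])
    >>> model = RegressionModel("ridge")
    >>> model.train(X_train, y_train)
    >>> y_pred = model.estimator.predict(X_train)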
"""
models = {
"linear": (LinearRegression, {}),
"ridge": (Ridge, {}),
"lasso": (Lasso, {}),
"elastic_net": (ElasticNet, {}),
"random_forest": (RandomForestRegressor, {}),
"support_vector_machine": (SVR, {}),
}

    def __init__(self, model_type: str):
        """Initialize the regression model.

        Parameters
        ----------
        model_type : str
            Name of the machine learning model that will be fitted for a regression task.
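
        Raises
        ------
        ValueError
            If ``model_type`` is not one of the supported model names.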
"""
# Check if the chosen regression model is available
if model_type not in RegressionModel.models.keys():
raise AssertionError(
f"Model type '{model_type}' was not found. "
f"The available models are {', '.join(RegressionModel.models.keys())}."
)
# Initialize regression model
self.model_type = model_type
model_class, params = RegressionModel.models[self.model_type]
self.estimator = model_class(**params)
# Initialize logger with info level
logging.basicConfig(encoding="utf-8", level=logging.INFO)
# Suppress warnings
warnings.filterwarnings(action="ignore", category=FutureWarning)

    def _model_selection(self,
                         X_train: np.ndarray,
                         y_train: np.ndarray,
                         n_iter: int = 100,
                         seed: int = None,
                         kfolds: int = 5):
        """Optimize the hyperparameters of the model and return the best model found.

        Parameters
        ----------
        X_train : np.ndarray
            Training input data.
        y_train : np.ndarray
            Training output data.
        n_iter : int, default 100
            Number of hyperparameter settings that are sampled. It trades off runtime
            and quality of the solution.
        seed : int, default None
            Controls the random sampling of the hyperparameter settings.
        kfolds : int, default 5
            Number of folds in the k-fold cross-validation.

        Returns
        -------
        estimator : sklearn model object
            Trained model. It is the model chosen by the randomized search, i.e., the
            estimator that gave the best result on the validation data.
        hyperparams : dict
            The best hyperparameters found for fitting the machine learning model.
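
        Notes
        -----
        With ``refit=True``, the search trains up to ``n_iter * kfolds`` models during
        cross-validation and then refits the best candidate on the full training set,
        so runtime grows linearly in both ``n_iter`` and ``kfolds``.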
"""
# Grid of possible hyperparameter values
if self.model_type == "ridge":
self.grid = {
"alpha": np.logspace(-6, 6, num=13)
}
elif self.model_type == "lasso":
self.grid = {
"alpha": np.logspace(-6, 6, num=13)
}
elif self.model_type == "random_forest":
self.grid = {
"n_estimators": np.arange(10, 201, 10),
"max_features": ["sqrt", "log2", None],
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4],
"bootstrap": [True, False]
}
elif self.model_type == "support_vector_machine":
self.grid = {
"C": np.logspace(-6, 6, num=13),
"epsilon": [0.01, 0.1, 0.5, 1.0],
"kernel": ["linear", "poly", "rbf", "sigmoid"]
}
elif self.model_type == "elastic_net":
self.grid = {
"alpha": np.logspace(-6, 6, num=13),
"l1_ratio": np.linspace(0, 1, 11)
}
elif self.model_type == "linear":
self.grid = {
"fit_intercept": [True, False]
}
# Scoring metric used for regression tasks
scoring = "neg_mean_squared_error"
        # Tune hyperparameters with randomized search and cross-validation
        search = RandomizedSearchCV(estimator=self.estimator,
                                    param_distributions=self.grid,
                                    n_iter=n_iter,
                                    cv=kfolds,
                                    random_state=seed,
                                    scoring=scoring,
                                    refit=True)
        logging.info("Tuning hyperparameters ...")
        with warnings.catch_warnings():
            # NOTE: Suppress DeprecationWarning from scikit-learn Ridge regression
            # when calling scipy.linalg.solve() with the deprecated 'sym_pos' argument.
            warnings.filterwarnings("ignore", message="The 'sym_pos' keyword is deprecated")
            search.fit(X_train, y_train)
        # Get the best model and its respective hyperparameters
        estimator = search.best_estimator_
        hyperparams = search.best_params_
        return estimator, hyperparams

    def train(self,
              X_train: np.ndarray,
              y_train: np.ndarray,
              seed: int = None,
              kfolds: int = 10,
              n_iter: int = 100,
              optimize: bool = False,
              verbose: bool = False):
        """Build and train a regression model.

        Parameters
        ----------
        X_train : np.ndarray
            Training input data.
        y_train : np.ndarray
            Training output data.
        seed : int, default None
            Controls the random sampling of the hyperparameter settings.
        kfolds : int, default 10
            Number of folds in the k-fold cross-validation.
        n_iter : int, default 100
            Number of hyperparameter settings that are sampled. It trades off runtime
            and quality of the solution.
        optimize : bool, default False
            If True, optimize the hyperparameters of the model and keep the best one.
        verbose : bool, default False
            If True, show logs.
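
        Examples
        --------
        A minimal sketch of hyperparameter tuning; the data are synthetic
        placeholders, and the small ``n_iter``/``kfolds`` values are chosen
        only to keep the example fast:

        >>> import numpy as np
        >>> rng = np.random.default_rng(0)
        >>> X_train = rng.random((60, 4))
        >>> y_train = X_train.sum(axis=1)
        >>> model = RegressionModel("elastic_net")
        >>> model.train(X_train, y_train, optimize=True, n_iter=5, kfolds=3, seed=0)
        >>> sorted(model.hyperparams)
        ['alpha', 'l1_ratio']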
"""
logging.getLogger().disabled = False if verbose else True
if optimize:
# Best results before tuning
self.estimator, self.hyperparams = self._model_selection(X_train=X_train,
y_train=y_train,
seed=seed,
kfolds=kfolds,
n_iter=n_iter)
else:
logging.info("Fitting model ...")
self.estimator.fit(X_train, y_train)
self.hyperparams = self.estimator.get_params()
logging.info(f"Hyperparameters: {self.hyperparams}")


class ClassificationModel:
    """Load a classification model, adjust its hyperparameters, and get the best model.

    Attributes
    ----------
    estimator : sklearn model object
        Trained model. If ``optimize`` is True, it is the model chosen by the randomized
        search, i.e., the estimator that gave the best result on the validation data.
    hyperparams : dict
        Hyperparameters of the model. If ``optimize`` is True, these are the best
        hyperparameters found for fitting the machine learning model.
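
    Examples
    --------
    A minimal usage sketch; the arrays below are synthetic placeholders rather
    than data from this project:

    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> X_train = rng.random((60, 3))
    >>> y_train = (X_train[:, 0] > 0.5).astype(int)
    >>> model = ClassificationModel("random_forest")
    >>> model.train(X_train, y_train)
    >>> y_pred = model.estimator.predict(X_train)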
"""
models = {
"complement_naive_bayes": (ComplementNB, {}),
"gaussian_naive_bayes": (GaussianNB, {}),
"k_nearest_neighbors": (KNeighborsClassifier, {"n_neighbors": 1}),
"logistic_regression": (LogisticRegression, {}),
"multinomial_naive_bayes": (MultinomialNB, {}),
"random_forest": (RandomForestClassifier, {"n_jobs": -1}),
"support_vector_machine": (SVC, {}),
}

    def __init__(self, model_type: str):
        """Initialize the classification model.

        Parameters
        ----------
        model_type : str
            Name of the machine learning model that will be fitted for a classification task.
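
        Raises
        ------
        ValueError
            If ``model_type`` is not one of the supported model names.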
"""
# Check if the chosen classification model is available
if not model_type in ClassificationModel.models.keys():
raise AssertionError(
f"Model type '{model_type}' was not found. "
f"The available models are {', '.join(ClassificationModel.models.keys())}."
)
# Initialize classification model
self.model_type = model_type
model_class, params = ClassificationModel.models[self.model_type]
self.estimator = model_class(**params)
# Initialize logger with info level
logging.basicConfig(encoding="utf-8", level=logging.INFO)
# Supress divide-by-zero warnings
warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)
# Supress future warnings
warnings.filterwarnings(action="ignore", category=FutureWarning)

    def _model_selection(self,
                         X_train: np.ndarray,
                         y_train: np.ndarray,
                         n_iter: int = 100,
                         seed: int = None,
                         kfolds: int = 5):
        """Optimize the hyperparameters of the model and return the best model found.

        Parameters
        ----------
        X_train : np.ndarray
            Training input data.
        y_train : np.ndarray
            Training output data.
        n_iter : int, default 100
            Number of hyperparameter settings that are sampled. It trades off runtime
            and quality of the solution.
        seed : int, default None
            Controls the random sampling of the hyperparameter settings.
        kfolds : int, default 5
            Number of folds in the k-fold cross-validation.

        Returns
        -------
        estimator : sklearn model object
            Trained model. It is the model chosen by the randomized search, i.e., the
            estimator that gave the best result on the validation data.
        hyperparams : dict
            The best hyperparameters found for fitting the machine learning model.
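
        Notes
        -----
        Because the estimator is a classifier, passing an integer ``kfolds`` makes
        scikit-learn use stratified k-fold cross-validation, which preserves the
        class proportions in each fold.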
"""
# Grid of possible hyperparameter values
if self.model_type == "support_vector_machine":
self.grid = {
"kernel": ["linear", "poly", "rbf", "sigmoid"],
"degree": np.arange(1, 3),
"gamma": ["scale", "auto"],
"class_weight": [None, "balanced"]
}
elif self.model_type == "random_forest":
self.grid = {
"n_estimators": np.arange(1, 250, 50),
"criterion": ["gini", "entropy"],
"min_samples_split": [2, 0.1, 0.2, 0.3, 0.4],
"min_samples_leaf": [1, 0.1, 0.2, 0.3, 0.4, 0.5],
"max_features": ["sqrt", "log2", None],
"class_weight": ["balanced_subsample", None],
"ccp_alpha": np.arange(0, 0.6, 0.1),
"max_samples": np.arange(0.1, 1.0, 0.1)
}
elif self.model_type in ["complement_naive_bayes", "multinomial_naive_bayes"]:
self.grid = {
"alpha": [0.01, 0.1, 0.5, 1.0, 10.0],
"fit_prior": [True, False]
}
elif self.model_type in ["gaussian_naive_bayes"]:
self.grid = {
"var_smoothing": np.logspace(0,-9, num=100)
}
elif self.model_type == "k_nearest_neighbors":
self.grid = {
"n_neighbors": np.arange(1, 31, 1)
}
        # Scoring metric chosen according to the classification task: macro-averaged
        # F1 for multiclass targets, positive-class F1 for binary targets
        scoring = "f1_macro" if np.unique(y_train).shape[0] > 2 else "f1"
        # Tune hyperparameters with randomized search and cross-validation
        search = RandomizedSearchCV(estimator=self.estimator,
                                    param_distributions=self.grid,
                                    n_iter=n_iter,
                                    cv=kfolds,
                                    random_state=seed,
                                    scoring=scoring,
                                    refit=True)
        logging.info("Tuning hyperparameters ...")
        search.fit(X_train, y_train)
        # Get the best model and its respective hyperparameters
        estimator = search.best_estimator_
        hyperparams = search.best_params_
        return estimator, hyperparams

    def train(self,
              X_train: np.ndarray,
              y_train: np.ndarray,
              seed: int = None,
              kfolds: int = 10,
              n_iter: int = 100,
              optimize: bool = False,
              verbose: bool = False):
        """Build and train a classification model.

        Parameters
        ----------
        X_train : np.ndarray
            Training input data.
        y_train : np.ndarray
            Training output data.
        seed : int, default None
            Controls the random sampling of the hyperparameter settings.
        kfolds : int, default 10
            Number of folds in the k-fold cross-validation.
        n_iter : int, default 100
            Number of hyperparameter settings that are sampled. It trades off runtime
            and quality of the solution.
        optimize : bool, default False
            If True, optimize the hyperparameters of the model and keep the best one.
        verbose : bool, default False
            If True, show logs.
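
        Examples
        --------
        A minimal sketch of hyperparameter tuning; the data are synthetic
        placeholders, and the small ``n_iter``/``kfolds`` values are chosen
        only to keep the example fast:

        >>> import numpy as np
        >>> rng = np.random.default_rng(0)
        >>> X_train = rng.random((60, 4))
        >>> y_train = (X_train[:, 0] > 0.5).astype(int)
        >>> model = ClassificationModel("k_nearest_neighbors")
        >>> model.train(X_train, y_train, optimize=True, n_iter=5, kfolds=3, seed=0)
        >>> sorted(model.hyperparams)
        ['n_neighbors']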
"""
logging.getLogger().disabled = False if verbose else True
if optimize:
# Best results before tuning
self.estimator, self.hyperparams = self._model_selection(X_train=X_train,
y_train=y_train,
seed=seed,
kfolds=kfolds,
n_iter=n_iter)
else:
logging.info("Fitting model ...")
self.estimator.fit(X_train, y_train)
self.hyperparams = self.estimator.get_params()
logging.info(f"Hyperparameters: {self.hyperparams}")