# Source code for pyccea.utils.models

import logging
import warnings
import numpy as np
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import ComplementNB, GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet


class RegressionModel():
    """Load a regression model, adjust its hyperparameters and get the best model.

    Attributes
    ----------
    estimator : sklearn model object
        Trained model. In case optimize is True, it is the model that was chosen by the
        Randomized Search, i.e., estimator which gave the best result on the validation
        data.
    hyperparams : dict
        Hyperparameters of the model. In case optimize is True, it is the best
        hyperparameters used to fit the machine learning model.
    """

    # Mapping from model name to (estimator class, constructor keyword arguments).
    models = {
        "linear": (LinearRegression, {}),
        "ridge": (Ridge, {}),
        "lasso": (Lasso, {}),
        "elastic_net": (ElasticNet, {}),
        "random_forest": (RandomForestRegressor, {}),
        "support_vector_machine": (SVR, {}),
    }

    def __init__(self, model_type: str):
        """Initialize the regression model.

        Parameters
        ----------
        model_type : str
            Name of the machine learning model that will be fitted for a regression task.

        Raises
        ------
        AssertionError
            If `model_type` is not one of the available models.
        """
        # Check if the chosen regression model is available
        if model_type not in RegressionModel.models:
            raise AssertionError(
                f"Model type '{model_type}' was not found. "
                f"The available models are {', '.join(RegressionModel.models.keys())}."
            )
        # Initialize regression model
        self.model_type = model_type
        model_class, params = RegressionModel.models[self.model_type]
        self.estimator = model_class(**params)
        # Initialize logger with info level
        logging.basicConfig(encoding="utf-8", level=logging.INFO)
        # Suppress warnings
        warnings.filterwarnings(action="ignore", category=FutureWarning)

    def _build_grid(self) -> dict:
        """Return the hyperparameter search space for the current model type.

        Returns
        -------
        dict
            Parameter distributions suitable for `RandomizedSearchCV`.
        """
        grids = {
            "linear": {
                "fit_intercept": [True, False]
            },
            "ridge": {
                "alpha": np.logspace(-6, 6, num=13)
            },
            "lasso": {
                "alpha": np.logspace(-6, 6, num=13)
            },
            "elastic_net": {
                "alpha": np.logspace(-6, 6, num=13),
                "l1_ratio": np.linspace(0, 1, 11)
            },
            "random_forest": {
                "n_estimators": np.arange(10, 201, 10),
                "max_features": ["sqrt", "log2", None],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4],
                "bootstrap": [True, False]
            },
            "support_vector_machine": {
                "C": np.logspace(-6, 6, num=13),
                "epsilon": [0.01, 0.1, 0.5, 1.0],
                "kernel": ["linear", "poly", "rbf", "sigmoid"]
            },
        }
        return grids[self.model_type]

    def _model_selection(self,
                         X_train: np.ndarray,
                         y_train: np.ndarray,
                         n_iter: int = 100,
                         seed: int = None,
                         kfolds: int = 5):
        """Optimize the hyperparameters of the model and return the best model found.

        Parameters
        ----------
        X_train : np.ndarray
            Train input data.
        y_train : np.ndarray
            Train output data.
        n_iter : int, default 100
            Number of hyperparameter settings that are sampled. It trades off runtime and
            quality of the solution.
        seed : int, default None
            Controls the shuffling applied for subsampling the data.
        kfolds : int, default 5
            Number of folds in the k-fold cross validation.

        Returns
        -------
        estimator : sklearn model object
            Trained model. It is the model that was chosen by the Randomized Search,
            i.e., estimator which gave the best result on the validation data.
        hyperparams : dict
            Hyperparameters of the model. It is the best hyperparameters used to fit the
            machine learning model.
        """
        # Grid of possible hyperparameter values (kept on the instance for inspection)
        self.grid = self._build_grid()
        # Scoring metric used for regression tasks
        scoring = "neg_mean_squared_error"
        # Tuning hyperparameters with Randomized Search with Cross Validation
        search = RandomizedSearchCV(estimator=self.estimator,
                                    param_distributions=self.grid,
                                    n_iter=n_iter,
                                    cv=kfolds,
                                    random_state=seed,
                                    scoring=scoring,
                                    refit=True)
        logging.info("Tuning hyperparameters ...")
        with warnings.catch_warnings():
            # NOTE: Suppress DeprecationWarning from scikit-learn Ridge regression
            # when calling scipy.linalg.solve() with deprecated 'sym_pos' argument.
            warnings.filterwarnings("ignore", message="The 'sym_pos' keyword is deprecated")
            search.fit(X_train, y_train)
        # Get best model and its respective hyperparameters
        estimator = search.best_estimator_
        hyperparams = search.best_params_
        return estimator, hyperparams

    def train(self,
              X_train: np.ndarray,
              y_train: np.ndarray,
              seed: int = None,
              kfolds: int = 10,
              n_iter: int = 100,
              optimize: bool = False,
              verbose: bool = False):
        """Build and train a regression model.

        Parameters
        ----------
        X_train : np.ndarray
            Train input data.
        y_train : np.ndarray
            Train output data.
        seed : int, default None
            Controls the shuffling applied for subsampling the data.
        kfolds : int, default 10
            Number of folds in the k-fold cross validation.
        n_iter : int, default 100
            Number of hyperparameter settings that are sampled. It trades off runtime and
            quality of the solution.
        optimize : bool, default False
            If True, optimize the hyperparameters of the model and return the best model.
        verbose : bool, default False
            If True, show logs.
        """
        logging.getLogger().disabled = not verbose
        if optimize:
            # Best model and hyperparameters found by the randomized search
            self.estimator, self.hyperparams = self._model_selection(X_train=X_train,
                                                                     y_train=y_train,
                                                                     seed=seed,
                                                                     kfolds=kfolds,
                                                                     n_iter=n_iter)
        else:
            logging.info("Fitting model ...")
            self.estimator.fit(X_train, y_train)
            self.hyperparams = self.estimator.get_params()
        logging.info(f"Hyperparameters: {self.hyperparams}")
class ClassificationModel():
    """Load a classification model, adjust its hyperparameters and get the best model.

    Attributes
    ----------
    estimator : sklearn model object
        Trained model. In case optimize is True, it is the model that was chosen by the
        Randomized Search, i.e., estimator which gave the best result on the validation
        data.
    hyperparams : dict
        Hyperparameters of the model. In case optimize is True, it is the best
        hyperparameters used to fit the machine learning model.
    """

    # Mapping from model name to (estimator class, constructor keyword arguments).
    models = {
        "complement_naive_bayes": (ComplementNB, {}),
        "gaussian_naive_bayes": (GaussianNB, {}),
        "k_nearest_neighbors": (KNeighborsClassifier, {"n_neighbors": 1}),
        "logistic_regression": (LogisticRegression, {}),
        "multinomial_naive_bayes": (MultinomialNB, {}),
        "random_forest": (RandomForestClassifier, {"n_jobs": -1}),
        "support_vector_machine": (SVC, {}),
    }

    def __init__(self, model_type: str):
        """Initialize the classification model.

        Parameters
        ----------
        model_type : str
            Name of the machine learning model that will be fitted for a classification
            task.

        Raises
        ------
        AssertionError
            If `model_type` is not one of the available models.
        """
        # Check if the chosen classification model is available
        if model_type not in ClassificationModel.models:
            raise AssertionError(
                f"Model type '{model_type}' was not found. "
                f"The available models are {', '.join(ClassificationModel.models.keys())}."
            )
        # Initialize classification model
        self.model_type = model_type
        model_class, params = ClassificationModel.models[self.model_type]
        self.estimator = model_class(**params)
        # Initialize logger with info level
        logging.basicConfig(encoding="utf-8", level=logging.INFO)
        # Suppress divide-by-zero warnings
        warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)
        # Suppress future warnings
        warnings.filterwarnings(action="ignore", category=FutureWarning)

    def _build_grid(self) -> dict:
        """Return the hyperparameter search space for the current model type.

        Returns
        -------
        dict
            Parameter distributions suitable for `RandomizedSearchCV`.
        """
        grids = {
            "support_vector_machine": {
                "kernel": ["linear", "poly", "rbf", "sigmoid"],
                "degree": np.arange(1, 3),
                "gamma": ["scale", "auto"],
                "class_weight": [None, "balanced"]
            },
            "random_forest": {
                "n_estimators": np.arange(1, 250, 50),
                "criterion": ["gini", "entropy"],
                "min_samples_split": [2, 0.1, 0.2, 0.3, 0.4],
                "min_samples_leaf": [1, 0.1, 0.2, 0.3, 0.4, 0.5],
                "max_features": ["sqrt", "log2", None],
                "class_weight": ["balanced_subsample", None],
                "ccp_alpha": np.arange(0, 0.6, 0.1),
                "max_samples": np.arange(0.1, 1.0, 0.1)
            },
            "complement_naive_bayes": {
                "alpha": [0.01, 0.1, 0.5, 1.0, 10.0],
                "fit_prior": [True, False]
            },
            "multinomial_naive_bayes": {
                "alpha": [0.01, 0.1, 0.5, 1.0, 10.0],
                "fit_prior": [True, False]
            },
            "gaussian_naive_bayes": {
                "var_smoothing": np.logspace(0, -9, num=100)
            },
            "k_nearest_neighbors": {
                "n_neighbors": np.arange(1, 31, 1)
            },
        }
        return grids[self.model_type]

    def _model_selection(self,
                         X_train: np.ndarray,
                         y_train: np.ndarray,
                         n_iter: int = 100,
                         seed: int = None,
                         kfolds: int = 5):
        """Optimize the hyperparameters of the model and return the best model found.

        Parameters
        ----------
        X_train : np.ndarray
            Train input data.
        y_train : np.ndarray
            Train output data.
        n_iter : int, default 100
            Number of hyperparameter settings that are sampled. It trades off runtime and
            quality of the solution.
        seed : int, default None
            Controls the shuffling applied for subsampling the data.
        kfolds : int, default 5
            Number of folds in the k-fold cross validation.

        Returns
        -------
        estimator : sklearn model object
            Trained model. It is the model that was chosen by the Randomized Search,
            i.e., estimator which gave the best result on the validation data.
        hyperparams : dict
            Hyperparameters of the model. It is the best hyperparameters used to fit the
            machine learning model.
        """
        # Grid of possible hyperparameter values (kept on the instance for inspection)
        self.grid = self._build_grid()
        # Scoring metric used according to the classification task:
        # macro-averaged F1 for multiclass, binary F1 otherwise.
        scoring = "f1_macro" if np.unique(y_train).shape[0] > 2 else "f1"
        # Tuning hyperparameters with Randomized Search with Cross Validation
        search = RandomizedSearchCV(estimator=self.estimator,
                                    param_distributions=self.grid,
                                    n_iter=n_iter,
                                    cv=kfolds,
                                    random_state=seed,
                                    scoring=scoring,
                                    refit=True)
        logging.info("Tuning hyperparameters ...")
        search.fit(X_train, y_train)
        # Get best model and its respective hyperparameters
        estimator = search.best_estimator_
        hyperparams = search.best_params_
        return estimator, hyperparams

    def train(self,
              X_train: np.ndarray,
              y_train: np.ndarray,
              seed: int = None,
              kfolds: int = 10,
              n_iter: int = 100,
              optimize: bool = False,
              verbose: bool = False):
        """Build and train a classification model.

        Parameters
        ----------
        X_train : np.ndarray
            Train input data.
        y_train : np.ndarray
            Train output data.
        seed : int, default None
            Controls the shuffling applied for subsampling the data.
        kfolds : int, default 10
            Number of folds in the k-fold cross validation.
        n_iter : int, default 100
            Number of hyperparameter settings that are sampled. It trades off runtime and
            quality of the solution.
        optimize : bool, default False
            If True, optimize the hyperparameters of the model and return the best model.
        verbose : bool, default False
            If True, show logs.
        """
        logging.getLogger().disabled = not verbose
        if optimize:
            # Best model and hyperparameters found by the randomized search
            self.estimator, self.hyperparams = self._model_selection(X_train=X_train,
                                                                     y_train=y_train,
                                                                     seed=seed,
                                                                     kfolds=kfolds,
                                                                     n_iter=n_iter)
        else:
            logging.info("Fitting model ...")
            self.estimator.fit(X_train, y_train)
            self.hyperparams = self.estimator.get_params()
        logging.info(f"Hyperparameters: {self.hyperparams}")