import logging
import os
import threading
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from typing import Optional

import numpy as np

from ..utils.datasets import DataLoader
from ..utils.memory import force_memory_release
class CCEA(ABC):
""" An abstract class for a Cooperative Co-Evolutionary-Based Feature Selection Algorithm.
Attributes
----------
subpop_sizes: list
Subpopulation sizes, that is, the number of individuals in each subpopulation.
    decomposer: object of one of the decomposition classes
        Responsible for decomposing the problem into smaller subproblems.
    collaborator: object of one of the collaboration classes
        Responsible for selecting collaborators for individuals.
    fitness_function: object of one of the fitness classes
        Responsible for evaluating individuals, that is, subsets of features.
    initializer: object of one of the subpopulation initializers
        Responsible for initializing all individuals of all subpopulations.
optimizers: list of objects of optimizer classes
Responsible for evolving each of the subpopulations individually.
subpops: list
        Individuals from all subpopulations. Each individual is a binary
        n-dimensional array, where n is the number of features: a 1 in the i-th
        position indicates that the i-th feature is selected, and a 0 indicates
        that it is not.
fitness: list
Evaluation of all context vectors from all subpopulations.
context_vectors: list
Complete problem solutions.
convergence_curve: list
Best global fitness in each generation.
current_best: dict
Current best individual of each subpopulation and its respective evaluation.
best_context_vector: np.ndarray
Best solution of the complete problem.
best_fitness: float
Evaluation of the best solution of the complete problem.
    feature_idxs: np.ndarray
        Array of feature indexes.
best_context_vectors: list
Best context vector in each generation.
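
    Notes
    -----
    As a concrete example of the encoding above, with ``n = 4`` features the
    individual ``np.array([1, 0, 1, 0])`` selects features 0 and 2 and
    discards features 1 and 3.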
"""
def __init__(self, data: DataLoader, conf: dict, verbose: bool = True):
"""
Parameters
----------
data: DataLoader
            Container with the process data and the training and test sets.
conf: dict
Configuration parameters of the cooperative coevolutionary algorithm.
verbose: bool, default True
If True, show the improvements obtained from the optimization process.
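
        Notes
        -----
        The full configuration schema is defined by the project's configuration
        files. As a rough, illustrative sketch (the values below are placeholders),
        the keys read in this constructor look like::

            conf = {
                "coevolution": {
                    "seed": 42,
                    "subpop_sizes": [20],
                    "n_subcomps": 5,
                    "max_best_context_vectors": 10,
                    "memory_profile": False,
                    "memory_log_every": 0,
                },
                "evaluation": {"n_workers": 1},
            }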
"""
# Seed
self.seed = conf["coevolution"].get("seed")
# Verbose
self.verbose = verbose
# Data
self.data = data
# Size of each subpopulation
self.subpop_sizes = conf["coevolution"]["subpop_sizes"]
# Number of subcomponents
self.n_subcomps = conf["coevolution"].get("n_subcomps")
if self.n_subcomps:
if self.n_subcomps != len(self.subpop_sizes):
if len(self.subpop_sizes) == 1:
subpop_size = self.subpop_sizes[0]
logging.info(f"Considering all subpopulations with size {subpop_size}.")
self.subpop_sizes = [subpop_size] * self.n_subcomps
else:
raise AssertionError(
f"The number of subcomponents ({self.n_subcomps}) is not equal to the "
f"number of subpopulations ({len(self.subpop_sizes)}). Check parameters "
"'n_subcomps' and 'subpop_sizes' in the configuration file."
)
# Number of features in each subcomponent
self.subcomp_sizes = conf["coevolution"].get("subcomp_sizes")
if self.subcomp_sizes:
if len(self.subcomp_sizes) != len(self.subpop_sizes):
raise AssertionError(
f"The number of subcomponents ({len(self.subcomp_sizes)}) is not equal to the"
f" number of subpopulations ({len(self.subpop_sizes)}). Check parameters "
"'subcomp_sizes' and 'subpop_sizes' in the configuration file."
)
# Evaluation mode
self.eval_mode = self.data.splitter_type
# Configuration parameters
self.conf = conf
# Initializes the components of the cooperative co-evolutionary algorithm
self._init_evaluator()
self._init_decomposer()
self._init_collaborator()
# List to store the best global fitness in each generation
self.convergence_curve = list()
# List to store the best context vector in each generation
self.best_context_vectors = list()
# Maximum number of best context vectors to store
self.max_best_context_vectors = conf["coevolution"].get("max_best_context_vectors")
# Optional memory profiling
self.memory_profile = conf["coevolution"].get("memory_profile", False)
self.memory_log_every = conf["coevolution"].get("memory_log_every", 0)
# Optional parallel evaluation workers
        self.evaluation_workers = conf.get("evaluation", {}).get("n_workers", 1)
self._fitness_fn_tls = threading.local()
# Initialize logger with info level
logging.basicConfig(encoding="utf-8", level=logging.INFO)
# Reset handlers
logging.getLogger().handlers = []
# Add a custom handler
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(message)s'))
logging.getLogger().addHandler(handler)
@abstractmethod
def _init_decomposer(self):
"""Instantiate feature grouping method."""
pass
@abstractmethod
def _init_evaluator(self):
"""Instantiate evaluation method."""
pass
@abstractmethod
def _init_collaborator(self):
"""Instantiate collaboration method."""
pass
@abstractmethod
def _init_subpop_initializer(self):
"""Instantiate subpopulation initialization method."""
pass
@abstractmethod
def _init_optimizers(self):
"""Instantiate evolutionary algorithms to evolve each subpopulation."""
pass
@abstractmethod
def optimize(self):
"""Solve the feature selection problem through optimization."""
pass
def _get_best_individuals(self,
subpops: list,
fitness: list,
context_vectors: list):
"""
Get the best individual from each subpopulation.
Parameters
----------
subpops: list
            Individuals from all subpopulations. Each individual is a binary
            n-dimensional array, where n is the number of features: a 1 in the i-th
            position indicates that the i-th feature is selected, and a 0 indicates
            that it is not.
fitness: list
Evaluation of all context vectors from all subpopulations.
context_vectors: list
Complete problem solutions.
Returns
-------
current_best: dict
Current best individual of each subpopulation and its respective evaluation.
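            For example, ``current_best[0]`` has the form
            ``{"individual": np.ndarray, "context_vector": np.ndarray, "fitness": float}``.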
"""
# Current best individual of each subpopulation
current_best = dict()
# Number of subpopulations
n_subpops = len(subpops)
# For each subpopulation
for i in range(n_subpops):
best_ind_idx = np.argmax(fitness[i])
current_best[i] = dict()
current_best[i]["individual"] = subpops[i][best_ind_idx].copy()
current_best[i]["context_vector"] = context_vectors[i].copy()
current_best[i]["fitness"] = fitness[i][best_ind_idx]
return current_best
def _get_global_best(self):
"""Get the globally best context vector."""
best_idx = np.argmax([best["fitness"] for best in self.current_best.values()])
best_fitness = self.current_best[best_idx]["fitness"]
best_context_vector = self.current_best[best_idx]["context_vector"].copy()
return best_context_vector, best_fitness
def _record_best_context_vector(self, context_vector: np.ndarray) -> None:
"""Store best context vectors while keeping memory bounded."""
self.best_context_vectors.append(context_vector.copy())
        max_keep = self.max_best_context_vectors
        if max_keep is None or not isinstance(max_keep, int):
            # No valid limit configured: keep every recorded context vector
            return
        if max_keep > 0:
            # Keep only the 'max_keep' most recent context vectors
            if len(self.best_context_vectors) > max_keep:
                self.best_context_vectors = self.best_context_vectors[-max_keep:]
        else:
            # A non-positive limit disables storage altogether
            self.best_context_vectors.clear()
    def _get_rss_mb(self) -> Optional[float]:
        """Get the resident set size (RSS) in megabytes (MB), or None when unavailable."""
if os.name != "posix":
return None
try:
with open("/proc/self/status", "r", encoding="utf-8") as status_file:
for line in status_file:
if line.startswith("VmRSS:"):
parts = line.split()
return round(int(parts[1]) / 1024, 2) # Convert from kB to MB
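                    # Worked example: a status line "VmRSS:    123456 kB" yields
                    # 123456 / 1024 = 120.56 MB after rounding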
        except OSError:
return None
return None
def _evaluate_context_vectors(self, context_vectors: list) -> list:
"""Evaluate a batch of context vectors with optional parallel workers."""
def _get_local_fitness_fn():
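            # Each worker thread lazily gets its own fitness function: when the
            # evaluator exposes a 'clone' method, a per-thread copy is created so
            # that stateful evaluators are not shared across threads; otherwise
            # the single shared instance is reused.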
fitness_fn = getattr(self._fitness_fn_tls, "fitness_fn", None)
if fitness_fn is None:
if hasattr(self.fitness_function, "clone"):
fitness_fn = self.fitness_function.clone()
else:
fitness_fn = self.fitness_function
self._fitness_fn_tls.fitness_fn = fitness_fn
return fitness_fn
if self.evaluation_workers and self.evaluation_workers > 1:
with ThreadPoolExecutor(max_workers=self.evaluation_workers) as executor:
return list(
executor.map(
lambda cv: _get_local_fitness_fn().evaluate(cv, self.data),
context_vectors
)
)
return [_get_local_fitness_fn().evaluate(cv, self.data) for cv in context_vectors]
def _sum_nbytes(self, obj) -> int:
"""Recursively sum the number of bytes of a given object and its contents."""
total_size = 0
if isinstance(obj, np.ndarray):
total_size += obj.nbytes
elif isinstance(obj, (list, tuple, set)):
for item in obj:
total_size += self._sum_nbytes(item)
elif isinstance(obj, dict):
for key, value in obj.items():
total_size += self._sum_nbytes(key)
total_size += self._sum_nbytes(value)
return total_size
    def _log_memory_usage(self, stage: str, n_gen: Optional[int] = None) -> None:
        """Log the memory usage of key objects to help find bottlenecks."""
if not self.memory_profile:
return
if n_gen is not None:
if not self.memory_log_every or (n_gen % self.memory_log_every) != 0:
return
rss_mb = self._get_rss_mb()
subpops_mb = round(self._sum_nbytes(getattr(self, "subpops", None)) / (1024 ** 2), 2)
context_vectors_mb = round(self._sum_nbytes(getattr(self, "context_vectors", None)) / (1024 ** 2), 2)
best_context_vectors_mb = round(self._sum_nbytes(getattr(self, "best_context_vectors", None)) / (1024 ** 2), 2)
training_mb = round(self._sum_nbytes(getattr(self.data, "X_train", None)) / (1024 ** 2), 2)
test_mb = round(self._sum_nbytes(getattr(self.data, "X_test", None)) / (1024 ** 2), 2)
raw_training_mb = round(self._sum_nbytes(getattr(self.data, "_raw_X_train", None)) / (1024 ** 2), 2)
message = (
f"[Memory Usage] Stage: {stage} | "
f"{'' if n_gen is None else f' Generation: {n_gen} | '}"
f"RSS: {rss_mb} MB | "
f"Subpopulations: {subpops_mb} MB | "
f"Context Vectors: {context_vectors_mb} MB | "
f"Best Context Vectors: {best_context_vectors_mb} MB | "
f"Training data: {training_mb} MB | "
f"Test data: {test_mb} MB | "
f"Raw Training data: {raw_training_mb} MB"
)
# Log message even if logging is disabled
logger = logging.getLogger()
was_disabled = logger.disabled
logger.disabled = False
logging.info(message)
logger.disabled = was_disabled
def _init_subpopulations(self):
"""Initialize all subpopulations according to their respective sizes."""
# Instantiate subpopulation initialization method
self._init_subpop_initializer()
# Build subpopulations
# Number of subpopulations is equal to the number of subcomponents
self.initializer.build_subpopulations()
# Evaluate all individuals in each subpopulation
# Number of individuals in each subpopulation is in the list of subcomponent sizes
self.initializer.evaluate_individuals()
# Subpopulations
self.subpops = self.initializer.subpops
# Context vectors
self.context_vectors = self.initializer.context_vectors
# Evaluations of context vectors
self.fitness = self.initializer.fitness
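        # Drop the initializer so the memory held by its internals can be reclaimed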
delattr(self, "initializer")
force_memory_release()
def _problem_decomposition(self):
"""Decompose the problem into smaller subproblems."""
# Decompose only once to use the same feature indexes on all k-folds
Xk_train, _, _, _ = self.data.get_fold(0, normalize=False)
_, self.feature_idxs = self.decomposer.decompose(X=Xk_train)
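        # 'feature_idxs' is a permutation of the original feature indices: e.g.,
        # feature_idxs = np.array([2, 0, 1]) would mean the decomposer moved the
        # original third feature to the front, so every matrix below is reordered
        # column-wise to stay aligned with it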
if hasattr(self, "feature_importances"):
# Reorder feature importances according to the shuffling in the feature decomposition
self.feature_importances = self.feature_importances[self.feature_idxs]
# Reorder training set according to the shuffling in the feature decomposition
self.data.X_train = self.data.X_train[:, self.feature_idxs]
# Reorder test set according to the shuffling in the feature decomposition
self.data.X_test = self.data.X_test[:, self.feature_idxs]
# Keep raw training data aligned for fold normalization if it exists
if hasattr(self.data, "_raw_X_train"):
self.data._raw_X_train = self.data._raw_X_train[:, self.feature_idxs]
        # Update 'n_subcomps' in case it was initially None
        self.n_subcomps = self.decomposer.n_subcomps
        # Update 'subcomp_sizes' in case it was initially unset
        self.subcomp_sizes = self.decomposer.subcomp_sizes