Source code for pyccea.fitness.distance

import numpy as np
from ..utils.datasets import DataLoader
from ..evaluation.wrapper import WrapperEvaluation
from ..fitness.function import WrapperFitnessFunction


[docs] class DistanceBasedFitness(WrapperFitnessFunction): """ Objective function that maximizes balanced accuracy based on a k-nearest neighbors classifier. The fitness function is designed as a three-objective optimization, aimed at achieving a balance between maximizing balanced accuracy while simultaneously minimizing the average distance between instances sharing the same label and maximizing the average distance between instances with different labels. Firouznia, Marjan, Pietro Ruiu, and Giuseppe A. Trunfio. "Adaptive cooperative coevolutionary differential evolution for parallel feature selection in high-dimensional datasets." The Journal of Supercomputing (2023): 1-30. Attributes ---------- w1: float Predictive performance weight. w2: float Weight of the complement of the average distance between instances and their neighbors with the same class. w3: float Weight of the average distance between instances and their neighbors of different classes. """ def __init__(self, evaluator: WrapperEvaluation, weights: list): super().__init__(evaluator) # Check the number of weights if len(weights) != 3: raise AssertionError( f"'{DistanceBasedFitness.__name__}' fitness function has only three components. " "Therefore, it requires only three weights." ) # Check the sum of the weights if sum(weights) != 1: raise AssertionError( f"The sum of weights is {sum(weights)} but must be 1." ) self.w1 = weights[0] self.w2 = weights[1] self.w3 = weights[2] def _compute_distances(self, data: DataLoader): """ Calculate the average distances between instances and their neighbors with the same class and their neighbors with different class. """ avg_distances_same_label = list() avg_distances_diff_label = list() # If no feature is selected, no estimator has been trained if not self.evaluator.estimators: return 0, 0 # For all estimator for estimator in self.evaluator.estimators: # Get indices and distances between neighbors distances, indices = estimator.kneighbors( X=data.X_train, n_neighbors=estimator.n_neighbors+1, # Neighbors other than the i-th instance itself return_distance=True ) sum_distances_same_label = list() sum_distances_diff_label = list() n_neighbors_same_label = 0 n_neighbors_diff_label = 0 # For all training instance for i in range(data.train_size): # Neighbors with the same label same_label_indices = np.where(data.y_train[indices[i]] == data.y_train[i])[0] sum_ith_distance_same_label = np.sum(distances[i][same_label_indices]) n_neighbors_same_label += len(same_label_indices) - 1 # Neighbors other than the i-th instance itself sum_distances_same_label.append(sum_ith_distance_same_label) # Neighbors with different labels diff_label_indices = np.where(data.y_train[indices[i]] != data.y_train[i])[0] sum_ith_distance_diff_label = np.sum(distances[i][diff_label_indices]) n_neighbors_diff_label += len(diff_label_indices) sum_distances_diff_label.append(sum_ith_distance_diff_label) # Average distances for each estimator avg_distances_same_label.append(np.sum(sum_distances_same_label)/n_neighbors_same_label) avg_distances_diff_label.append(np.sum(sum_distances_diff_label)/n_neighbors_diff_label) # Mean average distances calculated across all estimators mean_avg_distance_same_label = np.mean(avg_distances_same_label) mean_avg_distance_diff_label = np.mean(avg_distances_diff_label) return mean_avg_distance_same_label, mean_avg_distance_diff_label
[docs] def evaluate(self, context_vector: np.ndarray, data: DataLoader): """ Evaluate the given context vector using the fitness function. Parameters ---------- context_vector: np.ndarray Solution of the complete problem. data: DataLoader Container with process data and training and test sets. Returns ------- fitness: float Quality of the context vector. """ sqrt_n_selected_features = np.sqrt(context_vector.sum()) evaluations = self._evaluate_predictive_performance(context_vector, data) evaluation = evaluations[self.evaluator.eval_function] mean_avg_distance_same_label, mean_avg_distance_diff_label = self._compute_distances(data) # Since we are maximizing: # - For regression: invert the evaluation (lower error is better) # - For classification: use evaluation directly (higher accuracy is better) sign = -1 if self.evaluator.task == "regression" else 1 fitness = ( sign * self.w1 * evaluation + self.w2 * (mean_avg_distance_diff_label/sqrt_n_selected_features) + self.w3 * (1 - (mean_avg_distance_same_label/sqrt_n_selected_features)) ) return fitness