Source code for bca.bca

"""
Binary Coordinate Ascent algorithm for feature subset selection

Authors: Amin Zarshenas <mzarshen@hawk.iit.edu>
         Vijay Srinivas Tida
         Kenji Suzuki
"""

import numpy as np
from sklearn.utils import check_X_y
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted


class BCA(BaseEstimator):
    """Feature Selection with Binary Coordinate Ascent Algorithm

    Given an external estimator, the goal of the binary coordinate ascent
    (BCA) algorithm is to select features which maximize an objective
    function of an estimator. It returns a binary vector with its size equal
    to the number of features, where zero or one indicates a feature at that
    position is not selected, or selected, respectively.

    First the best feature subset is initialized (specified as a binary
    vector). The default initialization is the vector of all zeros,
    corresponding to no input features selected. The corresponding objective
    function of a specified estimator is then calculated for the initial
    subset. The BCA algorithm then iteratively selects or removes features,
    one at a time, by flipping the binary elements of the binary vector of
    features, and examines whether the selection/removal increases the
    objective function. The process is repeated over this vector until a
    full pass accepts no flip or improves the objective by less than
    ``delta``. The algorithm returns a binary vector corresponding to the
    "best" subset of features.

    Read more in the reference link specified below:
    http://www.sciencedirect.com/science/article/pii/S0950705116302416

    Parameters
    ----------
    estimator : object
        A supervised learning estimator with a ``fit`` method that will be
        used along with an objective function, in order to calculate the
        importance of a feature subset.

    scoring : string
        The metric to be used as objective to be maximized, e.g., roc_auc,
        accuracy, etc. Note: at the moment sklearn cross_val_score inside
        the BCA class supports binary classification only for roc_auc.

    cv : int, cross-validation generator or an iterable, optional
        The cv parameter used inside the sklearn cross_val_score.

    delta : float
        The delta used to determine the convergence of the objective
        function.

    Attributes
    ----------
    features : ndarray of int
        Indices of the selected features. Set by ``fit``.

    score : float
        Mean cross-validation score of the selected subset. Set by ``fit``.

    Examples
    --------
    The following example shows how to select the optimal subset of
    features in the breast cancer dataset.

    >>> from bca import BCA
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.naive_bayes import GaussianNB
    >>> X, y = load_breast_cancer().data, load_breast_cancer().target
    >>> estimator = GaussianNB()
    >>> selector = BCA(estimator, scoring='accuracy', cv=5)
    >>> selector = selector.fit(X, y)
    >>> selector.features
    [ 1 4 6 7 16 20 21 22 23 27 28]
    >>> selector.score
    0.971989226626
    >>> selector.predict(X[20:25])
    [1 1 0 0 0]
    """

    def __init__(self, estimator, scoring='accuracy', cv=5, delta=10**-5):
        self.estimator = estimator
        self.scoring = scoring
        self.cv = cv
        self.delta = delta

    @property
    def _estimator_type(self):
        # Exposed as a property because scikit-learn reads this name as an
        # attribute; a plain method would never compare equal to a type
        # string such as "classifier".
        return self.estimator._estimator_type

    def fit(self, X, y, initial_subset=None, fit_estimator=True,
            verbose=True):
        """Fit the BCA model to find the best subset of features, and
        potentially fit the estimator on the best subset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape=[n_samples,n_features]
            The training input samples.

        y : array-like, shape = [n_samples]
            The target values.

        initial_subset : binary vector, shape=[n_features]
            The initial subset. Default to all zeros ("None").

        fit_estimator : boolean,
            Indicates to fit the estimator on the final features or not.

        verbose : boolean
            Indicates the verbosity of the algorithm.

        Returns
        -------
        self : class object
            The BCA object with trained classifier

        Raises
        ------
        ValueError
            If ``initial_subset`` does not have shape ``(n_features,)``, or
            if ``fit_estimator`` is true and no feature ends up selected.
        """
        return self._fit(X, y, initial_subset, fit_estimator, verbose)

    def _fit(self, X, y, initial_subset, fit_estimator, verbose):
        """Run the coordinate-ascent search; see ``fit`` for the contract."""
        X, y = check_X_y(X, y, "csc")
        n_features = X.shape[1]

        # scoring function
        def scorer(features):
            return self._scorer(X, y, features, verbose=verbose)

        # initialization: the subset is held as a boolean mask so that it
        # can be used directly for column indexing, whatever the caller
        # passed in (list, 0/1 ints, floats or booleans)
        if initial_subset is None:
            features = np.zeros(n_features, dtype=bool)
        else:
            features = np.asarray(initial_subset) == 1
            if features.shape != (n_features,):
                raise ValueError(
                    "initial_subset must be a binary vector of shape "
                    "({0},); got shape {1}".format(n_features,
                                                   features.shape))

        iteration = 0
        score = scorer(features)
        if verbose:
            print("Iteration {0} starts...".format(iteration))
            self._print_features(features, score)

        # main BCA loop of feature selection
        if verbose:
            print("\nBCA algorithm starts...")
        stop = False
        while not stop:
            iteration += 1
            if verbose:
                print("\nIteration {0} starts...".format(iteration))
            score_best = score
            improved = False

            # one iteration over all features
            for i in range(n_features):
                # flip one of the features to create a new subset
                features_trial = features.copy()
                features_trial[i] = not features_trial[i]
                score_trial = scorer(features_trial)

                # keep the new subset only if it is strictly better
                if score_trial > score:
                    features = features_trial
                    score = score_trial
                    improved = True
                    if verbose:
                        self._print_features(features, score)

            # Converged when the pass accepted no flip or gained less than
            # delta. The explicit ``improved`` test matters: with no
            # accepted flip the gain can be NaN (-inf minus -inf, or NaN
            # scores), and a NaN comparison alone would never end the loop.
            if not improved or score - score_best < self.delta:
                stop = True

        if verbose:
            print("\nBCA algorithm finished...")

        # fit a final estimator on the entire dataset
        if fit_estimator:
            if not features.any():
                raise ValueError(
                    "BCA did not select any feature, so the final "
                    "estimator cannot be fitted.")
            if verbose:
                print("\nFitting the final estimator...")
            self.estimator.fit(X[:, features], y)

        # set final attributes
        self.features = np.arange(n_features)[features]
        self.score = score

        return self

    def predict(self, X):
        """Reduce X to the selected features and then predict using the
        underlying estimator.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples.

        Returns
        -------
        y : array of shape [n_samples]
            The predicted target values.
        """
        # ``features`` only exists after ``fit``; ``estimator`` is assigned
        # in ``__init__`` and therefore cannot signal fitted state.
        check_is_fitted(self, 'features')
        return self.estimator.predict(X[:, self.features])

    def _scorer(self, X, y, features, verbose):
        """Return the mean cross-validation score of a feature subset.

        ``features`` is a binary vector over the columns of ``X``. An empty
        subset scores ``-inf`` so that any scored subset compares higher.
        """
        mask = np.asarray(features) == 1
        if np.sum(mask) > 0:
            return np.mean(cross_val_score(self.estimator, X[:, mask], y,
                                           scoring=self.scoring,
                                           cv=self.cv, verbose=verbose))
        else:
            return -np.inf

    def _print_features(self, features, score):
        """Print the indices of the selected features and their score."""
        mask = np.asarray(features) == 1
        features = np.arange(len(mask))[mask]
        print("best feature set so far is {0} with score = {1}".
              format(features, score))