"""
Base class for all selectors.
"""
from abc import ABCMeta, abstractmethod
from functools import reduce
from typing import List, Optional, Type, Union

from numpy.random import RandomState
from pandas import DataFrame, Series
from shap import Explainer
class BaseSelector(metaclass=ABCMeta):
    """Abstract base class for all selectors in shapicant.

    Args:
        estimator: A supervised learning estimator with a 'fit' method.
        explainer_type: A SHAP explainer type.
        n_iter: The number of iterations to perform.
        verbose: Controls verbosity of output.
        random_state: Parameter to control the random number generator used.

    Attributes:
        p_values_ (Series): Series containing the empirical p-values of the
            features. It is ``None`` right after construction and is expected
            to be populated by 'fit'.
    """

    def __init__(
        self,
        estimator: object,
        explainer_type: Type[Explainer],
        n_iter: int = 100,
        verbose: Union[int, bool] = 1,
        random_state: Optional[Union[int, RandomState]] = None,
    ) -> None:
        """Store the constructor arguments and reset the fitted state."""
        self.estimator = estimator
        self.explainer_type = explainer_type
        self.n_iter = n_iter
        self.verbose = verbose
        self.random_state = random_state
        # Fitted state: both stay None until set elsewhere (presumably by the
        # concrete 'fit' implementations -- not visible in this class).
        self.p_values_ = None
        self._n_outputs = None

    @abstractmethod
    def fit(self, *args, **kwargs):
        """
        Abstract 'fit' method.
        """

    def get_features(self, alpha: float = 0.05) -> List[object]:
        """Get a list of the features selected.

        Args:
            alpha: Level at which the empirical p-values will get rejected.

        Returns:
            The list of features with a p-value <= alpha.

        Raises:
            AttributeError: If 'p_values_' has not been set yet.
        """
        # Check if 'fit' has been called before using this method
        self._check_is_fitted()
        # Select features with a p-value <= alpha
        selected_features = self.p_values_.index[self.p_values_ <= alpha].tolist()
        if not selected_features:
            print("No features were selected: either the data is too noisy or alpha too low.")
        return selected_features

    def _check_is_fitted(self) -> None:
        """Raise AttributeError if 'p_values_' is still None."""
        if self.p_values_ is None:
            raise AttributeError(
                "This instance is not fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

    def _validate_params(self) -> None:
        """Raise ValueError if 'n_iter' is lower than 10."""
        if self.n_iter < 10:
            raise ValueError("n_iter must be greater than or equal to 10.")

    def _compute_p_values(
        self,
        true_pos_shap_values: List[Series],
        null_pos_shap_values: List[DataFrame],
        true_neg_shap_values: List[Series],
        null_neg_shap_values: List[DataFrame],
    ) -> Series:
        """Compute empirical p-values from true and null SHAP values.

        For each of the first 'self._n_outputs' entries, a null value is
        flagged when it is >= the matching true positive value or <= the
        matching true negative value (aligned on the index). Flags are AND-ed
        across outputs, counted along axis 1, and turned into
        ``(count + 1) / (n_iter + 1)``.

        Args:
            true_pos_shap_values: Per-output true positive SHAP values.
            null_pos_shap_values: Per-output null positive SHAP values. They
                must be two-dimensional because the result is summed along
                axis 1 (presumably features as rows and iterations as
                columns -- confirm against the callers).
            true_neg_shap_values: Per-output true negative SHAP values.
            null_neg_shap_values: Per-output null negative SHAP values, same
                layout as 'null_pos_shap_values'.

        Returns:
            The empirical p-values, one per row of the null frames.
        """
        flagged = []
        for i in range(self._n_outputs):
            pos_hits = null_pos_shap_values[i].ge(true_pos_shap_values[i], axis=0)
            neg_hits = null_neg_shap_values[i].le(true_neg_shap_values[i], axis=0)
            flagged.append(pos_hits | neg_hits)
        # A null entry only counts when it is flagged for every output.
        counts = reduce(lambda df_0, df_1: df_0 & df_1, flagged).sum(axis=1)
        # The +1 terms keep the empirical p-value strictly above zero.
        p_values = (counts + 1) / (self.n_iter + 1)
        return p_values