Skip to content

Feature set selector

This file is part of the TPOT library.

TPOT was primarily developed at the University of Pennsylvania by Randal S. Olson (rso@randalolson.com), Weixuan Fu (weixuanf@upenn.edu), Daniel Angell (dpa34@drexel.edu), and many more generous open source contributors.

TPOT is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

TPOT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License along with TPOT. If not, see http://www.gnu.org/licenses/.

FeatureSetSelector

Bases: BaseEstimator, SelectorMixin

Select predefined feature subsets.

Source code in tpot2/builtin_modules/feature_set_selector.py
class FeatureSetSelector(BaseEstimator, SelectorMixin):
    """Select a predefined subset of features.

    The subset is supplied at construction time (column names for a pandas
    DataFrame, column indexes otherwise). ``fit`` only translates that subset
    into a boolean support mask; nothing is learned from the data values.
    """

    def __init__(self, sel_subset=None, name=None):
        """Create a FeatureSetSelector object.

        Parameters
        ----------
        sel_subset: list, int or str
            Features to keep.
            If X is a dataframe, items in sel_subset must correspond to column names.
            If X is a numpy array, items in sel_subset must correspond to column indexes.
            A bare int or str is treated as a one-element list.
            Must be set before calling ``fit``; the default ``None`` is rejected there.
        name: object, optional
            Stored unchanged on the instance; no method of this class reads it.
        """
        self.name = name
        self.sel_subset = sel_subset

    def fit(self, X, y=None):
        """Fit FeatureSetSelector for feature selection.

        Builds ``self.feat_list_idx`` (sorted positional indexes of the selected
        columns), ``self.mask`` (boolean array of length n_features) and
        ``self.feature_names_in_`` (column names for a DataFrame, else ``None``).
        The constructor parameter ``sel_subset`` is not modified.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            The training input samples. Only its column layout is used.
        y: array-like, shape (n_samples,)
            Ignored; accepted for API compatibility.

        Returns
        -------
        self: object
            The fitted estimator itself (not a copy).

        Raises
        ------
        TypeError
            If ``sel_subset`` is ``None``.
        ValueError
            If X is a DataFrame and ``sel_subset`` names a column X does not have.
        """
        subset = self.sel_subset
        if subset is None:
            raise TypeError(
                "sel_subset must be a list of features, an int or a str; got None"
            )

        # Normalise into a local list so the constructor parameter stays
        # exactly as the caller passed it (sklearn estimators must not
        # rewrite their hyper-parameters during fit).
        if isinstance(subset, (int, str, np.integer)):
            subset = [subset]
        else:
            subset = list(subset)

        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.tolist()
            # Map each name to its first position: same result as
            # list.index(), without rescanning the columns per feature.
            first_position = {}
            for position, column in enumerate(self.feature_names_in_):
                first_position.setdefault(column, position)
            missing = [feat for feat in subset if feat not in first_position]
            if missing:
                raise ValueError(
                    f"sel_subset contains column names not present in X: {missing}"
                )
            self.feat_list_idx = sorted(first_position[feat] for feat in subset)
        else:
            # Any non-DataFrame input is addressed by positional index, so
            # feat_list_idx is always (re)computed on every fit.
            self.feature_names_in_ = None
            self.feat_list_idx = sorted(subset)

        # np.shape also handles inputs without a .shape attribute (e.g. lists).
        n_features = np.shape(X)[1]
        self.mask = np.zeros(n_features, dtype=bool)
        # An empty list would become a float array, which cannot be used as an
        # index; an empty subset simply leaves every feature unselected.
        if self.feat_list_idx:
            self.mask[np.asarray(self.feat_list_idx)] = True

        return self

    # TODO: consider overriding transform so a DataFrame input yields a
    # DataFrame output; this may not be consistent with sklearn.

    def _get_support_mask(self):
        """
        Get the boolean mask indicating which features are selected
        Returns
        -------
        support : boolean array of shape [# input features]
            An element is True iff its corresponding feature is selected for
            retention.
        """
        return self.mask

__init__(sel_subset=None, name=None)

Create a FeatureSetSelector object.

Parameters:

Name Type Description Default
sel_subset

A list of features to select, or a single int or str. If X is a dataframe, items in the sel_subset list must correspond to column names. If X is a numpy array, items in the sel_subset list must correspond to column indexes. A bare int is the index of a single column.

None

Returns:

Type Description
None
Source code in tpot2/builtin_modules/feature_set_selector.py
def __init__(self, sel_subset=None, name=None):
    """Record the selector's configuration on the instance.

    Nothing is validated or computed here; both arguments are stored as given.

    Parameters
    ----------
    sel_subset: list or int
        If X is a dataframe, items in sel_subset list must correspond to column names
        If X is a numpy array, items in sel_subset list must correspond to column indexes
        int: index of a single column
    name: object, optional
        Stored as ``self.name`` without modification.

    Returns
    -------
    None

    """
    self.name, self.sel_subset = name, sel_subset

fit(X, y=None)

Fit FeatureSetSelector for feature selection

Parameters:

Name Type Description Default
X

The training input samples.

required
y

The target values (integers that correspond to classes in classification, real numbers in regression).

None

Returns:

Name Type Description
self object

Returns the fitted estimator itself (not a copy)

Source code in tpot2/builtin_modules/feature_set_selector.py
def fit(self, X, y=None):
    """Fit FeatureSetSelector for feature selection.

    Builds ``self.feat_list_idx`` (sorted positional indexes of the selected
    columns), ``self.mask`` (boolean array of length n_features) and
    ``self.feature_names_in_`` (column names for a DataFrame, else ``None``).
    The constructor parameter ``sel_subset`` is not modified.

    Parameters
    ----------
    X: array-like of shape (n_samples, n_features)
        The training input samples. Only its column layout is used.
    y: array-like, shape (n_samples,)
        Ignored; accepted for API compatibility.

    Returns
    -------
    self: object
        The fitted estimator itself (not a copy).

    Raises
    ------
    TypeError
        If ``sel_subset`` is ``None``.
    ValueError
        If X is a DataFrame and ``sel_subset`` names a column X does not have.
    """
    subset = self.sel_subset
    if subset is None:
        raise TypeError(
            "sel_subset must be a list of features, an int or a str; got None"
        )

    # Normalise into a local list so the constructor parameter stays exactly
    # as the caller passed it (sklearn estimators must not rewrite their
    # hyper-parameters during fit).
    if isinstance(subset, (int, str, np.integer)):
        subset = [subset]
    else:
        subset = list(subset)

    if isinstance(X, pd.DataFrame):
        self.feature_names_in_ = X.columns.tolist()
        # Map each name to its first position: same result as list.index(),
        # without rescanning the columns per feature.
        first_position = {}
        for position, column in enumerate(self.feature_names_in_):
            first_position.setdefault(column, position)
        missing = [feat for feat in subset if feat not in first_position]
        if missing:
            raise ValueError(
                f"sel_subset contains column names not present in X: {missing}"
            )
        self.feat_list_idx = sorted(first_position[feat] for feat in subset)
    else:
        # Any non-DataFrame input is addressed by positional index, so
        # feat_list_idx is always (re)computed on every fit.
        self.feature_names_in_ = None
        self.feat_list_idx = sorted(subset)

    # np.shape also handles inputs without a .shape attribute (e.g. lists).
    n_features = np.shape(X)[1]
    self.mask = np.zeros(n_features, dtype=bool)
    # An empty list would become a float array, which cannot be used as an
    # index; an empty subset simply leaves every feature unselected.
    if self.feat_list_idx:
        self.mask[np.asarray(self.feat_list_idx)] = True

    return self