GeneticFeatureSelectorNode¶
Whereas the FSSNode selects from a predefined list of feature subsets, the GeneticFeatureSelectorNode uses evolutionary algorithms to optimize a novel subset of features from scratch. This is useful when there is no predefined grouping of features.
To initialize the GeneticFeatureSelectorNode, you simply need to pass in the total number of features (i.e., the number of columns) in your dataset.
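For example, a minimal sketch for a hypothetical dataset with 12 columns:

from tpot2.search_spaces.nodes import GeneticFeatureSelectorNode

gfs_sp = GeneticFeatureSelectorNode(n_features=12)  # one boolean mask entry per column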
For these examples, we create a dummy dataset where the first six columns are informative and the rest are uninformative.
import tpot2
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
from tpot2.search_spaces.nodes import *
from tpot2.search_spaces.pipelines import *
from tpot2.config import get_search_space
X, y = sklearn.datasets.make_classification(n_samples=1000, n_features=6, n_informative=6, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
X = np.hstack([X, np.random.rand(X.shape[0],6)]) #add six uninformative features
X = pd.DataFrame(X, columns=['a','b','c','d','e','f','g','h','i','j','k','l']) # columns a-f are informative; g-l are uninformative
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
X.head()
| | a | b | c | d | e | f | g | h | i | j | k | l |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.557033 | 1.079369 | -0.652366 | -2.345172 | 3.579608 | 1.720050 | 0.500899 | 0.125597 | 0.262117 | 0.726395 | 0.766307 | 0.546374 |
| 1 | -0.775196 | 3.158042 | 0.571959 | -0.783506 | 2.420639 | 1.364403 | 0.318109 | 0.631452 | 0.784186 | 0.105712 | 0.294782 | 0.101737 |
| 2 | 0.243071 | -3.041308 | -0.397162 | 2.781182 | 2.407396 | 0.136103 | 0.290092 | 0.740930 | 0.673398 | 0.267161 | 0.710702 | 0.175107 |
| 3 | 1.389506 | -0.993958 | 0.655330 | 1.831326 | 0.080663 | 0.023581 | 0.962973 | 0.235456 | 0.859480 | 0.256727 | 0.899599 | 0.831491 |
| 4 | -0.024179 | 1.717804 | -1.599907 | 1.917392 | -0.808055 | 1.298912 | 0.590222 | 0.722350 | 0.385797 | 0.130779 | 0.697211 | 0.872331 |
gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1])
Each individual generated from the GeneticFeatureSelectorNode selects its own subset of features:
selector = gfs_sp.generate().export_pipeline()
selector.set_output(transform="pandas") # by default, sklearn selectors return numpy arrays; this makes them return pandas DataFrames
selector.fit(X_train, y_train)
selector.transform(X_train)
| | g |
|---|---|
| 304 | 0.143646 |
| 867 | 0.442287 |
| 596 | 0.140160 |
| 410 | 0.195534 |
| 880 | 0.443872 |
| ... | ... |
| 116 | 0.542799 |
| 298 | 0.301036 |
| 811 | 0.444366 |
| 260 | 0.992575 |
| 422 | 0.373356 |

750 rows × 1 columns
selector = gfs_sp.generate().export_pipeline()
selector.set_output(transform="pandas") # by default, sklearn selectors return numpy arrays; this makes them return pandas DataFrames
selector.fit(X_train, y_train)
selector.transform(X_train)
| | a | e | g | i | j | k | l |
|---|---|---|---|---|---|---|---|
| 304 | 0.386607 | 3.021003 | 0.143646 | 0.826957 | 0.960345 | 0.989469 | 0.142616 |
| 867 | 2.508161 | -0.877686 | 0.442287 | 0.339937 | 0.946761 | 0.186116 | 0.407115 |
| 596 | 0.876675 | 1.185218 | 0.140160 | 0.150879 | 0.512864 | 0.378644 | 0.970835 |
| 410 | 2.201060 | -1.791596 | 0.195534 | 0.089165 | 0.394313 | 0.945995 | 0.396801 |
| 880 | 5.506138 | 0.326471 | 0.443872 | 0.062252 | 0.944865 | 0.525941 | 0.934821 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 116 | 0.031930 | 3.026118 | 0.542799 | 0.624332 | 0.565743 | 0.847792 | 0.720977 |
| 298 | -0.512784 | -2.697913 | 0.301036 | 0.838665 | 0.480591 | 0.803892 | 0.359138 |
| 811 | 2.598525 | 3.216680 | 0.444366 | 0.131156 | 0.499124 | 0.666406 | 0.716766 |
| 260 | 1.777059 | -4.618220 | 0.992575 | 0.547863 | 0.180111 | 0.065575 | 0.322207 |
| 422 | -3.336487 | 2.883106 | 0.373356 | 0.777447 | 0.010616 | 0.889032 | 0.576796 |

750 rows × 7 columns
Mutation and crossover can add or remove individual features from the learned feature set.
selector_ind = gfs_sp.generate()
selector = selector_ind.export_pipeline()
selected_features = X.columns[selector.mask]
print("selected features: ", selected_features)
selected features: Index(['g', 'i'], dtype='object')
selector_ind.mutate()
selector = selector_ind.export_pipeline()
selected_features = X.columns[selector.mask]
print("selected features: ", selected_features)
selected features: Index(['g', 'i'], dtype='object')
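Crossover mixes the masks of two individuals. Below is a minimal sketch; it assumes the individual exposes a crossover method that takes another individual, mirroring the mutate call above, and that the resulting mask may be unchanged if the parents are too similar.

ind1 = gfs_sp.generate()
ind2 = gfs_sp.generate()
print("parent 1 features: ", X.columns[ind1.export_pipeline().mask])
print("parent 2 features: ", X.columns[ind2.export_pipeline().mask])
ind1.crossover(ind2) # assumed API: modifies ind1 in place using features from ind2
print("child features: ", X.columns[ind1.export_pipeline().mask])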
Training¶
gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1])
classifiers_sp = get_search_space('RandomForestClassifier')
final_classification_search_space = SequentialPipeline([gfs_sp, classifiers_sp])
est = tpot2.TPOTEstimator( population_size=32,
generations=10,
scorers=["roc_auc_ovr", tpot2.objectives.complexity_scorer],
scorers_weights=[1.0, -1.0],
n_jobs=32,
classification=True,
search_space = final_classification_search_space,
verbose=1,
)
scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
est.fit(X_train, y_train)
print(scorer(est, X_test, y_test))
Generation: 100%|██████████| 10/10 [00:45<00:00, 4.59s/it]
0.9023717948717949
est.fitted_pipeline_
Pipeline(steps=[('maskselector', MaskSelector(mask=array([ True, True, True, True, True, True, True, True, False, True, False, False]))), ('randomforestclassifier', RandomForestClassifier(criterion='entropy', max_features=0.2579898849876, min_samples_leaf=5, min_samples_split=8, n_estimators=128))])
selected_features = X.columns[est.fitted_pipeline_.steps[0][1].mask]
print("selected features: ", selected_features)
selected features: Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j'], dtype='object')
Custom objective function to minimize number of selected features¶
We can create a custom objective function that returns the number of features selected per pipeline. The other_objective_functions parameter is for objective functions that do not require fitted pipelines and do not require cross validation. Since the selector gets its feature mask from its parameters rather than through fitting, we can pass this objective through the other_objective_functions parameter.
We set the weight to -1 because we want to minimize the number of selected features. We also give the objective a name so that we can more easily access it in the evaluated_individuals dataframe.
def number_of_selected_features(est):
    # count the True entries in the MaskSelector's boolean mask
    return sum(est.steps[0][1].mask)
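As a quick sanity check, we can apply this objective to the pipeline fitted in the Training section above, which selected nine features:

print(number_of_selected_features(est.fitted_pipeline_)) # counts True entries in the selector's mask; 9 for the pipeline above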
gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1])
classifiers_sp = get_search_space('RandomForestClassifier')
final_classification_search_space = SequentialPipeline([gfs_sp, classifiers_sp])
est = tpot2.TPOTEstimator(
population_size=32,
generations=10,
scorers=["roc_auc_ovr", tpot2.objectives.complexity_scorer],
scorers_weights=[1.0, -1.0],
other_objective_functions=[number_of_selected_features],
other_objective_functions_weights = [-1],
objective_function_names = ["Number of selected features"],
n_jobs=32,
classification=True,
search_space = final_classification_search_space,
verbose=2,
)
scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
est.fit(X_train, y_train)
print(scorer(est, X_test, y_test))
Generation: 100%|██████████| 10/10 [00:50<00:00, 5.04s/it]
0.926923076923077
selected_features = X.columns[est.fitted_pipeline_.steps[0][1].mask]
print("selected features: ", selected_features)
selected features: Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')
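The named objective is recorded as a column of the evaluated_individuals dataframe alongside the scorer results, so we can inspect it directly (the exact set of auxiliary columns may vary by TPOT2 version):

df = est.evaluated_individuals
# each row is one evaluated pipeline; our custom objective appears under
# the name we passed in objective_function_names
print(df[["roc_auc_score", "Number of selected features", "Pareto_Front"]].head())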
import seaborn as sns
import matplotlib.pyplot as plt
df = est.evaluated_individuals
col1 = "Number of selected features"
col2 = "roc_auc_score"
# Multiple orange dots show because the pareto front in this case is actually 3D along the auroc score, number of features, and complexity.
fig, ax = plt.subplots(figsize=(5,5))
sns.scatterplot(df[df['Pareto_Front']!=1], x=col1, y=col2, label='other', ax=ax)
sns.scatterplot(df[df['Pareto_Front']==1], x=col1, y=col2, label='Pareto Front', ax=ax)
ax.title.set_text('Performance of all pipelines')
#log scale y
ax.set_yscale('log')
plt.show()
fig, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(df[df['Pareto_Front']==1], x=col1, y=col2, label='Pareto Front', ax=ax)
ax.title.set_text('Performance of only the Pareto Front')
plt.show()
Other Examples¶
As with all search spaces, GeneticFeatureSelectorNode can be combined with any other search space.
You can also pair this with the existing prebuilt templates, for example:
linear_search_space = tpot2.config.template_search_spaces.get_template_search_spaces("linear", classification=True)
gfs_and_linear_search_space = SequentialPipeline([gfs_sp, linear_search_space])
# est = tpot2.TPOTEstimator(
# population_size=32,
# generations=10,
# scorers=["roc_auc_ovr", tpot2.objectives.complexity_scorer],
# scorers_weights=[1.0, -1.0],
# other_objective_functions=[number_of_selected_features],
# other_objective_functions_weights = [-1],
# objective_function_names = ["Number of selected features"],
# n_jobs=32,
# classification=True,
# search_space = gfs_and_linear_search_space,
# verbose=2,
# )
gfs_and_linear_search_space.generate(rng=1).export_pipeline()
Pipeline(steps=[('maskselector', MaskSelector(mask=array([False, False, True, False, False, False, False, False, False, True, False, False]))), ('pipeline', Pipeline(steps=[('normalizer', Normalizer(norm='l1')), ('selectpercentile', SelectPercentile(percentile=74.2561844719571)), ('featureunion-1', FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('binarizer', Binarizer(threshold=0.0935770250992))])), ('passthrough', Passthrough())])), ('featureunion-2', FeatureUnion(transformer_list=[('skiptransformer', SkipTransformer()), ('passthrough', Passthrough())])), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME', learning_rate=0.9665397922726, n_estimators=320))]))])
Getting Fancy¶
If you want to get fancy, you can combine more search spaces in order to set up unique preprocessing pipelines per feature set. Here's an example:
dynamic_transformers = DynamicUnionPipeline(get_search_space("all_transformers"), max_estimators=4)
dynamic_transformers_with_passthrough = tpot2.search_spaces.pipelines.UnionPipeline([
dynamic_transformers,
tpot2.config.get_search_space("Passthrough")],
)
multi_step_engineering = DynamicLinearPipeline(dynamic_transformers_with_passthrough, max_length=4)
gfs_engineering_search_space = SequentialPipeline([gfs_sp, multi_step_engineering])
union_fss_engineering_search_space = DynamicUnionPipeline(gfs_engineering_search_space)
classification_search_space = get_search_space('classifiers')
final_fancy_search_space = SequentialPipeline([union_fss_engineering_search_space, classification_search_space])
final_fancy_search_space.generate(rng=1).export_pipeline()
Pipeline(steps=[('featureunion', FeatureUnion(transformer_list=[('pipeline', Pipeline(steps=[('maskselector', MaskSelector(mask=array([False, True, False, False, False, False, False, False, True, False, False, False]))), ('pipeline', Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('robustscaler', RobustScaler(quantile_range=(0.1874078711948, 0.7642865555088))), ('columnonehotencoder', ColumnOneHotEncoder())])), ('passthrough', Passthrough())])), ('featureunion-2', FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('nystroem', Nystroem(gamma=0.3428025665559, kernel='linear', n_components=88))])), ('passthrough', Passthrough())]))]))]))])), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME', learning_rate=0.9665397922726, n_estimators=320))])