AMLTK Search Space Parser Example
AMLTK (https://github.com/automl/amltk) is a framework for developing AutoML systems. One of its components is an API for defining search spaces.
TPOT2 provides a function called tpot2.utils.tpot2_parser, which converts a search space defined with the AMLTK API into the search space class used by TPOT2. This allows users to define a single search space that can be used by both libraries, facilitating better comparisons. Below are a few example search spaces defined in AMLTK, along with how to use them in TPOT2.
Note: this feature is still experimental and not all features present in the AMLTK API are fully supported in TPOT2 yet. (For example, automated splitting based on categorical vs numeric with amltk.pipeline.Split is not currently implemented in the parser.)
from sklearn.compose import make_column_selector
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from amltk.pipeline import Choice, Component, Sequential, Split
import tpot2
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
import tpot2
import numpy as np
import sklearn
import sklearn.datasets
import pandas as pd
# Build a small dummy pandas dataset with both numerical and categorical columns.
# Imported explicitly: a bare `import sklearn` does not guarantee that the
# `sklearn.model_selection` submodule has been loaded.
from sklearn.model_selection import train_test_split

X, y = sklearn.datasets.make_classification(
    n_samples=100,
    n_features=5,
    n_informative=3,
    n_classes=2,
    random_state=42,
)
X = pd.DataFrame(X, columns=[f"num_{i}" for i in range(5)])

# Add 5 categorical columns. A seeded generator keeps the example reproducible,
# matching the fixed random_state used for the numerical features above.
rng = np.random.default_rng(42)
for i in range(5):
    X[f"cat_{i}"] = rng.choice(["A", "B", "C"], size=len(X))

# Hold out half of the rows for testing; seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)
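# TODO: the TPOT2 parser does not yet support selecting columns through Split's `config` argument, as used in the commented-out AMLTK example below.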
# select_categories = make_column_selector(dtype_include=object)
# select_numerical = make_column_selector(dtype_include=np.number)
# split_imputation = Split(
# {
# "categories": [SimpleImputer(strategy="constant", fill_value="missing"), OneHotEncoder(drop="first")],
# "numerics": Component(SimpleImputer, space={"strategy": ["mean", "median"]}),
# },
# config={"categories": select_categories, "numerics": select_numerical}, #not yet supported
# name="feature_preprocessing",
# )
# split_imputation
# Column selectors that pick features by dtype.
select_categories = make_column_selector(dtype_include=object)
select_numerical = make_column_selector(dtype_include=np.number)

# Wrap each selector in a ColumnTransformer that passes the selected columns
# through unchanged, so each branch of the Split begins with its own column subset.
cat_selector = make_column_transformer(("passthrough", select_categories))
num_selector = make_column_transformer(("passthrough", select_numerical))

# Categorical branch: fixed constant-value imputation followed by one-hot encoding.
categorical_steps = [
    cat_selector,
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(drop="first", sparse_output=False),
]

# Numerical branch: the imputation strategy is part of the search space.
numeric_steps = [
    num_selector,
    Component(SimpleImputer, space={"strategy": ["mean", "median"]}),
]

split_imputation = Split(
    {"categories": categorical_steps, "numerics": numeric_steps},
    name="split_imputation",
)

# Display the Split node.
split_imputation
╭─ Split(split_imputation) ───────────────────────────────────────────────────────────────────────────────────────╮ │ ╭─ Sequential(categories) ───────────────────────────╮ ╭─ Sequential(numerics) ───────────────────────────────╮ │ │ │ ╭─ Fixed(ColumnTransformer) ─────────────────────╮ │ │ ╭─ Fixed(ColumnTransformer) ───────────────────────╮ │ │ │ │ │ item ColumnTransformer(transformers=[('passth… │ │ │ │ item ColumnTransformer(transformers=[('passthro… │ │ │ │ │ │ 'passthrough', │ │ │ │ 'passthrough', │ │ │ │ │ │ <sklear… │ │ │ │ <sklearn.… │ │ │ │ │ │ object at 0x7d354d946290>)]) │ │ │ │ object at 0x7d34edf94fa0>)]) │ │ │ │ │ ╰────────────────────────────────────────────────╯ │ │ ╰──────────────────────────────────────────────────╯ │ │ │ │ ↓ │ │ ↓ │ │ │ │ ╭─ Fixed(SimpleImputer) ─────────────────────────╮ │ │ ╭─ Component(SimpleImputer) ─────────────╮ │ │ │ │ │ item SimpleImputer(fill_value='missing', │ │ │ │ item class SimpleImputer(...) │ │ │ │ │ │ strategy='constant') │ │ │ │ space {'strategy': ['mean', 'median']} │ │ │ │ │ ╰────────────────────────────────────────────────╯ │ │ ╰────────────────────────────────────────╯ │ │ │ │ ↓ │ ╰──────────────────────────────────────────────────────╯ │ │ │ ╭─ Fixed(OneHotEncoder) ─────────────────────────╮ │ │ │ │ │ item OneHotEncoder(drop='first', │ │ │ │ │ │ sparse_output=False) │ │ │ │ │ ╰────────────────────────────────────────────────╯ │ │ │ ╰────────────────────────────────────────────────────╯ │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
from tpot2.builtin_modules import Passthrough, ZeroCount
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold, SelectKBest
# Feature selection stage: a Choice between two selectors, each with its own
# hyperparameter range.
variance_selector = Component(VarianceThreshold, space={"threshold": (0.1, 1)})
k_best_selector = Component(SelectKBest, space={"k": (1, 10)})
selectors = Choice(variance_selector, k_best_selector, name="selectors")

# Feature transformation stage: three branches inside a Split. No `config` is
# passed, so no per-branch column selection is configured here.
transformer_branches = {
    "passthrough": Passthrough(),
    "polynomial": Component(PolynomialFeatures, space={"degree": [2, 3]}),
    "zerocount": ZeroCount(),
}
transformers = Split(transformer_branches, name="transformers")
# Final stage: the pipeline can choose between two different estimators.
random_forest = Component(
    RandomForestClassifier,
    space={"n_estimators": (10, 100), "criterion": ["gini", "log_loss"]},
    config={"max_depth": 3},  # fixed setting, not part of the search space
)
support_vector = Component(SVC, space={"kernel": ["linear", "rbf", "poly"]})
estimator_choice = Choice(random_forest, support_vector, name="estimator")

# Chain the stages in order: imputation -> selection -> transformation -> estimator.
pipeline = (
    Sequential(name="my_pipeline")
    >> split_imputation
    >> selectors
    >> transformers
    >> estimator_choice
)

# Display the AMLTK pipeline.
pipeline
╭─ Sequential(my_pipeline) ───────────────────────────────────────────────────────────────────────────────────────╮ │ ╭─ Split(split_imputation) ───────────────────────────────────────────────────────────────────────────────────╮ │ │ │ ╭─ Sequential(categories) ─────────────────────────╮ ╭─ Sequential(numerics) ─────────────────────────────╮ │ │ │ │ │ ╭─ Fixed(ColumnTransformer) ───────────────────╮ │ │ ╭─ Fixed(ColumnTransformer) ─────────────────────╮ │ │ │ │ │ │ │ item ColumnTransformer(transformers=[('pass… │ │ │ │ item ColumnTransformer(transformers=[('passth… │ │ │ │ │ │ │ │ 'passthrough', │ │ │ │ 'passthrough', │ │ │ │ │ │ │ │ <skle… │ │ │ │ <sklear… │ │ │ │ │ │ │ │ object at 0x7d354d946290>)]) │ │ │ │ object at 0x7d34edf94fa0>)]) │ │ │ │ │ │ │ ╰──────────────────────────────────────────────╯ │ │ ╰────────────────────────────────────────────────╯ │ │ │ │ │ │ ↓ │ │ ↓ │ │ │ │ │ │ ╭─ Fixed(SimpleImputer) ───────────────────────╮ │ │ ╭─ Component(SimpleImputer) ─────────────╮ │ │ │ │ │ │ │ item SimpleImputer(fill_value='missing', │ │ │ │ item class SimpleImputer(...) │ │ │ │ │ │ │ │ strategy='constant') │ │ │ │ space {'strategy': ['mean', 'median']} │ │ │ │ │ │ │ ╰──────────────────────────────────────────────╯ │ │ ╰────────────────────────────────────────╯ │ │ │ │ │ │ ↓ │ ╰────────────────────────────────────────────────────╯ │ │ │ │ │ ╭─ Fixed(OneHotEncoder) ───────────────────────╮ │ │ │ │ │ │ │ item OneHotEncoder(drop='first', │ │ │ │ │ │ │ │ sparse_output=False) │ │ │ │ │ │ │ ╰──────────────────────────────────────────────╯ │ │ │ │ │ ╰──────────────────────────────────────────────────╯ │ │ │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ │ │ ↓ │ │ ╭─ Choice(selectors) ─────────────────────────────────────────────────────╮ │ │ │ ╭─ Component(SelectKBest) ─────╮ ╭─ Component(VarianceThreshold) ─────╮ │ │ │ │ │ item class SelectKBest(...) │ │ item class VarianceThreshold(...) 
│ │ │ │ │ │ space {'k': (1, 10)} │ │ space {'threshold': (0.1, 1)} │ │ │ │ │ ╰──────────────────────────────╯ ╰────────────────────────────────────╯ │ │ │ ╰─────────────────────────────────────────────────────────────────────────╯ │ │ ↓ │ │ ╭─ Split(transformers) ─────────────────────────────────────────────────────────────────────────────────╮ │ │ │ ╭─ Sequential(passthrough) ─╮ ╭─ Sequential(polynomial) ────────────────╮ ╭─ Sequential(zerocount) ─╮ │ │ │ │ │ ╭─ Fixed(Passthrough) ─╮ │ │ ╭─ Component(PolynomialFeatures) ─────╮ │ │ ╭─ Fixed(ZeroCount) ─╮ │ │ │ │ │ │ │ item Passthrough() │ │ │ │ item class PolynomialFeatures(...) │ │ │ │ item ZeroCount() │ │ │ │ │ │ │ ╰──────────────────────╯ │ │ │ space {'degree': [2, 3]} │ │ │ ╰────────────────────╯ │ │ │ │ │ ╰───────────────────────────╯ │ ╰─────────────────────────────────────╯ │ ╰─────────────────────────╯ │ │ │ │ ╰─────────────────────────────────────────╯ │ │ │ ╰───────────────────────────────────────────────────────────────────────────────────────────────────────╯ │ │ ↓ │ │ ╭─ Choice(estimator) ─────────────────────────────────────────────────────────────────────────────────────────╮ │ │ │ ╭─ Component(RandomForestClassifier) ──────────╮ ╭─ Component(SVC) ────────────────────────────╮ │ │ │ │ │ item class RandomForestClassifier(...) │ │ item class SVC(...) │ │ │ │ │ │ config {'max_depth': 3} │ │ space {'kernel': ['linear', 'rbf', 'poly']} │ │ │ │ │ │ space { │ ╰─────────────────────────────────────────────╯ │ │ │ │ │ 'n_estimators': (10, 100), │ │ │ │ │ │ 'criterion': [ │ │ │ │ │ │ 'gini', │ │ │ │ │ │ 'log_loss' │ │ │ │ │ │ ] │ │ │ │ │ │ } │ │ │ │ │ ╰──────────────────────────────────────────────╯ │ │ │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
# Convert the AMLTK pipeline definition into a TPOT2 search space.
tpot_search_space = tpot2.utils.tpot2_parser(pipeline)

# Sample one individual from the TPOT2 search space and export it as a pipeline.
sampled_individual = tpot_search_space.generate()
sampled_individual.export_pipeline()
Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>)])), ('simpleimputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehotencode... VarianceThreshold(threshold=0.6738938110936)), ('featureunion-2', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('passthrough', Passthrough())])), ('pipeline-2', Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3))])), ('pipeline-3', Pipeline(steps=[('zerocount', ZeroCount())]))])), ('randomforestclassifier', RandomForestClassifier(n_estimators=16))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>)])), ('simpleimputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehotencode... VarianceThreshold(threshold=0.6738938110936)), ('featureunion-2', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('passthrough', Passthrough())])), ('pipeline-2', Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3))])), ('pipeline-3', Pipeline(steps=[('zerocount', ZeroCount())]))])), ('randomforestclassifier', RandomForestClassifier(n_estimators=16))])
FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>)])), ('simpleimputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehotencoder', OneHotEncoder(drop='first', sparse_output=False))])), ('pipeline-2', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34edf94fa0>)])), ('simpleimputer', SimpleImputer(strategy='median'))]))])
ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>)])
<sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>
passthrough
SimpleImputer(fill_value='missing', strategy='constant')
OneHotEncoder(drop='first', sparse_output=False)
ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34edf94fa0>)])
<sklearn.compose._column_transformer.make_column_selector object at 0x7d34edf94fa0>
passthrough
SimpleImputer(strategy='median')
VarianceThreshold(threshold=0.6738938110936)
FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('passthrough', Passthrough())])), ('pipeline-2', Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3))])), ('pipeline-3', Pipeline(steps=[('zerocount', ZeroCount())]))])
Passthrough()
PolynomialFeatures(degree=3)
ZeroCount()
RandomForestClassifier(n_estimators=16)
# Configure a TPOT2 estimator that searches over the converted AMLTK space.
est = tpot2.TPOTEstimator(
    search_space=tpot_search_space,  # converted search space goes here
    classification=True,
    scorers=["roc_auc"],
    scorers_weights=[1],
    cv=5,
    population_size=10,
    generations=2,
    max_eval_time_mins=60 * 5,
    n_jobs=10,
    verbose=5,
)

# Run the search on the training split.
est.fit(X_train, y_train)
Generation: 50%|█████ | 1/2 [00:02<00:02, 2.60s/it]
Generation: 1 Best roc_auc_score score: 0.976
Generation: 100%|██████████| 2/2 [00:03<00:00, 1.57s/it] 2024-09-09 17:25:40,301 - distributed.scheduler - ERROR - Removing worker 'tcp://127.0.0.1:39897' caused the cluster to lose scattered data, which can't be recovered: {'ndarray-3f2f44921e6e9cc40ef07cfcd8ae90fb', 'DataFrame-5551f84174fd651642ff10eb71e30b22'} (stimulus_id='handle-worker-cleanup-1725927940.3010821')
Generation: 2 Best roc_auc_score score: 0.984
TPOTEstimator(classification=True, generations=2, max_eval_time_mins=300, n_jobs=10, population_size=10, scorers=['roc_auc'], scorers_weights=[1], search_space=<tpot2.search_spaces.pipelines.sequential.SequentialPipeline object at 0x7d34ec1efbb0>, verbose=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
TPOTEstimator(classification=True, generations=2, max_eval_time_mins=300, n_jobs=10, population_size=10, scorers=['roc_auc'], scorers_weights=[1], search_space=<tpot2.search_spaces.pipelines.sequential.SequentialPipeline object at 0x7d34ec1efbb0>, verbose=5)
# Display the pipeline stored on the estimator after fitting.
est.fitted_pipeline_
Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>)])), ('simpleimputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehotencode... VarianceThreshold(threshold=0.1557560591318)), ('featureunion-2', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('passthrough', Passthrough())])), ('pipeline-2', Pipeline(steps=[('polynomialfeatures', PolynomialFeatures())])), ('pipeline-3', Pipeline(steps=[('zerocount', ZeroCount())]))])), ('randomforestclassifier', RandomForestClassifier(criterion='log_loss', n_estimators=80))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>)])), ('simpleimputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehotencode... VarianceThreshold(threshold=0.1557560591318)), ('featureunion-2', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('passthrough', Passthrough())])), ('pipeline-2', Pipeline(steps=[('polynomialfeatures', PolynomialFeatures())])), ('pipeline-3', Pipeline(steps=[('zerocount', ZeroCount())]))])), ('randomforestclassifier', RandomForestClassifier(criterion='log_loss', n_estimators=80))])
FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>)])), ('simpleimputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehotencoder', OneHotEncoder(drop='first', sparse_output=False))])), ('pipeline-2', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307d30>)])), ('simpleimputer', SimpleImputer(strategy='median'))]))])
ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>)])
<sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>
passthrough
SimpleImputer(fill_value='missing', strategy='constant')
OneHotEncoder(drop='first', sparse_output=False)
ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307d30>)])
<sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307d30>
passthrough
SimpleImputer(strategy='median')
VarianceThreshold(threshold=0.1557560591318)
FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('passthrough', Passthrough())])), ('pipeline-2', Pipeline(steps=[('polynomialfeatures', PolynomialFeatures())])), ('pipeline-3', Pipeline(steps=[('zerocount', ZeroCount())]))])
Passthrough()
PolynomialFeatures()
ZeroCount()
RandomForestClassifier(criterion='log_loss', n_estimators=80)
# Predict labels for the held-out test split with the fitted estimator.
est.predict(X_test)
array([1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0])