AMLTK search space parser example

AMLTK (https://github.com/automl/amltk) provides a framework for developing AutoML systems. One component of this framework is its API for defining search spaces.

TPOT2 provides a function called `tpot2.utils.tpot2_parser`, which converts a search space defined with the AMLTK API into the search space class used by TPOT2. This allows users to define a single search space that can be used by both tools, facilitating fairer comparisons. Below is an example of a few search spaces defined with AMLTK and how to use them in TPOT2.

Note: this feature is still experimental and not all features present in the AMLTK API are fully supported in TPOT2 yet. (For example, automated splitting based on categorical vs numeric with amltk.pipeline.Split is not currently implemented in the parser.)

In [1]:

Copied!





from sklearn.compose import make_column_selector
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from amltk.pipeline import Choice, Component, Sequential, Split
import tpot2
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
import tpot2
import numpy as np
import sklearn
import sklearn.datasets
import pandas as pd
# Build a small toy dataset containing both numerical and categorical columns.
# Explicit import so the `sklearn.model_selection` attribute access below does
# not depend on some other package having already loaded that submodule.
import sklearn.model_selection

X, y = sklearn.datasets.make_classification(
    n_samples=100, n_features=5, n_informative=3, n_classes=2, random_state=42
)
X = pd.DataFrame(X, columns=[f"num_{i}" for i in range(5)])

# Add 5 categorical columns. A seeded generator keeps the example reproducible.
rng = np.random.default_rng(42)
for i in range(5):
    X[f"cat_{i}"] = rng.choice(["A", "B", "C"], size=100)
y = y.flatten()  # make sure the target is 1-D

# Train/test split (seeded for the same reason as above).
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.5, random_state=42
)

# TODO: implement support for this condition. Selecting columns through the
# `config` argument of `Split` is not yet handled by the TPOT2 parser:
# select_categories = make_column_selector(dtype_include=object)
# select_numerical = make_column_selector(dtype_include=np.number)

# split_imputation = Split(
#     {
#         "categories": [SimpleImputer(strategy="constant", fill_value="missing"), OneHotEncoder(drop="first")],
#         "numerics": Component(SimpleImputer, space={"strategy": ["mean", "median"]}),
#     },
#     config={"categories": select_categories, "numerics": select_numerical}, #not yet supported
#     name="feature_preprocessing",
# )
# split_imputation

# Workaround: start each branch with a ColumnTransformer that passes through
# only the columns matched by the corresponding dtype selector.
select_categories = make_column_selector(dtype_include=object)
select_numerical = make_column_selector(dtype_include=np.number)

cat_selector = make_column_transformer(("passthrough", select_categories))
num_selector = make_column_transformer(("passthrough", select_numerical))

split_imputation = Split(
    {
        # Categorical branch: fixed (non-searched) imputer and one-hot encoder.
        "categories": [
            cat_selector,
            SimpleImputer(strategy="constant", fill_value="missing"),
            OneHotEncoder(drop="first", sparse_output=False),
        ],
        # Numerical branch: the imputation strategy is part of the search space.
        "numerics": [
            num_selector,
            Component(SimpleImputer, space={"strategy": ["mean", "median"]}),
        ],
    },
    name="split_imputation",
)
# Bare expression so the notebook renders the node.
split_imputation
from sklearn.compose import make_column_selector
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from amltk.pipeline import Choice, Component, Sequential, Split
import tpot2
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
import tpot2
import numpy as np
import sklearn
import sklearn.datasets
import pandas as pd
# Build a small toy dataset containing both numerical and categorical columns.
# Explicit import so the `sklearn.model_selection` attribute access below does
# not depend on some other package having already loaded that submodule.
import sklearn.model_selection

X, y = sklearn.datasets.make_classification(
    n_samples=100, n_features=5, n_informative=3, n_classes=2, random_state=42
)
X = pd.DataFrame(X, columns=[f"num_{i}" for i in range(5)])

# Add 5 categorical columns. A seeded generator keeps the example reproducible.
rng = np.random.default_rng(42)
for i in range(5):
    X[f"cat_{i}"] = rng.choice(["A", "B", "C"], size=100)
y = y.flatten()  # make sure the target is 1-D

# Train/test split (seeded for the same reason as above).
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.5, random_state=42
)

# TODO: implement support for this condition. Selecting columns through the
# `config` argument of `Split` is not yet handled by the TPOT2 parser:
# select_categories = make_column_selector(dtype_include=object)
# select_numerical = make_column_selector(dtype_include=np.number)

# split_imputation = Split(
#     {
#         "categories": [SimpleImputer(strategy="constant", fill_value="missing"), OneHotEncoder(drop="first")],
#         "numerics": Component(SimpleImputer, space={"strategy": ["mean", "median"]}),
#     },
#     config={"categories": select_categories, "numerics": select_numerical}, #not yet supported
#     name="feature_preprocessing",
# )
# split_imputation

# Workaround: start each branch with a ColumnTransformer that passes through
# only the columns matched by the corresponding dtype selector.
select_categories = make_column_selector(dtype_include=object)
select_numerical = make_column_selector(dtype_include=np.number)

cat_selector = make_column_transformer(("passthrough", select_categories))
num_selector = make_column_transformer(("passthrough", select_numerical))

split_imputation = Split(
    {
        # Categorical branch: fixed (non-searched) imputer and one-hot encoder.
        "categories": [
            cat_selector,
            SimpleImputer(strategy="constant", fill_value="missing"),
            OneHotEncoder(drop="first", sparse_output=False),
        ],
        # Numerical branch: the imputation strategy is part of the search space.
        "numerics": [
            num_selector,
            Component(SimpleImputer, space={"strategy": ["mean", "median"]}),
        ],
    },
    name="split_imputation",
)
# Bare expression so the notebook renders the node.
split_imputation

╭─ Split(split_imputation) ───────────────────────────────────────────────────────────────────────────────────────╮
│ ╭─ Sequential(categories) ───────────────────────────╮ ╭─ Sequential(numerics) ───────────────────────────────╮ │
│ │ ╭─ Fixed(ColumnTransformer) ─────────────────────╮ │ │ ╭─ Fixed(ColumnTransformer) ───────────────────────╮ │ │
│ │ │ item ColumnTransformer(transformers=[('passth… │ │ │ │ item ColumnTransformer(transformers=[('passthro… │ │ │
│ │ │      'passthrough',                            │ │ │ │      'passthrough',                              │ │ │
│ │ │                                       <sklear… │ │ │ │                                       <sklearn.… │ │ │
│ │ │      object at 0x7d354d946290>)])              │ │ │ │      object at 0x7d34edf94fa0>)])                │ │ │
│ │ ╰────────────────────────────────────────────────╯ │ │ ╰──────────────────────────────────────────────────╯ │ │
│ │                         ↓                          │ │                          ↓                           │ │
│ │ ╭─ Fixed(SimpleImputer) ─────────────────────────╮ │ │ ╭─ Component(SimpleImputer) ─────────────╮           │ │
│ │ │ item SimpleImputer(fill_value='missing',       │ │ │ │ item  class SimpleImputer(...)         │           │ │
│ │ │      strategy='constant')                      │ │ │ │ space {'strategy': ['mean', 'median']} │           │ │
│ │ ╰────────────────────────────────────────────────╯ │ │ ╰────────────────────────────────────────╯           │ │
│ │                         ↓                          │ ╰──────────────────────────────────────────────────────╯ │
│ │ ╭─ Fixed(OneHotEncoder) ─────────────────────────╮ │                                                          │
│ │ │ item OneHotEncoder(drop='first',               │ │                                                          │
│ │ │      sparse_output=False)                      │ │                                                          │
│ │ ╰────────────────────────────────────────────────╯ │                                                          │
│ ╰────────────────────────────────────────────────────╯                                                          │
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯

Out[1]:

In [2]:

Copied!





from tpot2.builtin_modules import Passthrough, ZeroCount
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA

from sklearn.feature_selection import VarianceThreshold, SelectKBest

# Feature selection step: a Choice between two selectors, each with one
# searchable hyperparameter.
selectors = Choice(
    Component(VarianceThreshold, space={"threshold": (0.1, 1)}),
    Component(SelectKBest, space={"k": (1, 10)}),
    name="selectors",
)

# Feature construction step: three named branches inside a single Split.
transformers = Split(
    {
        "passthrough": Passthrough(),
        "polynomial": Component(PolynomialFeatures, space={"degree": [2, 3]}),
        "zerocount": ZeroCount(),
    },
    name="transformers",
)

# Candidate final estimators. `space` lists the searched hyperparameters,
# `config` pins a fixed value.
random_forest = Component(
    RandomForestClassifier,
    space={"n_estimators": (10, 100), "criterion": ["gini", "log_loss"]},
    config={"max_depth": 3},
)
svc = Component(SVC, space={"kernel": ["linear", "rbf", "poly"]})

# Our pipeline can choose between two different estimators.
estimator = Choice(random_forest, svc, name="estimator")

# Chain the steps in order: imputation -> selection -> transformation -> estimator.
pipeline = (
    Sequential(name="my_pipeline")
    >> split_imputation
    >> selectors
    >> transformers
    >> estimator
)

# Display the amltk Pipeline
pipeline
from tpot2.builtin_modules import Passthrough, ZeroCount
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA

from sklearn.feature_selection import VarianceThreshold, SelectKBest

# Feature selection step: a Choice between two selectors, each with one
# searchable hyperparameter.
selectors = Choice(
    Component(VarianceThreshold, space={"threshold": (0.1, 1)}),
    Component(SelectKBest, space={"k": (1, 10)}),
    name="selectors",
)

# Feature construction step: three named branches inside a single Split.
transformers = Split(
    {
        "passthrough": Passthrough(),
        "polynomial": Component(PolynomialFeatures, space={"degree": [2, 3]}),
        "zerocount": ZeroCount(),
    },
    name="transformers",
)

# Candidate final estimators. `space` lists the searched hyperparameters,
# `config` pins a fixed value.
random_forest = Component(
    RandomForestClassifier,
    space={"n_estimators": (10, 100), "criterion": ["gini", "log_loss"]},
    config={"max_depth": 3},
)
svc = Component(SVC, space={"kernel": ["linear", "rbf", "poly"]})

# Our pipeline can choose between two different estimators.
estimator = Choice(random_forest, svc, name="estimator")

# Chain the steps in order: imputation -> selection -> transformation -> estimator.
pipeline = (
    Sequential(name="my_pipeline")
    >> split_imputation
    >> selectors
    >> transformers
    >> estimator
)

# Display the amltk Pipeline
pipeline

╭─ Sequential(my_pipeline) ───────────────────────────────────────────────────────────────────────────────────────╮
│ ╭─ Split(split_imputation) ───────────────────────────────────────────────────────────────────────────────────╮ │
│ │ ╭─ Sequential(categories) ─────────────────────────╮ ╭─ Sequential(numerics) ─────────────────────────────╮ │ │
│ │ │ ╭─ Fixed(ColumnTransformer) ───────────────────╮ │ │ ╭─ Fixed(ColumnTransformer) ─────────────────────╮ │ │ │
│ │ │ │ item ColumnTransformer(transformers=[('pass… │ │ │ │ item ColumnTransformer(transformers=[('passth… │ │ │ │
│ │ │ │      'passthrough',                          │ │ │ │      'passthrough',                            │ │ │ │
│ │ │ │                                       <skle… │ │ │ │                                       <sklear… │ │ │ │
│ │ │ │      object at 0x7d354d946290>)])            │ │ │ │      object at 0x7d34edf94fa0>)])              │ │ │ │
│ │ │ ╰──────────────────────────────────────────────╯ │ │ ╰────────────────────────────────────────────────╯ │ │ │
│ │ │                        ↓                         │ │                         ↓                          │ │ │
│ │ │ ╭─ Fixed(SimpleImputer) ───────────────────────╮ │ │ ╭─ Component(SimpleImputer) ─────────────╮         │ │ │
│ │ │ │ item SimpleImputer(fill_value='missing',     │ │ │ │ item  class SimpleImputer(...)         │         │ │ │
│ │ │ │      strategy='constant')                    │ │ │ │ space {'strategy': ['mean', 'median']} │         │ │ │
│ │ │ ╰──────────────────────────────────────────────╯ │ │ ╰────────────────────────────────────────╯         │ │ │
│ │ │                        ↓                         │ ╰────────────────────────────────────────────────────╯ │ │
│ │ │ ╭─ Fixed(OneHotEncoder) ───────────────────────╮ │                                                        │ │
│ │ │ │ item OneHotEncoder(drop='first',             │ │                                                        │ │
│ │ │ │      sparse_output=False)                    │ │                                                        │ │
│ │ │ ╰──────────────────────────────────────────────╯ │                                                        │ │
│ │ ╰──────────────────────────────────────────────────╯                                                        │ │
│ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ │
│                                                        ↓                                                        │
│ ╭─ Choice(selectors) ─────────────────────────────────────────────────────╮                                     │
│ │ ╭─ Component(SelectKBest) ─────╮ ╭─ Component(VarianceThreshold) ─────╮ │                                     │
│ │ │ item  class SelectKBest(...) │ │ item  class VarianceThreshold(...) │ │                                     │
│ │ │ space {'k': (1, 10)}         │ │ space {'threshold': (0.1, 1)}      │ │                                     │
│ │ ╰──────────────────────────────╯ ╰────────────────────────────────────╯ │                                     │
│ ╰─────────────────────────────────────────────────────────────────────────╯                                     │
│                                                        ↓                                                        │
│ ╭─ Split(transformers) ─────────────────────────────────────────────────────────────────────────────────╮       │
│ │ ╭─ Sequential(passthrough) ─╮ ╭─ Sequential(polynomial) ────────────────╮ ╭─ Sequential(zerocount) ─╮ │       │
│ │ │ ╭─ Fixed(Passthrough) ─╮  │ │ ╭─ Component(PolynomialFeatures) ─────╮ │ │ ╭─ Fixed(ZeroCount) ─╮  │ │       │
│ │ │ │ item Passthrough()   │  │ │ │ item  class PolynomialFeatures(...) │ │ │ │ item ZeroCount()   │  │ │       │
│ │ │ ╰──────────────────────╯  │ │ │ space {'degree': [2, 3]}            │ │ │ ╰────────────────────╯  │ │       │
│ │ ╰───────────────────────────╯ │ ╰─────────────────────────────────────╯ │ ╰─────────────────────────╯ │       │
│ │                               ╰─────────────────────────────────────────╯                             │       │
│ ╰───────────────────────────────────────────────────────────────────────────────────────────────────────╯       │
│                                                        ↓                                                        │
│ ╭─ Choice(estimator) ─────────────────────────────────────────────────────────────────────────────────────────╮ │
│ │ ╭─ Component(RandomForestClassifier) ──────────╮ ╭─ Component(SVC) ────────────────────────────╮            │ │
│ │ │ item   class RandomForestClassifier(...)     │ │ item  class SVC(...)                        │            │ │
│ │ │ config {'max_depth': 3}                      │ │ space {'kernel': ['linear', 'rbf', 'poly']} │            │ │
│ │ │ space  {                                     │ ╰─────────────────────────────────────────────╯            │ │
│ │ │            'n_estimators': (10, 100),        │                                                            │ │
│ │ │            'criterion': [                    │                                                            │ │
│ │ │                'gini',                       │                                                            │ │
│ │ │                'log_loss'                    │                                                            │ │
│ │ │            ]                                 │                                                            │ │
│ │ │        }                                     │                                                            │ │
│ │ ╰──────────────────────────────────────────────╯                                                            │ │
│ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ │
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯

Out[2]:

In [3]:

Copied!

# Translate the AMLTK pipeline definition into a TPOT2 search space.
tpot_search_space = tpot2.utils.tpot2_parser(pipeline)

# Sample one individual from that search space and show it as a pipeline.
sampled_individual = tpot_search_space.generate()
sampled_individual.export_pipeline()
# Translate the AMLTK pipeline definition into a TPOT2 search space.
tpot_search_space = tpot2.utils.tpot2_parser(pipeline)

# Sample one individual from that search space and show it as a pipeline.
sampled_individual = tpot_search_space.generate()
sampled_individual.export_pipeline()

Out[3]:

Pipeline(steps=[('featureunion-1',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columntransformer',
                                                                  ColumnTransformer(transformers=[('passthrough',
                                                                                                   'passthrough',
                                                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>)])),
                                                                 ('simpleimputer',
                                                                  SimpleImputer(fill_value='missing',
                                                                                strategy='constant')),
                                                                 ('onehotencode...
                 VarianceThreshold(threshold=0.6738938110936)),
                ('featureunion-2',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('passthrough',
                                                                  Passthrough())])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('polynomialfeatures',
                                                                  PolynomialFeatures(degree=3))])),
                                                ('pipeline-3',
                                                 Pipeline(steps=[('zerocount',
                                                                  ZeroCount())]))])),
                ('randomforestclassifier',
                 RandomForestClassifier(n_estimators=16))])

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Pipeline?Documentation for PipelineiNot fitted

Pipeline(steps=[('featureunion-1',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columntransformer',
                                                                  ColumnTransformer(transformers=[('passthrough',
                                                                                                   'passthrough',
                                                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>)])),
                                                                 ('simpleimputer',
                                                                  SimpleImputer(fill_value='missing',
                                                                                strategy='constant')),
                                                                 ('onehotencode...
                 VarianceThreshold(threshold=0.6738938110936)),
                ('featureunion-2',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('passthrough',
                                                                  Passthrough())])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('polynomialfeatures',
                                                                  PolynomialFeatures(degree=3))])),
                                                ('pipeline-3',
                                                 Pipeline(steps=[('zerocount',
                                                                  ZeroCount())]))])),
                ('randomforestclassifier',
                 RandomForestClassifier(n_estimators=16))])

featureunion-1: FeatureUnion?Documentation for featureunion-1: FeatureUnion

FeatureUnion(transformer_list=[('pipeline-1',
                                Pipeline(steps=[('columntransformer',
                                                 ColumnTransformer(transformers=[('passthrough',
                                                                                  'passthrough',
                                                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>)])),
                                                ('simpleimputer',
                                                 SimpleImputer(fill_value='missing',
                                                               strategy='constant')),
                                                ('onehotencoder',
                                                 OneHotEncoder(drop='first',
                                                               sparse_output=False))])),
                               ('pipeline-2',
                                Pipeline(steps=[('columntransformer',
                                                 ColumnTransformer(transformers=[('passthrough',
                                                                                  'passthrough',
                                                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7d34edf94fa0>)])),
                                                ('simpleimputer',
                                                 SimpleImputer(strategy='median'))]))])

pipeline-1

columntransformer: ColumnTransformer?Documentation for columntransformer: ColumnTransformer

ColumnTransformer(transformers=[('passthrough', 'passthrough',
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>)])

passthrough

<sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>

passthrough

passthrough

SimpleImputer?Documentation for SimpleImputer

SimpleImputer(fill_value='missing', strategy='constant')

OneHotEncoder?Documentation for OneHotEncoder

OneHotEncoder(drop='first', sparse_output=False)

pipeline-2

columntransformer: ColumnTransformer?Documentation for columntransformer: ColumnTransformer

ColumnTransformer(transformers=[('passthrough', 'passthrough',
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7d34edf94fa0>)])

passthrough

<sklearn.compose._column_transformer.make_column_selector object at 0x7d34edf94fa0>

passthrough

passthrough

SimpleImputer?Documentation for SimpleImputer

SimpleImputer(strategy='median')

VarianceThreshold?Documentation for VarianceThreshold

VarianceThreshold(threshold=0.6738938110936)

featureunion-2: FeatureUnion?Documentation for featureunion-2: FeatureUnion

FeatureUnion(transformer_list=[('pipeline-1',
                                Pipeline(steps=[('passthrough',
                                                 Passthrough())])),
                               ('pipeline-2',
                                Pipeline(steps=[('polynomialfeatures',
                                                 PolynomialFeatures(degree=3))])),
                               ('pipeline-3',
                                Pipeline(steps=[('zerocount', ZeroCount())]))])

pipeline-1

Passthrough

Passthrough()

pipeline-2

PolynomialFeatures?Documentation for PolynomialFeatures

PolynomialFeatures(degree=3)

pipeline-3

ZeroCount

ZeroCount()

RandomForestClassifier?Documentation for RandomForestClassifier

RandomForestClassifier(n_estimators=16)

In [4]:

Copied!





# Configure a short TPOT2 run (population of 10, 2 generations) that searches
# the space converted from the AMLTK pipeline.
est = tpot2.TPOTEstimator(
    # What to search
    search_space=tpot_search_space,  # converted search space goes here
    classification=True,
    # How candidates are scored
    scorers=["roc_auc"],
    scorers_weights=[1],
    cv=5,
    # Search budget
    population_size=10,
    generations=2,
    max_eval_time_mins=5 * 60,
    # Execution / logging
    n_jobs=10,
    verbose=5,
)

# Run the search on the training split.
est.fit(X_train, y_train)



# Configure a short TPOT2 run (population of 10, 2 generations) that searches
# the space converted from the AMLTK pipeline.
est = tpot2.TPOTEstimator(
    # What to search
    search_space=tpot_search_space,  # converted search space goes here
    classification=True,
    # How candidates are scored
    scorers=["roc_auc"],
    scorers_weights=[1],
    cv=5,
    # Search budget
    population_size=10,
    generations=2,
    max_eval_time_mins=5 * 60,
    # Execution / logging
    n_jobs=10,
    verbose=5,
)

# Run the search on the training split.
est.fit(X_train, y_train)

Generation:  50%|█████     | 1/2 [00:02<00:02,  2.60s/it]

Generation:  1
Best roc_auc_score score: 0.976

Generation: 100%|██████████| 2/2 [00:03<00:00,  1.57s/it]
2024-09-09 17:25:40,301 - distributed.scheduler - ERROR - Removing worker 'tcp://127.0.0.1:39897' caused the cluster to lose scattered data, which can't be recovered: {'ndarray-3f2f44921e6e9cc40ef07cfcd8ae90fb', 'DataFrame-5551f84174fd651642ff10eb71e30b22'} (stimulus_id='handle-worker-cleanup-1725927940.3010821')

Generation:  2
Best roc_auc_score score: 0.984

Out[4]:

TPOTEstimator(classification=True, generations=2, max_eval_time_mins=300,
              n_jobs=10, population_size=10, scorers=['roc_auc'],
              scorers_weights=[1],
              search_space=<tpot2.search_spaces.pipelines.sequential.SequentialPipeline object at 0x7d34ec1efbb0>,
              verbose=5)

TPOTEstimatoriFitted

TPOTEstimator(classification=True, generations=2, max_eval_time_mins=300,
              n_jobs=10, population_size=10, scorers=['roc_auc'],
              scorers_weights=[1],
              search_space=<tpot2.search_spaces.pipelines.sequential.SequentialPipeline object at 0x7d34ec1efbb0>,
              verbose=5)

In [5]:

Copied!

# Show the fitted scikit-learn pipeline stored on the estimator after `fit`.
est.fitted_pipeline_
# Show the fitted scikit-learn pipeline stored on the estimator after `fit`.
est.fitted_pipeline_

Out[5]:

Pipeline(steps=[('featureunion-1',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columntransformer',
                                                                  ColumnTransformer(transformers=[('passthrough',
                                                                                                   'passthrough',
                                                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>)])),
                                                                 ('simpleimputer',
                                                                  SimpleImputer(fill_value='missing',
                                                                                strategy='constant')),
                                                                 ('onehotencode...
                 VarianceThreshold(threshold=0.1557560591318)),
                ('featureunion-2',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('passthrough',
                                                                  Passthrough())])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('polynomialfeatures',
                                                                  PolynomialFeatures())])),
                                                ('pipeline-3',
                                                 Pipeline(steps=[('zerocount',
                                                                  ZeroCount())]))])),
                ('randomforestclassifier',
                 RandomForestClassifier(criterion='log_loss',
                                        n_estimators=80))])

Pipeline?Documentation for PipelineiFitted

Pipeline(steps=[('featureunion-1',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columntransformer',
                                                                  ColumnTransformer(transformers=[('passthrough',
                                                                                                   'passthrough',
                                                                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>)])),
                                                                 ('simpleimputer',
                                                                  SimpleImputer(fill_value='missing',
                                                                                strategy='constant')),
                                                                 ('onehotencode...
                 VarianceThreshold(threshold=0.1557560591318)),
                ('featureunion-2',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('passthrough',
                                                                  Passthrough())])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('polynomialfeatures',
                                                                  PolynomialFeatures())])),
                                                ('pipeline-3',
                                                 Pipeline(steps=[('zerocount',
                                                                  ZeroCount())]))])),
                ('randomforestclassifier',
                 RandomForestClassifier(criterion='log_loss',
                                        n_estimators=80))])

featureunion-1: FeatureUnion?Documentation for featureunion-1: FeatureUnion

FeatureUnion(transformer_list=[('pipeline-1',
                                Pipeline(steps=[('columntransformer',
                                                 ColumnTransformer(transformers=[('passthrough',
                                                                                  'passthrough',
                                                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>)])),
                                                ('simpleimputer',
                                                 SimpleImputer(fill_value='missing',
                                                               strategy='constant')),
                                                ('onehotencoder',
                                                 OneHotEncoder(drop='first',
                                                               sparse_output=False))])),
                               ('pipeline-2',
                                Pipeline(steps=[('columntransformer',
                                                 ColumnTransformer(transformers=[('passthrough',
                                                                                  'passthrough',
                                                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307d30>)])),
                                                ('simpleimputer',
                                                 SimpleImputer(strategy='median'))]))])

pipeline-1

columntransformer: ColumnTransformer?Documentation for columntransformer: ColumnTransformer

ColumnTransformer(transformers=[('passthrough', 'passthrough',
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>)])

passthrough

<sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>

passthrough

passthrough

SimpleImputer?Documentation for SimpleImputer

SimpleImputer(fill_value='missing', strategy='constant')

OneHotEncoder?Documentation for OneHotEncoder

OneHotEncoder(drop='first', sparse_output=False)

pipeline-2

columntransformer: ColumnTransformer?Documentation for columntransformer: ColumnTransformer

ColumnTransformer(transformers=[('passthrough', 'passthrough',
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307d30>)])

passthrough

<sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307d30>

passthrough

passthrough

SimpleImputer?Documentation for SimpleImputer

SimpleImputer(strategy='median')

VarianceThreshold?Documentation for VarianceThreshold

VarianceThreshold(threshold=0.1557560591318)

featureunion-2: FeatureUnion?Documentation for featureunion-2: FeatureUnion

FeatureUnion(transformer_list=[('pipeline-1',
                                Pipeline(steps=[('passthrough',
                                                 Passthrough())])),
                               ('pipeline-2',
                                Pipeline(steps=[('polynomialfeatures',
                                                 PolynomialFeatures())])),
                               ('pipeline-3',
                                Pipeline(steps=[('zerocount', ZeroCount())]))])

pipeline-1

Passthrough

Passthrough()

pipeline-2

PolynomialFeatures?Documentation for PolynomialFeatures

PolynomialFeatures()

pipeline-3

ZeroCount

ZeroCount()

RandomForestClassifier?Documentation for RandomForestClassifier

RandomForestClassifier(criterion='log_loss', n_estimators=80)

In [6]:

Copied!

# Predict class labels for the held-out test split.
est.predict(X_test)
# Predict class labels for the held-out test split.
est.predict(X_test)

Out[6]:

array([1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0])