Skip to content

Graphsklearn

This file is part of the TPOT library.

The current version of TPOT was developed at Cedars-Sinai by:
- Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
- Anil Saini (anil.saini@cshs.org)
- Jose Hernandez (jgh9094@gmail.com)
- Jay Moran (jay.moran@cshs.org)
- Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
- Hyunjun Choi (hyunjun.choi@cshs.org)
- Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
- Jason Moore (moorejh28@gmail.com)

The original version of TPOT was primarily developed at the University of Pennsylvania by:
- Randal S. Olson (rso@randalolson.com)
- Weixuan Fu (weixuanf@upenn.edu)
- Daniel Angell (dpa34@drexel.edu)
- Jason Moore (moorejh28@gmail.com)
- and many more generous open-source contributors

TPOT is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

TPOT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License along with TPOT. If not, see http://www.gnu.org/licenses/.

GraphPipeline

Bases: _BaseComposition

Source code in tpot2/graphsklearn.py
class GraphPipeline(_BaseComposition):
    """Composite estimator whose steps are arranged as a directed graph.

    Every node of ``graph`` is expected to carry its estimator under the
    ``"instance"`` key (that is the key this class reads). The node that comes
    first in topological order is treated as the root: its ``predict``,
    ``predict_proba``, ``decision_function`` and ``transform`` are what this
    object exposes, fed with the inputs computed from the rest of the graph.
    """

    def __init__(
                self,
                graph,
                cross_val_predict_cv=0, #signature function(estimator, X, y=none)
                method='auto',
                memory=None,
                use_label_encoder=False,
                **kwargs,
                ):
        '''
        Set up a pipeline of sklearn estimators connected as a directed graph.

        Parameters
        ----------

        graph: networkx.DiGraph
            A directed graph where the nodes are sklearn estimators and the edges are the inputs to those estimators.

        cross_val_predict_cv: int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy used in inner classifiers or regressors

        method: str, optional
            The prediction method to use for the inner classifiers or regressors. If 'auto', it will try to use predict_proba, decision_function, or predict in that order.

        memory: str or object with the joblib.Memory interface, optional
            Used to cache the input and outputs of nodes to prevent refitting or computationally heavy transformations. By default, no caching is performed. If a string is given, it is the path to the caching directory.

        use_label_encoder: bool, optional
            If True, the label encoder is used to encode the labels to be 0 to N. If False, the label encoder is not used.
            Mainly useful for classifiers (XGBoost) that require labels to be ints from 0 to N.

            Can also be a sklearn.preprocessing.LabelEncoder object. If so, that label encoder is used.

        **kwargs
            Forwarded unchanged to the parent class constructor.

        Raises
        ------
        networkx.NetworkXUnfeasible
            If ``graph`` contains a cycle (raised while computing the topological order).

        '''
        super().__init__(**kwargs)

        self.graph = graph
        self.cross_val_predict_cv = cross_val_predict_cv
        self.method = method
        self.memory = memory
        self.use_label_encoder = use_label_encoder

        setup_ordered_successors(graph)

        # Consuming the topological sort is also the cycle check: networkx
        # raises NetworkXUnfeasible here when the graph is not acyclic, so no
        # separate (and previously ineffective) find_cycle probe is needed.
        self.topo_sorted_nodes = list(nx.topological_sort(self.graph))
        # Reverse so the first node in topological order ends up last; that
        # node is the root whose output is the pipeline's output.
        self.topo_sorted_nodes.reverse()

        self.root = self.topo_sorted_nodes[-1]

        if self.use_label_encoder:
            # isinstance (not an exact type match) so that LabelEncoder
            # subclasses passed by the caller are honoured as well.
            if isinstance(self.use_label_encoder, LabelEncoder):
                self.label_encoder = self.use_label_encoder
            else:
                self.label_encoder = LabelEncoder()

    def __str__(self):
        '''Return the graph's edges as a string, or its nodes when it has no edges.'''
        if len(self.graph.edges) > 0:
            return str(self.graph.edges)
        else:
            return str(self.graph.nodes)

    def fit(self, X, y):
        '''
        Fit every estimator in the graph.

        Parameters
        ----------
        X:
            Training data, passed through to ``fit_sklearn_digraph``.
        y:
            Targets. Encoded with the label encoder first when ``use_label_encoder`` is set.

        Returns
        -------
        self
        '''
        if self.use_label_encoder:
            if isinstance(self.use_label_encoder, LabelEncoder):
                # A caller-supplied encoder is only asked to transform, so it
                # must already have been fitted by the caller.
                y = self.label_encoder.transform(y)
            else:
                y = self.label_encoder.fit_transform(y)

        fit_sklearn_digraph(   graph=self.graph,
                                X=X,
                                y=y,
                                method=self.method,
                                cross_val_predict_cv = self.cross_val_predict_cv,
                                memory = self.memory,
                                topo_sort = self.topo_sorted_nodes,
                                )

        return self

    def plot(self, ):
        '''Draw the pipeline graph using the module-level ``plot`` helper.'''
        plot(graph = self.graph)

    def __sklearn_is_fitted__(self):
        '''Indicate whether pipeline has been fit.'''
        try:
            # check if the last step of the pipeline is fitted
            # we only check the last step since if the last step is fit, it
            # means the previous steps should also be fit. This is faster than
            # checking if every step of the pipeline is fit.
            sklearn.utils.validation.check_is_fitted(self.graph.nodes[self.root]["instance"])
            return True
        except sklearn.exceptions.NotFittedError:
            return False

    @available_if(_estimator_has('predict'))
    def predict(self, X, **predict_params):
        '''
        Call ``predict`` on the root estimator using the inputs the graph computes from X.

        Extra keyword arguments are forwarded to the root estimator's ``predict``.
        When ``use_label_encoder`` is set, predictions are mapped back to the
        original labels before being returned.
        '''
        this_X = get_inputs_to_node(self.graph,
                    X,
                    self.root,
                    method = self.method,
                    topo_sort = self.topo_sorted_nodes,
                    )

        preds = self.graph.nodes[self.root]["instance"].predict(this_X, **predict_params)

        if self.use_label_encoder:
            preds = self.label_encoder.inverse_transform(preds)

        return preds

    @available_if(_estimator_has('predict_proba'))
    def predict_proba(self, X, **predict_params):
        '''
        Call ``predict_proba`` on the root estimator using the inputs the graph computes from X.

        Extra keyword arguments are forwarded to the root estimator's ``predict_proba``.
        '''
        this_X = get_inputs_to_node(self.graph,
                    X,
                    self.root,
                    method = self.method,
                    topo_sort = self.topo_sorted_nodes,
                    )
        return self.graph.nodes[self.root]["instance"].predict_proba(this_X, **predict_params)

    @available_if(_estimator_has('decision_function'))
    def decision_function(self, X, **predict_params):
        '''
        Call ``decision_function`` on the root estimator using the inputs the graph computes from X.

        Extra keyword arguments are forwarded to the root estimator's ``decision_function``.
        '''
        this_X = get_inputs_to_node(self.graph,
                    X,
                    self.root,
                    method = self.method,
                    topo_sort = self.topo_sorted_nodes,
                    )
        return self.graph.nodes[self.root]["instance"].decision_function(this_X, **predict_params)

    @available_if(_estimator_has('transform'))
    def transform(self, X, **predict_params):
        '''
        Call ``transform`` on the root estimator using the inputs the graph computes from X.

        Extra keyword arguments are forwarded to the root estimator's ``transform``.
        '''
        this_X = get_inputs_to_node(self.graph,
                    X,
                    self.root,
                    method = self.method,
                    topo_sort = self.topo_sorted_nodes,
                    )
        return self.graph.nodes[self.root]["instance"].transform(this_X, **predict_params)

    @property
    def classes_(self):
        """The class labels. Only exists if the last step is a classifier."""

        if self.use_label_encoder:
            # Report the original (un-encoded) labels known to the encoder.
            return self.label_encoder.classes_
        else:
            return self.graph.nodes[self.root]["instance"].classes_

    @property
    def _estimator_type(self):
        """Estimator type reported by the root estimator."""
        return self.graph.nodes[self.root]["instance"]._estimator_type

classes_ property

The class labels. Only exists if the last step is a classifier.

__sklearn_is_fitted__()

Indicate whether pipeline has been fit.

Source code in tpot2/graphsklearn.py
def __sklearn_is_fitted__(self):
    '''Report whether the pipeline has been fitted.'''
    # Only the root estimator is inspected: if it has been fit, the steps
    # feeding it should have been fit as well, and checking a single node is
    # cheaper than walking the whole graph.
    root_estimator = self.graph.nodes[self.root]["instance"]
    try:
        sklearn.utils.validation.check_is_fitted(root_estimator)
    except sklearn.exceptions.NotFittedError:
        return False
    return True