# Source code for ai.recommender.average_recommender

"""~This file is part of the Aliro library~

Copyright (C) 2023 Epistasis Lab, 
Center for Artificial Intelligence Research and Education (CAIRE),
Department of Computational Biomedicine (CBM),
Cedars-Sinai Medical Center.

Aliro is maintained by:
    - Hyunjun Choi (hyunjun.choi@cshs.org)
    - Miguel Hernandez (miguel.e.hernandez@cshs.org)
    - Nick Matsumoto (nicholas.matsumoto@cshs.org)
    - Jay Moran (jay.moran@cshs.org)
    - and many other generous open source contributors

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.

(Autogenerated header, do not modify)

"""
"""
Recommender system for Aliro.
"""
import pdb  # NOTE(review): appears unused in this module -- likely a debugging leftover
import pandas as pd
from .base import BaseRecommender
import logging

# Module-level logger; records go to stderr as "module: LEVEL: message".
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
formatter = logging.Formatter('%(module)s: %(levelname)s: %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

class AverageRecommender(BaseRecommender):
    """Aliro average recommender.

    Recommends machine learning algorithms and parameters based on their
    average performance across all evaluated datasets.

    Parameters
    ----------
    ml_type: str, 'classifier' or 'regressor'
        Recommending classifiers or regressors. Used to determine ML options.

    metric: str (default: accuracy for classifiers, mse for regressors)
        The metric by which to assess performance on the datasets.
    """

    def __init__(self, ml_type='classifier', metric=None, ml_p=None,
                 random_state=None, knowledgebase_results=None,
                 knowledgebase_metafeatures=None,
                 load_serialized_rec="if_exists",
                 serialized_rec_directory=None,
                 serialized_rec_filename=None):
        """Initialize recommendation system.

        The recommender-specific defaults set here may be overwritten by the
        base class if a serialized recommender is loaded.
        """
        # Number of evaluations seen per algorithm-parameter combo.
        # Starts as the int 0; becomes a pandas Series after the first update.
        self.w = 0
        # Running average of self.metric per algorithm-parameter combo.
        # An explicit dtype avoids the deprecated object-dtype default for
        # empty Series in recent pandas versions.
        self.scores = pd.Series(dtype='float64')

        super().__init__(
            ml_type, metric, ml_p,
            random_state=random_state,
            knowledgebase_results=knowledgebase_results,
            knowledgebase_metafeatures=knowledgebase_metafeatures,
            load_serialized_rec=load_serialized_rec,
            serialized_rec_directory=serialized_rec_directory,
            serialized_rec_filename=serialized_rec_filename)

    def _train_empty_rec(self, ml_type, metric, ml_p, random_state,
                         knowledgebase_results, knowledgebase_metafeatures,
                         load_serialized_rec, serialized_rec_directory,
                         serialized_rec_filename):
        """Train a fresh (empty) recommender; delegates entirely to the base
        class implementation."""
        super()._train_empty_rec(
            ml_type, metric, ml_p, random_state,
            knowledgebase_results, knowledgebase_metafeatures,
            load_serialized_rec, serialized_rec_directory,
            serialized_rec_filename)

    def update(self, results_data, results_mf=None, source='pennai'):
        """Update ML / Parameter recommendations based on overall performance
        in results_data.

        Updates self.scores.

        Parameters
        ----------
        results_data: DataFrame with columns corresponding to:
                'dataset'
                'algorithm'
                'parameter_hash'
                self.metric
        results_mf: DataFrame, optional
            Metafeatures for the datasets in results_data (passed through to
            the base class).
        source: str (default: 'pennai')
            Origin tag for the results (passed through to the base class).
        """
        # update trained dataset models and hash table
        super().update(results_data, results_mf, source)

        # make combined data columns of classifiers and parameters
        # NOTE(review): this writes a new column into the caller's DataFrame
        # in place -- confirm callers do not rely on it staying unmodified
        results_data.loc[:, 'algorithm-parameters'] = (
            results_data['algorithm'].values + '|' +
            results_data['parameter_hash'].apply(str).values)

        # average metric value and evaluation count per combo
        grouped = results_data.groupby('algorithm-parameters')
        new_scores = grouped[self.metric].mean()
        new_weights = grouped.size()

        # fold the new averages into the running scores
        self._update_scores(new_scores, new_weights)

    def recommend(self, dataset_id=None, n_recs=1, dataset_mf=None):
        """Return a model and parameter values expected to do best on dataset.

        Parameters
        ----------
        dataset_id: string
            ID of the dataset for which the recommender is generating
            recommendations.
        n_recs: int (default: 1), optional
            Return a list of length n_recs in order of estimators and
            parameters expected to do best.
        dataset_mf: DataFrame, optional
            Metafeatures of the dataset (passed through to the base class).

        Returns
        -------
        (ml_rec, p_rec, rec_score): lists of algorithm names, parameter
            dicts, and their average scores, each of length n_recs.
        """
        # dataset hash table
        super().recommend(dataset_id, n_recs, dataset_mf)
        dataset_hash = dataset_id
        # return ML+P for best average y
        try:
            rec = self.scores.sort_values(ascending=False).index.values
            # if a dataset is specified, do not make recommendations for
            # algorithm-parameter combos that have already been run
            if dataset_id is not None:
                rec_filt = [r for r in rec
                            if dataset_hash + '|' + r
                            not in self.trained_dataset_models]
                if len(rec_filt) >= n_recs:
                    rec = rec_filt
                else:
                    logger.warning("can't filter recommendations, "
                                   "sending repeats")
            ml_rec = [r.split('|')[0] for r in rec[:n_recs]]
            phash_rec = [r.split('|')[1] for r in rec[:n_recs]]
            rec_score = [self.scores[r] for r in rec[:n_recs]]
        except AttributeError:
            # Lazy %-formatting: the previous code passed the value as an
            # extra positional argument with no placeholder, which the
            # logging module reports as a formatting error.  It also logged
            # `rec`, which is unbound when the first try-line raised.
            logger.error('self.scores: %s', self.scores)
            logger.error('self.w: %s', self.w)
            # re-raise the original exception with its message and traceback
            # (a bare `raise AttributeError` discarded both)
            raise

        # get parameters from hash table
        p_rec = [self.hash_2_param[p] for p in phash_rec]

        # update the recommender's memory with the new algorithm-parameter
        # combos that it recommended
        self._update_trained_dataset_models_from_rec(dataset_id, ml_rec,
                                                     phash_rec)

        return ml_rec, p_rec, rec_score

    def _update_scores(self, new_scores, new_weights):
        """Fold new_scores into the running averages in self.scores.

        Each existing entry is moved toward its new average by a step
        proportional to the share of new evaluations (incremental weighted
        mean); unseen combos are inserted as-is.  self.w accumulates the
        per-combo evaluation counts.
        """
        if len(self.scores.index) == 0:
            # first batch: adopt the new averages and counts wholesale
            self.scores = new_scores
            self.w = new_weights
        else:
            for n in new_scores.index.values:
                if n in self.scores.index.values:
                    # weighted incremental-mean update:
                    # avg <- avg + w_new/(w_old+w_new) * (new - avg)
                    step = new_weights[n] / float(self.w[n] + new_weights[n])
                    self.scores.loc[n] = (self.scores[n] +
                                          step * (new_scores[n] -
                                                  self.scores[n]))
                else:
                    self.scores.loc[n] = new_scores[n]
            # accumulate evaluation counts (new combos start from 0)
            self.w = self.w.add(new_weights, fill_value=0)