"""~This file is part of the Aliro library~
Copyright (C) 2023 Epistasis Lab,
Center for Artificial Intelligence Research and Education (CAIRE),
Department of Computational Biomedicine (CBM),
Cedars-Sinai Medical Center.
Aliro is maintained by:
- Hyunjun Choi (hyunjun.choi@cshs.org)
- Miguel Hernandez (miguel.e.hernandez@cshs.org)
- Nick Matsumoto (nicholas.matsumoto@cshs.org)
- Jay Moran (jay.moran@cshs.org)
- and many other generous open source contributors
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
(Autogenerated header, do not modify)
"""
"""
Recommender system for Aliro.
"""
import pdb
import pandas as pd
from .base import BaseRecommender
import logging
# Module-level logger configured with a stream handler so recommender
# messages are visible even if the application has not set up logging.
# NOTE(review): adding a handler at import time in a library module can
# cause duplicate log lines if the application also configures handlers —
# confirm this is intentional.
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
formatter = logging.Formatter('%(module)s: %(levelname)s: %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
class AverageRecommender(BaseRecommender):
    """Aliro average recommender.

    Recommends machine learning algorithms and parameters based on their
    average performance across all evaluated datasets.

    Parameters
    ----------
    ml_type: str, 'classifier' or 'regressor'
        Recommending classifiers or regressors. Used to determine ML options.
    metric: str (default: accuracy for classifiers, mse for regressors)
        The metric by which to assess performance on the datasets.
    """

    def __init__(self,
                 ml_type='classifier',
                 metric=None,
                 ml_p=None,
                 random_state=None,
                 knowledgebase_results=None,
                 knowledgebase_metafeatures=None,
                 load_serialized_rec="if_exists",
                 serialized_rec_directory=None,
                 serialized_rec_filename=None):
        """Initialize recommendation system.

        Recommender-specific defaults set here may be overwritten when the
        base class loads a serialized recommender.
        """
        # number of results seen so far per algorithm-parameter combo;
        # starts as the scalar 0 and becomes a pandas Series of counts
        # after the first call to _update_scores
        self.w = 0
        # running mean of self.metric, indexed by 'algorithm|parameter_hash'.
        # Explicit dtype avoids the pandas deprecation warning for an empty
        # Series defaulting to object dtype; the Series is wholly replaced
        # on the first update, so the dtype choice does not affect results.
        self.scores = pd.Series(dtype=float)
        super().__init__(
            ml_type,
            metric,
            ml_p,
            random_state=random_state,
            knowledgebase_results=knowledgebase_results,
            knowledgebase_metafeatures=knowledgebase_metafeatures,
            load_serialized_rec=load_serialized_rec,
            serialized_rec_directory=serialized_rec_directory,
            serialized_rec_filename=serialized_rec_filename)

    def _train_empty_rec(self,
                         ml_type,
                         metric,
                         ml_p,
                         random_state,
                         knowledgebase_results,
                         knowledgebase_metafeatures,
                         load_serialized_rec,
                         serialized_rec_directory,
                         serialized_rec_filename):
        """Train an empty recommender; delegates entirely to the base class."""
        super()._train_empty_rec(
            ml_type,
            metric,
            ml_p,
            random_state,
            knowledgebase_results,
            knowledgebase_metafeatures,
            load_serialized_rec,
            serialized_rec_directory,
            serialized_rec_filename)

    def update(self, results_data, results_mf=None, source='pennai'):
        """Update ML / Parameter recommendations based on overall performance
        in results_data.

        Updates self.scores and self.w. Note that an 'algorithm-parameters'
        column is added to results_data in place.

        Parameters
        ----------
        results_data: DataFrame with columns corresponding to:
            'dataset'
            'algorithm'
            'parameters'
            self.metric
        """
        # update trained dataset models and hash table
        super().update(results_data, results_mf, source)
        # make combined data column of classifiers and parameters, keyed as
        # 'algorithm|parameter_hash' to match the index used by self.scores
        results_data.loc[:, 'algorithm-parameters'] = (
            results_data['algorithm'].values + '|' +
            results_data['parameter_hash'].apply(str).values)
        # get average performance (self.metric) by algorithm-parameter combo
        new_scores = results_data.groupby(
            'algorithm-parameters')[self.metric].mean()
        # number of results contributing to each new average, used to weight
        # the running-mean update
        new_weights = results_data.groupby('algorithm-parameters').size()
        # update scores
        self._update_scores(new_scores, new_weights)

    def recommend(self, dataset_id=None, n_recs=1, dataset_mf=None):
        """Return a model and parameter values expected to do best on dataset.

        Parameters
        ----------
        dataset_id: string
            ID of the dataset for which the recommender is generating
            recommendations.
        n_recs: int (default: 1), optional
            Return a list of length n_recs in order of estimators and
            parameters expected to do best.

        Returns
        -------
        (ml_rec, p_rec, rec_score): lists of algorithm names, parameter
        dicts, and the corresponding average scores.
        """
        # dataset hash table
        super().recommend(dataset_id, n_recs, dataset_mf)
        dataset_hash = dataset_id
        rec = None
        # return ML+P for best average y
        try:
            rec = self.scores.sort_values(ascending=False).index.values
            # if a dataset is specified, do not make recommendations for
            # algorithm-parameter combos that have already been run
            if dataset_id is not None:
                rec_filt = [r for r in rec if dataset_hash + '|' + r not in
                            self.trained_dataset_models]
                if len(rec_filt) >= n_recs:
                    rec = rec_filt
                else:
                    logger.warning("can't filter recommendations, "
                                   "sending repeats")
            ml_rec = [r.split('|')[0] for r in rec[:n_recs]]
            phash_rec = [r.split('|')[1] for r in rec[:n_recs]]
            rec_score = [self.scores[r] for r in rec[:n_recs]]
        except AttributeError:
            # use lazy %-style arguments so logging renders the values
            # (passing them positionally without placeholders is a
            # formatting error); rec may still be None if sort_values failed
            logger.error('rec: %s', rec)
            logger.error('self.scores: %s', self.scores)
            logger.error('self.w: %s', self.w)
            # bare raise preserves the original exception and traceback
            raise
        # get parameters from hash table
        p_rec = [self.hash_2_param[p] for p in phash_rec]
        # update the recommender's memory with the new algorithm-parameter
        # combos that it recommended
        self._update_trained_dataset_models_from_rec(dataset_id, ml_rec,
                                                     phash_rec)
        return ml_rec, p_rec, rec_score

    def _update_scores(self, new_scores, new_weights):
        """Update the running average in self.scores based on new_scores.

        Parameters
        ----------
        new_scores: pandas Series of mean metric values, indexed by
            'algorithm|parameter_hash'.
        new_weights: pandas Series of result counts with the same index.
        """
        new_ind = new_scores.index.values
        if len(self.scores.index) == 0:
            # first batch of results: adopt the new averages wholesale
            self.scores = new_scores
            self.w = new_weights
        else:
            for n in new_ind:
                if n in self.scores.index.values:
                    # incremental weighted-mean update:
                    # old + step * (new - old), step = n_new / (n_old + n_new)
                    step = new_weights[n] / float(self.w[n] + new_weights[n])
                    self.scores.loc[n] = (self.scores[n] +
                                          step * (new_scores[n] -
                                                  self.scores[n]))
                else:
                    self.scores.loc[n] = new_scores[n]
            # update weights (missing entries on either side count as 0)
            self.w = self.w.add(new_weights, fill_value=0)