Source code for ai.recommender.knn_meta_recommender
"""~This file is part of the Aliro library~
Copyright (C) 2023 Epistasis Lab,
Center for Artificial Intelligence Research and Education (CAIRE),
Department of Computational Biomedicine (CBM),
Cedars-Sinai Medical Center.
Aliro is maintained by:
- Hyunjun Choi (hyunjun.choi@cshs.org)
- Miguel Hernandez (miguel.e.hernandez@cshs.org)
- Nick Matsumoto (nicholas.matsumoto@cshs.org)
- Jay Moran (jay.moran@cshs.org)
- and many other generous open source contributors
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
(Autogenerated header, do not modify)
"""
# Recommender system for Aliro.
import pandas as pd
# import json
# import urllib.request, urllib.parse
from .base import BaseRecommender
#from ..metalearning import get_metafeatures
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
import numpy as np
from collections import defaultdict, OrderedDict
from sklearn.neighbors import NearestNeighbors
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
formatter = logging.Formatter('%(module)s: %(levelname)s: %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
class KNNMetaRecommender(BaseRecommender):
"""Aliro KNN meta recommender.
Recommends machine learning algorithms and parameters as follows:
- store the best ML + P on every dataset.
- given a new dataset, measure its distance to all results in metafeature space.
- recommend ML + P with best performance on closest dataset.
Parameters
----------
ml_type: str, 'classifier' or 'regressor'
Recommending classifiers or regressors. Used to determine ML options.
metric: str (default: accuracy for classifiers, mse for regressors)
The metric by which to assess performance on the datasets.
ml_p: DataFrame (default: None)
Contains all valid ML parameter combos, with columns 'algorithm' and
'parameters'
"""
def __init__(self,
ml_type='classifier',
metric=None,
ml_p=None,
random_state=None,
knowledgebase_results=None,
knowledgebase_metafeatures=None,
load_serialized_rec="if_exists",
serialized_rec_directory=None,
serialized_rec_filename=None):
""" set default recommender specific parameters; might be overwritten by loading serialized recommender"""
# lookup table: dataset name to best ML+P
self.best_mlp = pd.DataFrame(columns=['_id','algorithm',
'parameters', 'score'])
self.best_mlp.set_index('_id',inplace=True)
# local dataframe of datasets and their metafeatures
self.all_dataset_mf = pd.DataFrame()
"""Initialize recommendation system."""
super().__init__(
ml_type,
metric,
ml_p,
random_state=random_state,
knowledgebase_results=knowledgebase_results,
knowledgebase_metafeatures=knowledgebase_metafeatures,
load_serialized_rec=load_serialized_rec,
serialized_rec_directory=serialized_rec_directory,
serialized_rec_filename=serialized_rec_filename)
    def update(self, results_data, results_mf, source='pennai'):
"""Update ML / Parameter recommendations.
Parameters
----------
results_data: DataFrame
columns corresponding to:
'algorithm'
'parameters'
self.metric
results_mf: DataFrame
columns corresponding to metafeatures of each dataset in
results_data.
"""
# update trained dataset models and hash table
super().update(results_data, results_mf, source)
        # drop internal columns (those starting with '_', except '_id') and
        # save a copy of the metafeatures with NaNs filled with zero
        drop_cols = [c for c in results_mf.columns
                     if c[0] == '_' and c != '_id']
        self.all_dataset_mf = \
            results_mf.drop(columns=drop_cols).fillna(0.0).set_index('_id')
# update internal model
self.update_model(results_data)
    def update_model(self, results_data):
"""Stores best ML-P on each dataset."""
        logger.debug('len(self.hash_2_param): ' + str(len(self.hash_2_param)))
for d,dfg in results_data.groupby('_id'):
if (len(self.best_mlp) == 0 or
d not in self.best_mlp.index or
dfg[self.metric].max() > self.best_mlp.loc[d,'score']):
self.best_mlp.loc[d,'score'] = dfg[self.metric].max()
dfg = dfg.reset_index()
idx = dfg[self.metric].idxmax()
# print('dfg:\n',dfg)
logger.debug('new best for '+d+': '+
dfg.loc[idx,'algorithm']+', idx:'+str(idx))
self.best_mlp.loc[d,'algorithm'] = dfg.loc[idx,'algorithm']
self.best_mlp.loc[d,'parameters'] = dfg.loc[idx,
'parameter_hash']
else:
                logger.debug('skipping ' + d)
# print('model updated')
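    # Worked example (hypothetical): if dataset 'd1' has two results with
    # accuracy 0.80 and 0.90, best_mlp keeps only the 0.90 row; a later
    # update reporting 0.85 for 'd1' would leave best_mlp unchanged.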
    def recommend(self, dataset_id, n_recs=1, dataset_mf=None):
"""Return a model and parameter values expected to do best on dataset.
Parameters
----------
dataset_id: string
ID of the dataset for which the recommender is generating
recommendations.
n_recs: int (default: 1), optional
Return a list of length n_recs in order of estimators and parameters
expected to do best.
"""
        if dataset_mf is None:
            raise ValueError(f"dataset_mf is None for {dataset_id}; "
                             "can't recommend")
# dataset hash table
super().recommend(dataset_id, n_recs, dataset_mf)
logger.debug('dataset_mf columns:{}'.format(dataset_mf.columns))
drop_cols = [c for c in dataset_mf.columns
if c[0] == '_' and c !='_id']
dataset_mf = dataset_mf.drop(columns=drop_cols)
logger.debug('dataset_mf columns:{}'.format(dataset_mf.columns))
try:
ml_rec, phash_rec, rec_score = self.best_model_prediction(
dataset_id, dataset_mf)
if len(ml_rec) < n_recs:
logger.info(f'len(ml_rec)={len(ml_rec)}, recommending random')
iters = 0
while len(ml_rec) < n_recs and iters < 1000:
# add random ml_p recommendations until n_recs is met
new_ml_rec = np.random.choice(self.ml_p['algorithm'].unique())
new_phash_rec = self._hash_simple_dict(np.random.choice(
self.ml_p.loc[self.ml_p['algorithm']==new_ml_rec]
['parameters'].values))
if (dataset_id + '|' + new_ml_rec + '|' + new_phash_rec
not in self.trained_dataset_models):
ml_rec.append(new_ml_rec)
phash_rec.append(new_phash_rec)
rec_score.append(np.nan)
iters = iters+1
                if iters == 1000:
                    logger.info(f"couldn't find {n_recs} unique "
                                f'recommendations! returning {len(ml_rec)}')
                    num_results = len(
                        [tdm for tdm in self.trained_dataset_models
                         if dataset_id in tdm])
                    logger.info(f'there are already {num_results} results '
                                f'for {dataset_id}')
ml_rec, p_rec, rec_score = (ml_rec[:n_recs],
[self.hash_2_param[p] for p in phash_rec[:n_recs]],
rec_score[:n_recs])
            assert len(ml_rec) == n_recs
except Exception as e:
            logger.error('error running self.best_model_prediction for '
                         + dataset_id)
raise e
# logger.error('ml_rec:'+ ml_rec)
# logger.error('p_rec'+ p_rec)
# logger.error('rec_score'+rec_score)
# update the recommender's memory with the new algorithm-parameter
# combos that it recommended
self._update_trained_dataset_models_from_rec(
dataset_id,
ml_rec,
phash_rec
)
return ml_rec, p_rec, rec_score
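    # Example return value of recommend() (hypothetical values); rec_score
    # holds the metafeature-space distance of the source dataset, or NaN for
    # randomly backfilled recommendations:
    #   (['DecisionTreeClassifier'], [{'criterion': 'gini'}], [0.12])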
    def best_model_prediction(self, dataset_id, df_mf, n_recs=1):
"""Predict scores over many variations of ML+P and pick the best"""
# get dataset metafeatures
        for col in ['dataset', '_id']:
            if col in df_mf.columns:
                df_mf = df_mf.drop(col, axis=1)
mf = df_mf.fillna(0.0).values.flatten()
# compute the neighbors of past results
nbrs = NearestNeighbors(n_neighbors=len(self.all_dataset_mf),
algorithm='ball_tree')
rs = RobustScaler()
X = rs.fit_transform(self.all_dataset_mf.values)
nbrs.fit(X)
# find n_recs nearest neighbors to new dataset
# print('querying neighbors with mf of shape',mf.shape)
distances,indices = nbrs.kneighbors(rs.transform(mf.reshape(1,-1)))
# print('distances:',distances)
# print('indices:',indices)
dataset_idx = [self.all_dataset_mf.index[i] for i in indices[0]]
# recommend the mlp results closest to the dataset in metafeature space
ml_recs, p_recs, scores = [],[],[]
# print('self.best_mlp:',self.best_mlp)
for i,(d,dist) in enumerate(zip(dataset_idx,distances[0])):
if d not in self.best_mlp.index:
continue
if i < 10:
                logger.debug('closest dataset: ' + d + '; distance: '
                             + str(dist))
# don't recommend based on the same dataset
if round(dist,6) > 0.0:
alg_params = (self.best_mlp.loc[d,'algorithm'] + '|' +
self.best_mlp.loc[d,'parameters'])
# only recommend if not already recommended
if (dataset_id+'|'+alg_params
not in self.trained_dataset_models):
ml_recs.append(self.best_mlp.loc[d,'algorithm'])
p_recs.append(self.best_mlp.loc[d,'parameters'])
scores.append(dist)
return ml_recs,p_recs,scores
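
# The block below is not part of the recommender; it is a minimal,
# self-contained sketch (toy metafeature values, hypothetical dataset names)
# of the nearest-neighbor lookup that best_model_prediction() performs:
# scale the known metafeatures robustly, fit NearestNeighbors, and rank past
# datasets by distance to the new one.
if __name__ == '__main__':
    # three "known" datasets described by two metafeatures each
    known_mf = pd.DataFrame([[150, 4], [1000, 20], [120, 5]],
                            index=['iris', 'adult', 'wine'],
                            columns=['n_rows', 'n_features'])
    rs = RobustScaler()
    X = rs.fit_transform(known_mf.values)
    nbrs = NearestNeighbors(n_neighbors=len(known_mf),
                            algorithm='ball_tree').fit(X)
    # metafeatures of a new dataset, scaled with the same scaler
    distances, indices = nbrs.kneighbors(rs.transform(np.array([[130, 4]])))
    # datasets ordered closest to farthest; the recommender would return the
    # best-known algorithm+parameters of the closest ones
    print([known_mf.index[i] for i in indices[0]], distances[0])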