"""~This file is part of the Aliro library~
Copyright (C) 2023 Epistasis Lab,
Center for Artificial Intelligence Research and Education (CAIRE),
Department of Computational Biomedicine (CBM),
Cedars-Sinai Medical Center.
Aliro is maintained by:
- Hyunjun Choi (hyunjun.choi@cshs.org)
- Miguel Hernandez (miguel.e.hernandez@cshs.org)
- Nick Matsumoto (nicholas.matsumoto@cshs.org)
- Jay Moran (jay.moran@cshs.org)
- and many other generous open source contributors
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
(Autogenerated header, do not modify)
"""
import argparse
import requests
import json
import os
import time
import requests
import pandas as pd
from io import StringIO
# Aliro environment information, read once at import time.
# Each setting falls back to its default when the variable is not set.
LAB_HOST = os.getenv('LAB_HOST', 'lab')
LAB_PORT = os.getenv('LAB_PORT', '5080')
basedir = os.getenv('PROJECT_ROOT', '.')
class Experiment:
    """Experiment class for Aliro.

    Stores the arguments of one experiment, prepares its temporary
    directory and builds the estimator named by the arguments.
    """

    def __init__(self, args, basedir=basedir):
        """Store the experiment arguments and create the temporary directory.

        Parameters
        ----------
        args: dict
            Arguments of an experiment from Aliro API.
            Must contain the key 'method'.
        basedir: string
            Base directory for this project

        Returns
        -------
        None
        """
        self.args = args
        self.method_name = self.args['method']
        self.basedir = basedir
        # temporary directory
        self.tmpdir = '{}/machine/learn/tmp/{}/'.format(
            self.basedir, self.method_name)
        # exist_ok=True replaces the former isdir() check, which could fail
        # if the directory appeared between the check and the creation.
        os.makedirs(self.tmpdir, exist_ok=True)

    def get_model(self):
        """Build scikit learn method based on arguments from Aliro API.

        Returns
        -------
        model: scikit-learn Estimator
            a machine learning model with scikit-learn API
        method_type: string
            'classification': classification model
            'regression': regression model
        encoding_strategy: object
            Value of 'categorical_encoding_strategy' in the project entry

        Raises
        ------
        ValueError
            If no project entry is named like the requested method.
        ImportError
            If the method cannot be imported from the configured path.
        """
        # Local import keeps this block self-contained.
        import importlib

        projects = get_projects()
        pdict = next(
            (item for item in projects if item["name"] == self.method_name),
            None)
        if pdict is None:
            # Previously a bare StopIteration escaped from next().
            raise ValueError(
                'Unknown machine learning method: {}'.format(
                    self.method_name))

        params = pdict['schema']
        import_path = pdict['path']
        method_type = pdict['category']
        encoding_strategy = pdict['categorical_encoding_strategy']

        method_args = {k: self.args[k] for k in params}
        # update static parameters
        if 'static_parameters' in pdict:
            method_args.update(pdict['static_parameters'])
        print(f"method_args: {method_args}")

        # importlib replaces exec()/eval(): names bound by exec() inside a
        # function are not visible to a later eval() on Python 3.13+, and
        # no code string is built from API-provided values any more.
        module = importlib.import_module(import_path)
        try:
            method = getattr(module, self.method_name)
        except AttributeError as err:
            # Same exception type a failing "from X import Y" would raise.
            raise ImportError(
                'cannot import name {!r} from {!r}'.format(
                    self.method_name, import_path)) from err

        model = method(**method_args)
        return model, method_type, encoding_strategy
def get_projects():
    """Fetch the machine learning algorithm descriptions from the Aliro API.

    This information should be the same as projects.json.

    Returns
    -------
    projects: object
        The decoded JSON body of the '/api/v1/projects' response.
        Other functions in this module iterate over it as a collection
        of per-algorithm dicts.
    """
    endpoint = 'http://{}:{}/api/v1/projects'.format(LAB_HOST, LAB_PORT)
    response = requests.get(endpoint)
    return json.loads(response.text)
def parse_args():
    """Parse arguments for machine learning algorithm.

    One sub-command is registered per algorithm returned by
    get_projects(); each sub-command accepts '--_id', '--grid_search'
    and one option per entry of the algorithm's schema.

    Returns
    -------
    args: dict
        Arguments of an experiment from Aliro API
    param_grid: dict
        Dictionary with parameter names (string) as keys
        and lists of parameter settings to try as values,
        for the algorithm selected on the command line.
        Empty when no algorithm was selected.
    """
    projects = get_projects()
    parser = argparse.ArgumentParser(
        description='Driver for all machine learning algorithms in Aliro')
    subparsers = parser.add_subparsers(
        dest='method', help="ML Learning Algorithm")

    # One grid per algorithm. A single shared dict ended up holding the grid
    # of whichever algorithm was listed last, not the one actually selected.
    param_grids = {}
    for pdict in projects:
        method = pdict['name']
        params = pdict['schema']
        subparser = subparsers.add_parser(method)
        subparser.add_argument(
            '--_id',
            action='store',
            dest='_id',
            default=None,
            type=str,
            help="Experiment id in database")
        subparser.add_argument(
            '--grid_search',
            action='store',
            dest='grid_search',
            default=False,
            # bool() is True for every non-empty string, including "False";
            # bool_type parses the text instead.
            type=bool_type,
            help=(
                'If grid_search is True, then '
                'the experiment will perform GridSearchCV'))

        method_grid = {}
        # parse args for each parameter
        for key, val in params.items():
            arg_type = get_type(val['type'])
            subparser.add_argument('--' + key, action='store', dest=key,
                                   default=val['default'], type=arg_type)
            # Candidate values: 'grid_search' first, then 'values',
            # then 'choices'.
            ui = val['ui']
            if "grid_search" in ui:
                values = ui["grid_search"]
            elif "values" in ui:
                values = ui["values"]
            else:
                values = ui["choices"]
            method_grid[key] = [arg_type(v) for v in values]
        param_grids[method] = method_grid

    args = vars(parser.parse_args())
    print('parsed args:', args)
    # args['method'] is None when no sub-command was given.
    param_grid = param_grids.get(args['method'], {})
    return args, param_grid
def get_file_data(file_id):
    """Attempt to retrieve a dataset file from the Aliro API.

    Parameters
    ----------
    file_id: string
        File ID from the Aliro database

    Returns
    -------
    string
        Dataset strings which will be read by pandas
        and converted to pd.DataFrame

    Raises
    ------
    ValueError
        If the response status code is not OK.
    """
    uri = 'http://' + LAB_HOST + ':' + LAB_PORT + '/api/v1/files/' + file_id
    res = requests.get(uri)
    if res.status_code != requests.codes.ok:
        # status_code must be read from the response; the bare name was
        # undefined and raised NameError instead of this ValueError.
        msg = ('Unable to retrieve file {file_id}. '
               'Status code: {status_code}. '
               'Response text: {res_text}'.format(
                   file_id=file_id,
                   status_code=res.status_code,
                   res_text=res.text))
        raise ValueError(msg)
    return res.text
def check_column(column_name, dataframe):
    """Verify that a column is present in a pandas DataFrame.

    Parameters
    ----------
    column_name: string
        column name
    dataframe: pandas.DataFrame
        input dataset DataFrame

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If column_name is not among the DataFrame's columns.
    """
    available = dataframe.columns.values
    if column_name in available:
        return
    message = (
        'The provided data file does '
        'not seem to have target column {}.'
    ).format(column_name)
    raise ValueError(message)
def bool_type(val):
    """Convert argument to boolean type.

    Parameters
    ----------
    val: string or boolean
        Value of a parameter, normally the text 'true' or 'false'
        in any letter case. A value that already is a boolean is
        returned unchanged.

    Returns
    -------
    _: boolean
        Converted value in boolean type

    Raises
    ------
    argparse.ArgumentTypeError
        If val is neither a boolean nor the text 'true'/'false'.
    """
    if isinstance(val, bool):
        # Already a boolean (e.g. decoded from JSON rather than typed on
        # the command line); calling .lower() on it would fail.
        return val
    if isinstance(val, str):
        lowered = val.lower()
        if lowered == 'true':
            return True
        if lowered == 'false':
            return False
    # The f-string keeps the message intact for non-string values too.
    raise argparse.ArgumentTypeError(f'{val} is not a valid boolean value')
def none(val):
    """Convert a none argument to None.

    Parameters
    ----------
    val: string
        Value of a parameter in string type

    Returns
    -------
    _: None
        Returned when the input value is "none" or "null" (in any
        letter case) or already is None.

    Raises
    ------
    argparse.ArgumentTypeError
        If the input value is anything else.
    """
    if val is None:
        return None
    # A membership test is required here: `x == 'none' or 'null'` is always
    # truthy, so every value used to be converted to None.
    if isinstance(val, str) and val.lower() in ('none', 'null'):
        return None
    raise argparse.ArgumentTypeError(f'{val} is not a valid str value')
def get_type(param_type):
    """Return the conversion function for an input type.

    Parameters
    ----------
    param_type: string or list
        string, type of a parameter which is defined in projects.json
        list, list of parameter types
        (for parameters supporting multiple input types)

    Returns
    -------
    converter: function
        Function for converting an argument from the Aliro UI
        before assigning it to a scikit-learn estimator. For a list,
        the candidate types are tried in order and the first one that
        accepts the value wins; the converter raises
        argparse.ArgumentTypeError when none of them does.

    Raises
    ------
    KeyError
        If param_type is a string that is not a known type name.
    """
    known_types = {
        'int': int,  # change this later
        'float': float,
        'string': str,
        'bool': bool_type,
        'none': none
    }
    if not isinstance(param_type, list):
        return known_types[param_type]

    # Failures that only mean "this candidate type does not fit the value"
    # (an unknown type name included). Anything else, such as
    # KeyboardInterrupt or SystemExit, must propagate.
    conversion_errors = (
        KeyError, TypeError, ValueError, AttributeError, OverflowError,
        argparse.ArgumentTypeError)
    # Unique marker, so that legitimate results such as '' or None are not
    # mistaken for "no conversion succeeded".
    unconverted = object()

    def convert_func(val):
        """Convert val with the first type of param_type that accepts it."""
        result = unconverted
        lowered = val.lower() if isinstance(val, str) else None
        for t in param_type:
            if t == "none":
                # Decided here so that a value which is not a none literal
                # falls through to the remaining candidate types.
                if val is None or lowered in ('none', 'null'):
                    result = None
                    break
                continue
            try:
                candidate = known_types[t](val)
                # for mixed type in tree-based model: numbers below 1 are
                # kept as float, all others become int. bool is a subclass
                # of int and must be left alone.
                if (isinstance(candidate, (int, float))
                        and not isinstance(candidate, bool)):
                    if candidate < 1:
                        candidate = float(candidate)
                    else:
                        candidate = int(candidate)
            except conversion_errors:
                continue
            # Assigned only after a complete conversion, so a half-converted
            # value from a failed attempt can never be returned.
            result = candidate
            break
        if result is unconverted:
            raise argparse.ArgumentTypeError(f'{val} is not a valid value')
        return result

    return convert_func