# Source code for machine.learn.io_utils

"""~This file is part of the Aliro library~

Copyright (C) 2023 Epistasis Lab, 
Center for Artificial Intelligence Research and Education (CAIRE),
Department of Computational Biomedicine (CBM),
Cedars-Sinai Medical Center.

Aliro is maintained by:
    - Hyunjun Choi (hyunjun.choi@cshs.org)
    - Miguel Hernandez (miguel.e.hernandez@cshs.org)
    - Nick Matsumoto (nicholas.matsumoto@cshs.org)
    - Jay Moran (jay.moran@cshs.org)
    - and many other generous open source contributors

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.

(Autogenerated header, do not modify)

"""
import argparse
import requests
import json
import os
import time
import requests
import pandas as pd
from io import StringIO

# Aliro environment settings, read once at import time. Each one falls back
# to the literal default given here when the variable is not set.
LAB_HOST = os.getenv('LAB_HOST', 'lab')
LAB_PORT = os.getenv('LAB_PORT', '5080')
basedir = os.getenv('PROJECT_ROOT', '.')


class Experiment:
    """Arguments of one Aliro experiment plus its per-method scratch directory."""

    def __init__(self, args, basedir=basedir):
        """Experiment class for Aliro.

        Parameters
        ----------
        args: dict
            Arguments of a experiment from Aliro API. Must contain the key
            'method'; `get_input` additionally reads '_id' and `get_model`
            reads one key per parameter of the method's schema.
        basedir: string
            Base directory for this project

        Returns
        -------
        None
        """
        self.args = args
        self.method_name = self.args['method']
        self.basedir = basedir
        # temporary directory
        self.tmpdir = '{}/machine/learn/tmp/{}/'.format(
            self.basedir, self.method_name)
        # exist_ok avoids the check-then-create race between concurrently
        # starting experiments of the same method.
        os.makedirs(self.tmpdir, exist_ok=True)

    def get_input(self):
        """Get input data based on experiment ID (_id) from Aliro API.

        Returns
        -------
        input_data: pandas.Dataframe or list of two pandas.Dataframe
            pandas.DataFrame: Aliro will use train_test_split to
            make train/test splits list of two pandas.DataFrame:
            The 1st pandas.DataFrame is training dataset,
            while the 2nd one is testing dataset
        data_info: dict
            Dataset description returned by `get_input_data` alongside
            the data (both values are returned as a tuple).
        """
        return get_input_data(self.args['_id'], self.tmpdir)

    def get_model(self):
        """Build scikit learn method based on arguments from Aliro API.

        Returns
        -------
        model: scikit-learn Estimator
            a machine learning model with scikit-learn API
        method_type: string
            Value of the project's 'category' entry, e.g.
            'classification': classification model
            'regression': regression model
        encoding_strategy: object
            Value of the project's 'categorical_encoding_strategy' entry,
            passed through unchanged.

        Raises
        ------
        StopIteration
            If no project returned by `get_projects` is named like the
            experiment's method.
        KeyError
            If a schema parameter is missing from the experiment arguments.
        ImportError
            If the project's module cannot be imported or does not define
            the method.
        """
        # Function-scope import keeps this block self-contained.
        import importlib

        projects = get_projects()
        pdict = next(
            item for item in projects if item["name"] == self.method_name)
        params = pdict['schema']
        import_path = pdict['path']
        method_type = pdict['category']
        encoding_strategy = pdict['categorical_encoding_strategy']
        method_args = {k: self.args[k] for k in params}
        # update static parameters
        if 'static_parameters' in pdict:
            method_args.update(pdict['static_parameters'])
        print(f"method_args: {method_args}")
        # Resolve the estimator class without exec()/eval(): names bound by
        # exec() inside a function are not reliably visible to a later eval()
        # (they are not on Python 3.13+), and no text is run as code this way.
        module = importlib.import_module(import_path)
        try:
            method = getattr(module, self.method_name)
        except AttributeError as err:
            # Keep the exception type a failed `from X import Y` would raise.
            raise ImportError(
                "cannot import name '{}' from '{}'".format(
                    self.method_name, import_path)) from err
        model = method(**method_args)
        return model, method_type, encoding_strategy


def get_projects():
    """Fetch the description of every machine learning algorithm from the
    Aliro API (``/api/v1/projects`` on LAB_HOST:LAB_PORT).

    This information should be the same with projects.json.

    Returns
    -------
    projects: object
        The decoded JSON body of the response. The rest of this module
        iterates over it and reads keys such as 'name' and 'schema' from
        each item.
    """
    endpoint = 'http://{}:{}/api/v1/projects'.format(LAB_HOST, LAB_PORT)
    reply = requests.get(endpoint)
    return json.loads(reply.text)


def parse_args():
    """Parse arguments for machine learning algorithm.

    One argparse sub-command is registered per project returned by
    `get_projects`; each sub-command accepts ``--_id``, ``--grid_search``
    and one option per parameter in the project's schema.

    Returns
    -------
    args: dict
        Arguments of a experiment from Aliro API
    param_grid: dict
        Dictionary with parameters names (string) as keys
        and lists of parameter settings to try as values, built from the
        schema of the method selected on the command line. Empty when no
        method was selected.
    """
    projects = get_projects()
    parser = argparse.ArgumentParser(
        description='Driver for all machine learning algorithms in Aliro')
    subparsers = parser.add_subparsers(
        dest='method', help="ML Learning Algorithm")

    # One grid per algorithm, keyed by sub-command name, so that the grid
    # returned below belongs to the method that was actually requested
    # rather than to whichever project happened to be registered last.
    param_grids = {}
    for pdict in projects:
        method = pdict['name']
        params = pdict['schema']
        subparser = subparsers.add_parser(method)
        subparser.add_argument(
            '--_id',
            action='store',
            dest='_id',
            default=None,
            type=str,
            help="Experiment id in database")
        subparser.add_argument(
            '--grid_search',
            action='store',
            dest='grid_search',
            default=False,
            # type=bool would map every non-empty string, including
            # "False", to True; bool_type parses the text instead.
            type=bool_type,
            help=(
                'If grid_search is True, then '
                'the experiment will perform GridSearchCV'))
        method_grid = {}
        # parse args for each parameter
        for key, val in params.items():
            arg_type = get_type(val['type'])
            subparser.add_argument('--' + key, action='store', dest=key,
                                   default=val['default'], type=arg_type)
            # Candidate values, in order of preference:
            # 'grid_search', then 'values', then 'choices'.
            ui = val['ui']
            if "grid_search" in ui:
                values = ui["grid_search"]
            elif "values" in ui:
                values = ui["values"]
            else:
                values = ui["choices"]
            method_grid[key] = [arg_type(v) for v in values]
        param_grids[method] = method_grid

    args = vars(parser.parse_args())
    print('parsed args:', args)
    param_grid = param_grids.get(args.get('method'), {})
    return args, param_grid


def get_input_data(_id, tmpdir):
    """Download the dataset(s) attached to an experiment from the Aliro API.

    The experiment record is fetched first to find its dataset id, then the
    dataset record to find its files, and finally every file is downloaded
    and parsed with pandas.

    Parameters
    ----------
    _id: string
        Experiment ID in Aliro
    tmpdir: string
        Path of temporary directory; a sub-directory named after the
        experiment is created inside it if it does not exist yet

    Returns
    -------
    input_data: pandas.Dataframe or list of pandas.Dataframe
        A single pandas.DataFrame when the dataset has exactly one file
        (Aliro will use train_test_split to make train/test splits),
        otherwise a list with one pandas.DataFrame per file, in the order
        reported by the API (the 1st is the training dataset, the 2nd the
        testing dataset).
    data_info: dict
        * target_name: string, target column name
        * filename: list, filename(s)
        * categories: list, categorical feature name(s), or None
        * ordinals: dict, or None

           * keys: categorical feature name(s)
           * values: categorical values
        * prediction_type: string, "classification" unless a file
          declares otherwise

    Raises
    ------
    RuntimeError
        If the experiment has no dataset id, a file lacks 'dependent_col',
        or the files disagree on the target column.
    ValueError
        Propagated from `get_file_data` / `check_column`.
    """
    expdir = tmpdir + _id + '/'
    if not os.path.exists(expdir):
        os.makedirs(expdir)

    api_root = 'http://' + LAB_HOST + ':' + LAB_PORT + '/api/v1/'

    experiment = json.loads(
        requests.get(api_root + 'experiments/' + _id).text)
    _dataset_id = experiment['_dataset_id']
    if _dataset_id is None:
        raise RuntimeError(
            "Error when running experiment '{}': Unable to get _dataset_id "
            " from lab.  Response: {}".format(_id, str(experiment)))

    dataset = json.loads(
        requests.get(api_root + 'datasets/' + _dataset_id).text)
    files = dataset['files']
    filename = [entry['filename'] for entry in files]

    target_name = ''
    categories = None
    ordinals = None
    prediction_type = "classification"  # by default
    for entry in files:
        if 'dependent_col' not in entry:
            raise RuntimeError(
                "Target column is missing in {}.".format(
                    " or ".join(filename)))
        dependent_col = entry['dependent_col']
        # every file of one experiment has to name the same target column
        if target_name and target_name != dependent_col:
            raise RuntimeError(
                "Files in one experiment should has the same target column "
                "name. Related files: {}.".format(','.join(filename)))
        target_name = dependent_col
        # optional per-file metadata: the last file declaring a key wins
        categories = entry.get('categorical_features', categories)
        ordinals = entry.get('ordinal_features', ordinals)
        prediction_type = entry.get('prediction_type', prediction_type)

    if len(files) == 1:  # only 1 file
        input_data = _load_dataset_frame(files[0]['_id'], target_name)
    else:  # two files for cross-validation
        # need api support !!
        # the 1st one is training dataset and 2nd one is testing dataset
        input_data = [
            _load_dataset_frame(entry['_id'], target_name)
            for entry in files
        ]

    return input_data, {
        'target_name': target_name,
        'filename': filename,
        'categories': categories,
        'ordinals': ordinals,
        'prediction_type': prediction_type,
    }


def _load_dataset_frame(file_id, target_name):
    """Download one dataset file, parse it (the separator is sniffed by
    pandas) and verify that it contains the target column."""
    frame = pd.read_csv(
        StringIO(get_file_data(file_id)), sep=None, engine='python')
    check_column(target_name, frame)
    return frame


def get_file_data(file_id):
    """Retrieve the content of a dataset file from the Aliro API.

    Parameters
    ----------
    file_id: string
        File ID from the Aliro database

    Returns
    -------
    string
        Body of the response, i.e. the dataset text which will be read
        by pandas and converted to pd.DataFrame

    Raises
    ------
    ValueError
        If the API answers with a status code other than 200 (OK).
    """
    uri = 'http://' + LAB_HOST + ':' + LAB_PORT + '/api/v1/files/' + file_id
    res = requests.get(uri)

    if res.status_code != requests.codes.ok:
        # The status code lives on the response object; the bare name
        # `status_code` was undefined and masked this error with a NameError.
        msg = ('Unable to retrieve file {file_id}.  '
               'Status code: {status_code}. '
               'Response text: {res_text}'.format(file_id=file_id,
                                                  status_code=res.status_code,
                                                  res_text=res.text))
        raise ValueError(msg)

    return res.text


def check_column(column_name, dataframe):
    """Make sure a column is present in a pandas DataFrame.

    Parameters
    ----------
    column_name: string
        column name
    dataframe: pandas.DataFrame
        input dataset DataFrame

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``column_name`` is not among the DataFrame's column labels.
    """
    available = dataframe.columns.values
    if column_name in available:
        return
    raise ValueError(
        'The provided data file does not seem to have '
        'target column {}.'.format(column_name)
    )


def bool_type(val):
    """Convert the text 'true' / 'false' (any letter case) to a boolean.

    Parameters
    ----------
    val: string
        Value of a parameter in string type

    Returns
    -------
    _: boolean
        Converted value in boolean type

    Raises
    ------
    argparse.ArgumentTypeError
        If the text is neither 'true' nor 'false'.
    """
    spellings = {'true': True, 'false': False}
    key = val.lower()
    if key in spellings:
        return spellings[key]
    raise argparse.ArgumentTypeError(val + ' is not a valid boolean value')


def none(val):
    """Convert the text 'none' / 'null' (any letter case) to None.

    Parameters
    ----------
    val: string
        Value of a parameter in string type

    Returns
    -------
    _: None
        Returned when the input value is "none" or "null".

    Raises
    ------
    argparse.ArgumentTypeError
        For every other input text.
    """
    # A membership test is required here: `x == 'none' or 'null'` is always
    # truthy because the non-empty literal 'null' is evaluated on its own.
    if val.lower() in ('none', 'null'):
        return None
    raise argparse.ArgumentTypeError(val + ' is not a valid str value')


def get_type(param_type):
    """Return conversion function for input type.

    Parameters
    ----------
    param_type: string or list
        string, type of a parameter which is defined in projects.json
        list, list of parameter types
        (for parameter supporting multiple input types)

    Returns
    -------
    known_types[type]: function
        Function for converting argument from Aliro UI
        for assigning to scikit-learn estimator. For a list of types the
        returned function tries the types in order: a none/bool/numeric
        match is returned immediately, while a match of any other type is
        kept only as a fallback that a later type may still replace. It
        raises argparse.ArgumentTypeError when nothing matches (or the
        result is the empty string).

    Raises
    ------
    KeyError
        If ``param_type`` is a single type name that is not known.
    """
    known_types = {
        'int': int,  # change this later
        'float': float,
        'string': str,
        'bool': bool_type,
        'none': none
    }
    if not isinstance(param_type, list):
        return known_types[param_type]

    # Errors that only mean "val is not of this type, try the next one".
    # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
    conversion_errors = (
        ValueError, TypeError, AttributeError, KeyError, OverflowError,
        argparse.ArgumentTypeError)

    def convert_func(val):
        conv_val = ''  # '' marks "no type accepted the value"
        for t in param_type:
            try:
                if isinstance(val, str):
                    lowered = val.lower()
                    if lowered == 'none' and t == "none":
                        conv_val = None
                        break
                    elif lowered in ["true", "false"] and t == "bool":
                        conv_val = bool_type(val)
                        break
                    elif t == "none" and lowered not in ("none", "null"):
                        # Only the literal texts none/null may become None;
                        # otherwise a value accepted by an earlier type
                        # (e.g. 'string') would be replaced by None.
                        continue
                conv_val = known_types[t](val)
                # for mixed type in tree-based model
                # NOTE(review): every number >= 1 is truncated to int and
                # every number < 1 becomes float, whichever types are
                # listed -- confirm this is intended for float-only or
                # int-only parameters.
                if isinstance(conv_val, (int, float)):
                    if conv_val < 1:
                        conv_val = float(conv_val)
                    else:
                        conv_val = int(conv_val)
                    break
            except conversion_errors:
                pass
        if conv_val == '':
            # format() instead of `val + ...` so that a non-string value
            # is reported properly rather than raising TypeError here.
            raise argparse.ArgumentTypeError(
                '{} is not a valid value'.format(val))
        return conv_val
    return convert_func