ecnet.tasks

feature_selection

Feature selection functions

select_rfr(dataset, total_importance=0.95, **kwargs)

select_rfr: reduces input data dimensionality such that the specified proportion of total feature importance (derived from random forest regression) is retained in the feature subset

Parameters:

dataset (QSPRDataset): input data (required)
total_importance (float): total feature importance to retain (default: 0.95)
**kwargs: additional arguments passed to sklearn.ensemble.RandomForestRegressor (default: {})

Returns:

Tuple[List[int], List[float]]: (selected feature indices, selected feature importances)

Source code in ecnet/tasks/feature_selection.py
def select_rfr(dataset: QSPRDataset, total_importance: float = 0.95,
               **kwargs) -> Tuple[List[int], List[float]]:
    """
    select_rfr: reduces input data dimensionality such that specified proportion of total feature
    importance (derived from random forest regression) is retained in feature subset

    Args:
        dataset (QSPRDataset): input data
        total_importance (float): total feature importance to retain
        **kwargs: additional arguments passed to sklearn.ensemble.RandomForestRegressor

    Returns:
        tuple[list[int], list[float]]: (selected feature indices, selected feature importances)
    """

    X = dataset.desc_vals
    y = [dv[0] for dv in dataset.target_vals]
    regr = RandomForestRegressor(**kwargs)
    regr.fit(X, y)
    importances = sorted(
        [(regr.feature_importances_[i], i)
         for i in range(len(dataset.desc_vals[0]))],
        key=lambda x: x[0], reverse=True
    )
    # accumulate sorted importances until the requested proportion is reached
    tot_imp = 0.0
    for idx, i in enumerate(importances):
        tot_imp += i[0]
        idx_cutoff = idx
        if tot_imp >= total_importance:
            break
    # keep every feature up to and including the one that crossed the threshold
    desc_imp = [i[0] for i in importances][:idx_cutoff + 1]
    desc_idx = [i[1] for i in importances][:idx_cutoff + 1]
    return (desc_idx, desc_imp)
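
Example usage (a minimal sketch, not part of the library source): it assumes a QSPRDataset named dataset has already been constructed elsewhere (construction not shown here); n_estimators is forwarded through **kwargs to the underlying RandomForestRegressor.

from ecnet.tasks.feature_selection import select_rfr

# `dataset` is an existing QSPRDataset; its construction is out of scope here
indices, importances = select_rfr(dataset, total_importance=0.95, n_estimators=100)
print('retained {} descriptors'.format(len(indices)))

# keep only the selected descriptor columns
reduced_desc_vals = [[row[i] for i in indices] for row in dataset.desc_vals]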

parameter_tuning

tune_batch_size(n_bees, n_iter, dataset_train, dataset_eval, n_processes=1, **kwargs)

Tunes the batch size during training; additional **kwargs can include any of:

ECNet parameters: 'epochs' (default 100), 'valid_size' (default 0.2), 'patience' (default 32), 'lr_decay' (default 0.0), 'hidden_dim' (default 128), 'n_hidden' (default 2), 'dropout' (default 0.0)

Adam optimizer arguments: 'lr' (default 0.001), 'beta_1' (default 0.9), 'beta_2' (default 0.999), 'eps' (default 1e-8), 'weight_decay' (default 0.0), 'amsgrad' (default False)

Parameters:

n_bees (int): number of employer bees to use in ABC algorithm (required)
n_iter (int): number of iterations, or "search cycles", for ABC algorithm (required)
dataset_train (QSPRDataset): dataset used to train evaluation models (required)
dataset_eval (QSPRDataset): dataset used for evaluation (required)
n_processes (int): if > 1, uses multiprocessing when evaluating at an iteration (default: 1)
**kwargs: additional arguments (default: {})

Returns:

dict: {'batch_size': int}

Source code in ecnet/tasks/parameter_tuning.py
def tune_batch_size(n_bees: int, n_iter: int, dataset_train: QSPRDataset,
                    dataset_eval: QSPRDataset, n_processes: int = 1,
                    **kwargs) -> dict:
    """
    Tunes the batch size during training; additional **kwargs can include any in:
        [
            # ECNet parameters
            'epochs' (default 100),
            'valid_size' (default 0.2),
            'patience' (default 32),
            'lr_decay' (default 0.0),
            'hidden_dim' (default 128),
            'n_hidden' (default 2),
            'dropout' (default 0.0),
            # Adam optim. alg. arguments
            'lr' (default 0.001),
            'beta_1' (default 0.9),
            'beta_2' (default 0.999),
            'eps' (default 1e-8),
            'weight_decay' (default 0.0),
            'amsgrad' (default False)
        ]

    Args:
        n_bees (int): number of employer bees to use in ABC algorithm
        n_iter (int): number of iterations, or "search cycles", for ABC algorithm
        dataset_train (QSPRDataset): dataset used to train evaluation models
        dataset_eval (QSPRDataset): dataset used for evaluation
        n_processes (int, optional): if > 1, uses multiprocessing when evaluating at an iteration
        **kwargs: additional arguments 

    Returns:
        dict: {'batch_size': int}
    """

    kwargs['train_ds'] = dataset_train
    kwargs['eval_ds'] = dataset_eval
    abc = ABC(n_bees, _cost_batch_size, num_processes=n_processes, obj_fn_args=kwargs)
    abc.add_param(1, len(kwargs.get('train_ds').desc_vals), name='batch_size')
    abc.initialize()
    for _ in range(n_iter):
        abc.search()
    return {'batch_size': abc.best_params['batch_size']}
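
Example usage (a minimal sketch, not from the library source): ds_train and ds_eval are hypothetical, pre-built QSPRDataset objects; epochs is one of the optional **kwargs listed above.

from ecnet.tasks.parameter_tuning import tune_batch_size

# `ds_train` and `ds_eval` are pre-built QSPRDataset objects (construction not shown)
result = tune_batch_size(n_bees=10, n_iter=5, dataset_train=ds_train,
                         dataset_eval=ds_eval, n_processes=4, epochs=50)
batch_size = result['batch_size']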

tune_model_architecture(n_bees, n_iter, dataset_train, dataset_eval, n_processes=1, **kwargs)

Tunes model architecture parameters (number of hidden layers, neurons per hidden layer, neuron dropout); additional **kwargs can include any of:

ECNet parameters: 'epochs' (default 100), 'batch_size' (default 32), 'valid_size' (default 0.2), 'patience' (default 32), 'lr_decay' (default 0.0)

Adam optimizer arguments: 'lr' (default 0.001), 'beta_1' (default 0.9), 'beta_2' (default 0.999), 'eps' (default 1e-8), 'weight_decay' (default 0.0), 'amsgrad' (default False)

Parameters:

n_bees (int): number of employer bees to use in ABC algorithm (required)
n_iter (int): number of iterations, or "search cycles", for ABC algorithm (required)
dataset_train (QSPRDataset): dataset used to train evaluation models (required)
dataset_eval (QSPRDataset): dataset used for evaluation (required)
n_processes (int): if > 1, uses multiprocessing when evaluating at an iteration (default: 1)
**kwargs: additional arguments (default: {})

Returns:

dict: {'hidden_dim': int, 'n_hidden': int, 'dropout': float}

Source code in ecnet/tasks/parameter_tuning.py
def tune_model_architecture(n_bees: int, n_iter: int, dataset_train: QSPRDataset,
                            dataset_eval: QSPRDataset, n_processes: int = 1,
                            **kwargs) -> dict:
    """
    Tunes model architecture parameters (number of hidden layers, neurons per hidden layer, neuron
    dropout); additional **kwargs can include any in:
        [
            # ECNet parameters
            'epochs' (default 100),
            'batch_size' (default 32),
            'valid_size' (default 0.2),
            'patience' (default 32),
            'lr_decay' (default 0.0),
            # Adam optim. alg. arguments
            'lr' (default 0.001),
            'beta_1' (default 0.9),
            'beta_2' (default 0.999),
            'eps' (default 1e-8),
            'weight_decay' (default 0.0),
            'amsgrad' (default False)
        ]

    Args:
        n_bees (int): number of employer bees to use in ABC algorithm
        n_iter (int): number of iterations, or "search cycles", for ABC algorithm
        dataset_train (QSPRDataset): dataset used to train evaluation models
        dataset_eval (QSPRDataset): dataset used for evaluation
        n_processes (int, optional): if > 1, uses multiprocessing when evaluating at an iteration
        **kwargs: additional arguments 

    Returns:
        dict: {'hidden_dim': int, 'n_hidden': int, 'dropout': float}
    """

    kwargs['train_ds'] = dataset_train
    kwargs['eval_ds'] = dataset_eval
    abc = ABC(n_bees, _cost_arch, num_processes=n_processes, obj_fn_args=kwargs)
    abc.add_param(CONFIG['architecture_params_range']['hidden_dim'][0],
                  CONFIG['architecture_params_range']['hidden_dim'][1], name='hidden_dim')
    abc.add_param(CONFIG['architecture_params_range']['n_hidden'][0],
                  CONFIG['architecture_params_range']['n_hidden'][1], name='n_hidden')
    abc.add_param(CONFIG['architecture_params_range']['dropout'][0],
                  CONFIG['architecture_params_range']['dropout'][1], name='dropout')
    abc.initialize()
    for _ in range(n_iter):
        abc.search()
    return {
        'hidden_dim': abc.best_params['hidden_dim'],
        'n_hidden': abc.best_params['n_hidden'],
        'dropout': abc.best_params['dropout']
    }
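
Example usage (a minimal sketch, continuing the hypothetical ds_train, ds_eval, and batch_size from the previous examples): batch_size and epochs are forwarded through **kwargs.

from ecnet.tasks.parameter_tuning import tune_model_architecture

# reuse the batch size tuned above; `ds_train` and `ds_eval` as before
arch = tune_model_architecture(n_bees=10, n_iter=5, dataset_train=ds_train,
                               dataset_eval=ds_eval, n_processes=4,
                               batch_size=batch_size, epochs=50)
# arch -> {'hidden_dim': ..., 'n_hidden': ..., 'dropout': ...}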

tune_training_parameters(n_bees, n_iter, dataset_train, dataset_eval, n_processes=1, **kwargs)

Tunes learning rate and learning rate decay; additional **kwargs can include any of:

ECNet parameters: 'epochs' (default 100), 'batch_size' (default 32), 'valid_size' (default 0.2), 'patience' (default 32), 'hidden_dim' (default 128), 'n_hidden' (default 2), 'dropout' (default 0.0)

Adam optimizer arguments: 'beta_1' (default 0.9), 'beta_2' (default 0.999), 'eps' (default 1e-8), 'weight_decay' (default 0.0), 'amsgrad' (default False)

Parameters:

n_bees (int): number of employer bees to use in ABC algorithm (required)
n_iter (int): number of iterations, or "search cycles", for ABC algorithm (required)
dataset_train (QSPRDataset): dataset used to train evaluation models (required)
dataset_eval (QSPRDataset): dataset used for evaluation (required)
n_processes (int): if > 1, uses multiprocessing when evaluating at an iteration (default: 1)
**kwargs: additional arguments (default: {})

Returns:

dict: {'lr': float, 'lr_decay': float}

Source code in ecnet/tasks/parameter_tuning.py
def tune_training_parameters(n_bees: int, n_iter: int, dataset_train: QSPRDataset,
                             dataset_eval: QSPRDataset, n_processes: int = 1,
                             **kwargs) -> dict:
    """
    Tunes learning rate, learning rate decay; additional **kwargs can include any in:
        [
            # ECNet parameters
            'epochs' (default 100),
            'batch_size' (default 32),
            'valid_size' (default 0.2),
            'patience' (default 32),
            'hidden_dim' (default 128),
            'n_hidden' (default 2),
            'dropout' (default 0.0),
            # Adam optim. alg. arguments
            'beta_1' (default 0.9),
            'beta_2' (default 0.999),
            'eps' (default 1e-8),
            'weight_decay' (default 0.0),
            'amsgrad' (default False)
        ]

    Args:
        n_bees (int): number of employer bees to use in ABC algorithm
        n_iter (int): number of iterations, or "search cycles", for ABC algorithm
        dataset_train (QSPRDataset): dataset used to train evaluation models
        dataset_eval (QSPRDataset): dataset used for evaluation
        n_processes (int, optional): if > 1, uses multiprocessing when evaluating at an iteration
        **kwargs: additional arguments 

    Returns:
        dict: {'lr': float, 'lr_decay': float}
    """

    kwargs['train_ds'] = dataset_train
    kwargs['eval_ds'] = dataset_eval
    abc = ABC(n_bees, _cost_train_hp, num_processes=n_processes, obj_fn_args=kwargs)
    abc.add_param(CONFIG['training_params_range']['lr'][0],
                  CONFIG['training_params_range']['lr'][1], name='lr')
    abc.add_param(CONFIG['training_params_range']['lr_decay'][0],
                  CONFIG['training_params_range']['lr_decay'][1], name='lr_decay')
    abc.initialize()
    for _ in range(n_iter):
        abc.search()
    return {
        'lr': abc.best_params['lr'],
        'lr_decay': abc.best_params['lr_decay']
    }
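
Example usage (a minimal sketch, continuing the hypothetical objects from the previous examples): the architecture values tuned above are forwarded through **kwargs, and all tuned values are collected into a single dict.

from ecnet.tasks.parameter_tuning import tune_training_parameters

# forward the previously tuned batch size and architecture as fixed settings
train_hp = tune_training_parameters(n_bees=10, n_iter=5, dataset_train=ds_train,
                                    dataset_eval=ds_eval, n_processes=4,
                                    batch_size=batch_size, **arch)

# final set of tuned hyperparameters for model training
tuned = {'batch_size': batch_size, **arch, **train_hp}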