
ecnet.tasks

feature_selection

Feature selection functions

select_rfr(dataset, total_importance=0.95, **kwargs)

select_rfr: reduces input data dimensionality such that the specified proportion of total feature importance (derived from random forest regression) is retained in the feature subset

Parameters:

| Name             | Type        | Description                                                            | Default  |
|------------------|-------------|------------------------------------------------------------------------|----------|
| dataset          | QSPRDataset | input data                                                             | required |
| total_importance | float       | total feature importance to retain                                     | 0.95     |
| **kwargs         |             | additional arguments passed to sklearn.ensemble.RandomForestRegressor  | {}       |

Returns:

| Type                          | Description                                               |
|-------------------------------|-----------------------------------------------------------|
| Tuple[List[int], List[float]] | (selected feature indices, selected feature importances)  |

Source code in ecnet/tasks/feature_selection.py
def select_rfr(dataset: QSPRDataset, total_importance: float = 0.95,
               **kwargs) -> Tuple[List[int], List[float]]:
    """
    select_rfr: reduces input data dimensionality such that specified proportion of total feature
    importance (derived from random forest regression) is retained in feature subset

    Args:
        dataset (QSPRDataset): input data
        total_importance (float): total feature importance to retain
        **kwargs: additional arguments passed to sklearn.ensemble.RandomForestRegressor

    Returns:
        tuple[list[int], list[float]]: (selected feature indices, selected feature importances)
    """

    X = dataset.desc_vals
    y = [dv[0] for dv in dataset.target_vals]
    regr = RandomForestRegressor(**kwargs)
    regr.fit(X, y)
    importances = sorted(
        [(regr.feature_importances_[i], i)
         for i in range(len(dataset.desc_vals[0]))],
        key=lambda x: x[0], reverse=True
    )
    tot_imp = 0.0
    for idx, i in enumerate(importances):
        tot_imp += i[0]
        idx_cutoff = idx
        if tot_imp >= total_importance:
            break
    desc_imp = [i[0] for i in importances][:idx_cutoff]
    desc_idx = [i[1] for i in importances][:idx_cutoff]
    return (desc_idx, desc_imp)
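
Example: a minimal usage sketch for select_rfr. Dataset construction is not shown here; `ds` is assumed to be an existing QSPRDataset with populated descriptor and target values, and `n_estimators` is simply a standard RandomForestRegressor keyword argument forwarded through **kwargs.

from ecnet.tasks.feature_selection import select_rfr

# `ds` is an assumed, pre-built QSPRDataset whose descriptor values
# (ds.desc_vals) and target values (ds.target_vals) are already populated.
idx, imp = select_rfr(ds, total_importance=0.95, n_estimators=100)

# `idx` contains the indices of the retained descriptors, `imp` their
# importances, both ordered from most to least important.
print('Retained {} descriptors'.format(len(idx)))
print('Top descriptor index: {}, importance: {:.4f}'.format(idx[0], imp[0]))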

parameter_tuning

tune_batch_size(n_bees, n_iter, n_processes=1, **kwargs)

Tunes the batch size during training

Parameters:

| Name        | Type | Description                                                   | Default  |
|-------------|------|---------------------------------------------------------------|----------|
| n_bees      | int  | number of employer bees to use in ABC algorithm               | required |
| n_iter      | int  | number of iterations, or "search cycles", for ABC algorithm   | required |
| n_processes | int  | if > 1, uses multiprocessing when evaluating at an iteration  | 1        |
| **kwargs    |      | arguments passed to _cost_batch_size                          | {}       |

Returns:

| Type | Description                      |
|------|----------------------------------|
| dict | {'batch_size': tuned batch size} |

Source code in ecnet/tasks/parameter_tuning.py
def tune_batch_size(n_bees: int, n_iter: int, n_processes: int = 1, **kwargs) -> dict:
    """
    Tunes the batch size during training

    Args:
        n_bees (int): number of employer bees to use in ABC algorithm
        n_iter (int): number of iterations, or "search cycles", for ABC algorithm
        n_processes (int): if > 1, uses multiprocessing when evaluating at an iteration
        **kwargs: arguments passed to _cost_batch_size

    Returns:
        dict: {'batch_size': tuned batch size}
    """

    abc = ABC(n_bees, _cost_batch_size, num_processes=n_processes, obj_fn_args=kwargs)
    abc.add_param(1, len(kwargs.get('train_ds').desc_vals), name='batch_size')
    abc.initialize()
    for _ in range(n_iter):
        abc.search()
    return {'batch_size': abc.best_params['batch_size']}
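
Example: a hedged usage sketch for tune_batch_size. The source above retrieves a QSPRDataset from the keyword arguments via kwargs.get('train_ds'); `train_ds` is assumed to already exist, and any further keyword arguments required by _cost_batch_size (for example a validation set) should be verified in ecnet/tasks/parameter_tuning.py.

from ecnet.tasks.parameter_tuning import tune_batch_size

# `train_ds` is an assumed, pre-built QSPRDataset; _cost_batch_size may
# expect additional keyword arguments not shown here.
result = tune_batch_size(n_bees=10, n_iter=5, n_processes=1, train_ds=train_ds)
print('Tuned batch size: {}'.format(result['batch_size']))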

tune_model_architecture(n_bees, n_iter, n_processes=1, **kwargs)

Tunes the NN's architecture

Parameters:

| Name        | Type | Description                                                   | Default  |
|-------------|------|---------------------------------------------------------------|----------|
| n_bees      | int  | number of employer bees to use in ABC algorithm               | required |
| n_iter      | int  | number of iterations, or "search cycles", for ABC algorithm   | required |
| n_processes | int  | if > 1, uses multiprocessing when evaluating at an iteration  | 1        |
| **kwargs    |      | arguments passed to _cost_arch                                | {}       |

Returns:

| Type | Description                                                       |
|------|-------------------------------------------------------------------|
| dict | {'hidden_dim': opt_val, 'n_hidden': opt_val, 'dropout': opt_val}  |

Source code in ecnet/tasks/parameter_tuning.py
def tune_model_architecture(n_bees: int, n_iter: int, n_processes: int = 1, **kwargs) -> dict:
    """
    Tunes the NN's architecture

    Args:
        n_bees (int): number of employer bees to use in ABC algorithm
        n_iter (int): number of iterations, or "search cycles", for ABC algorithm
        n_processes (int): if > 1, uses multiprocessing when evaluating at an iteration
        **kwargs: arguments passed to _cost_arch

    Returns:
        dict: {'hidden_dim': opt_val, 'n_hidden': opt_val, 'dropout': opt_val}
    """

    abc = ABC(n_bees, _cost_arch, num_processes=n_processes, obj_fn_args=kwargs)
    abc.add_param(CONFIG['architecture_params_range']['hidden_dim'][0],
                  CONFIG['architecture_params_range']['hidden_dim'][1], name='hidden_dim')
    abc.add_param(CONFIG['architecture_params_range']['n_hidden'][0],
                  CONFIG['architecture_params_range']['n_hidden'][1], name='n_hidden')
    abc.add_param(CONFIG['architecture_params_range']['dropout'][0],
                  CONFIG['architecture_params_range']['dropout'][1], name='dropout')
    abc.initialize()
    for _ in range(n_iter):
        abc.search()
    return {
        'hidden_dim': abc.best_params['hidden_dim'],
        'n_hidden': abc.best_params['n_hidden'],
        'dropout': abc.best_params['dropout']
    }
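
Example: a usage sketch for tune_model_architecture, under the assumption that _cost_arch accepts the same dataset keyword arguments as _cost_batch_size (e.g. train_ds); verify against ecnet/tasks/parameter_tuning.py before relying on this.

from ecnet.tasks.parameter_tuning import tune_model_architecture

# `train_ds` is an assumed, pre-built QSPRDataset.
arch = tune_model_architecture(n_bees=10, n_iter=5, n_processes=1, train_ds=train_ds)
print('hidden_dim: {}'.format(arch['hidden_dim']))
print('n_hidden: {}'.format(arch['n_hidden']))
print('dropout: {}'.format(arch['dropout']))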

tune_training_parameters(n_bees, n_iter, n_processes=1, **kwargs)

Tunes the NN's training parameters (Adam optim. fn.)

Parameters:

| Name        | Type | Description                                                   | Default  |
|-------------|------|---------------------------------------------------------------|----------|
| n_bees      | int  | number of employer bees to use in ABC algorithm               | required |
| n_iter      | int  | number of iterations, or "search cycles", for ABC algorithm   | required |
| n_processes | int  | if > 1, uses multiprocessing when evaluating at an iteration  | 1        |
| **kwargs    |      | arguments passed to _cost_train_hp                            | {}       |

Returns:

| Type | Description                          |
|------|--------------------------------------|
| dict | {'lr': opt_val, 'lr_decay': opt_val} |

Source code in ecnet/tasks/parameter_tuning.py
def tune_training_parameters(n_bees: int, n_iter: int, n_processes: int = 1, **kwargs) -> dict:
    """
    Tunes the NN's training parameters (Adam optim. fn.)

    Args:
        n_bees (int): number of employer bees to use in ABC algorithm
        n_iter (int): number of iterations, or "search cycles", for ABC algorithm
        n_processes (int): if > 1, uses multiprocessing when evaluating at an iteration
        **kwargs: arguments passed to _cost_train_hp

    Returns:
        dict: {'lr': opt_val, 'lr_decay': opt_val}
    """

    abc = ABC(n_bees, _cost_train_hp, num_processes=n_processes, obj_fn_args=kwargs)
    abc.add_param(CONFIG['training_params_range']['lr'][0],
                  CONFIG['training_params_range']['lr'][1], name='lr')
    abc.add_param(CONFIG['training_params_range']['lr_decay'][0],
                  CONFIG['training_params_range']['lr_decay'][1], name='lr_decay')
    abc.initialize()
    for _ in range(n_iter):
        abc.search()
    return {
        'lr': abc.best_params['lr'],
        'lr_decay': abc.best_params['lr_decay']
    }
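
Example: a usage sketch for tune_training_parameters, with the same caveat as above: `train_ds` is an assumed QSPRDataset, and the exact keyword arguments expected by _cost_train_hp should be checked in ecnet/tasks/parameter_tuning.py.

from ecnet.tasks.parameter_tuning import tune_training_parameters

# `train_ds` is an assumed, pre-built QSPRDataset.
params = tune_training_parameters(n_bees=10, n_iter=5, n_processes=1, train_ds=train_ds)
print('lr: {}'.format(params['lr']))
print('lr_decay: {}'.format(params['lr_decay']))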