Skip to content

ecnet.datasets

load_data

Pre-bundled data interface

load_bp(as_dataset=False, backend='padel')

Loads boiling point data; target values given in Celsius

Parameters:

Name Type Description Default
as_dataset bool

if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values

False
backend str

any in ['padel', 'alvadesc']

'padel'

Returns:

Type Description
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile]

Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile

Source code in ecnet/datasets/load_data.py
def load_bp(
    as_dataset: bool = False, backend: str = 'padel'
) -> Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Fetch the bundled boiling point data; target values are in Celsius

    Args:
        as_dataset (bool, optional): when True, the data is returned as a QSPRDatasetFromFile;
            when False (default), the (smiles, target values) tuple is returned
        backend (str, optional): one of ['padel', 'alvadesc']; only used when `as_dataset`
            is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile when `as_dataset` is True
    """

    if as_dataset:
        return _load_set('bp', backend)
    return _get_file_data('bp')

load_cn(as_dataset=False, backend='padel')

Loads cetane number data; target values given in CN units

Parameters:

Name Type Description Default
as_dataset bool

if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values

False
backend str

any in ['padel', 'alvadesc']

'padel'

Returns:

Type Description
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile]

Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile

Source code in ecnet/datasets/load_data.py
def load_cn(
    as_dataset: bool = False, backend: str = 'padel'
) -> Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Fetch the bundled cetane number data; target values are in CN units

    Args:
        as_dataset (bool, optional): when True, the data is returned as a QSPRDatasetFromFile;
            when False (default), the (smiles, target values) tuple is returned
        backend (str, optional): one of ['padel', 'alvadesc']; only used when `as_dataset`
            is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile when `as_dataset` is True
    """

    if as_dataset:
        return _load_set('cn', backend)
    return _get_file_data('cn')

load_cp(as_dataset=False, backend='padel')

Loads cloud point data; target values given in Celsius

Parameters:

Name Type Description Default
as_dataset bool

if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values

False
backend str

any in ['padel', 'alvadesc']

'padel'

Returns:

Type Description
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile]

Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile

Source code in ecnet/datasets/load_data.py
def load_cp(
    as_dataset: bool = False, backend: str = 'padel'
) -> Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Fetch the bundled cloud point data; target values are in Celsius

    Args:
        as_dataset (bool, optional): when True, the data is returned as a QSPRDatasetFromFile;
            when False (default), the (smiles, target values) tuple is returned
        backend (str, optional): one of ['padel', 'alvadesc']; only used when `as_dataset`
            is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile when `as_dataset` is True
    """

    if as_dataset:
        return _load_set('cp', backend)
    return _get_file_data('cp')

load_kv(as_dataset=False, backend='padel')

Loads kinematic viscosity data; target values given in mm^2/s (cSt) at 313 K

Parameters:

Name Type Description Default
as_dataset bool

if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values

False
backend str

any in ['padel', 'alvadesc']

'padel'

Returns:

Type Description
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile]

Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile

Source code in ecnet/datasets/load_data.py
def load_kv(
    as_dataset: bool = False, backend: str = 'padel'
) -> Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Fetch the bundled kinematic viscosity data; target values are in mm^2/s (cSt) at 313 K

    Args:
        as_dataset (bool, optional): when True, the data is returned as a QSPRDatasetFromFile;
            when False (default), the (smiles, target values) tuple is returned
        backend (str, optional): one of ['padel', 'alvadesc']; only used when `as_dataset`
            is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile when `as_dataset` is True
    """

    if as_dataset:
        return _load_set('kv', backend)
    return _get_file_data('kv')

load_lhv(as_dataset=False, backend='padel')

Loads lower heating value data; target values given in MJ/kg = kJ/g

Parameters:

Name Type Description Default
as_dataset bool

if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values

False
backend str

any in ['padel', 'alvadesc']

'padel'

Returns:

Type Description
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile]

Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile

Source code in ecnet/datasets/load_data.py
def load_lhv(
    as_dataset: bool = False, backend: str = 'padel'
) -> Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Fetch the bundled lower heating value data; target values are in MJ/kg (= kJ/g)

    Args:
        as_dataset (bool, optional): when True, the data is returned as a QSPRDatasetFromFile;
            when False (default), the (smiles, target values) tuple is returned
        backend (str, optional): one of ['padel', 'alvadesc']; only used when `as_dataset`
            is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile when `as_dataset` is True
    """

    if as_dataset:
        return _load_set('lhv', backend)
    return _get_file_data('lhv')

load_mon(as_dataset=False, backend='padel')

Loads motor octane number data; target values given in MON units

Parameters:

Name Type Description Default
as_dataset bool

if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values

False
backend str

any in ['padel', 'alvadesc']

'padel'

Returns:

Type Description
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile]

Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile

Source code in ecnet/datasets/load_data.py
def load_mon(
    as_dataset: bool = False, backend: str = 'padel'
) -> Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Fetch the bundled motor octane number data; target values are in MON units

    Args:
        as_dataset (bool, optional): when True, the data is returned as a QSPRDatasetFromFile;
            when False (default), the (smiles, target values) tuple is returned
        backend (str, optional): one of ['padel', 'alvadesc']; only used when `as_dataset`
            is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile when `as_dataset` is True
    """

    if as_dataset:
        return _load_set('mon', backend)
    return _get_file_data('mon')

load_mp(as_dataset=False, backend='padel')

Loads melting point data; target values given in Celsius

Parameters:

Name Type Description Default
as_dataset bool

if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values

False
backend str

any in ['padel', 'alvadesc']

'padel'

Returns:

Type Description
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile]

Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile

Source code in ecnet/datasets/load_data.py
def load_mp(
    as_dataset: bool = False, backend: str = 'padel'
) -> Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Fetch the bundled melting point data; target values are in Celsius

    Args:
        as_dataset (bool, optional): when True, the data is returned as a QSPRDatasetFromFile;
            when False (default), the (smiles, target values) tuple is returned
        backend (str, optional): one of ['padel', 'alvadesc']; only used when `as_dataset`
            is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile when `as_dataset` is True
    """

    if as_dataset:
        return _load_set('mp', backend)
    return _get_file_data('mp')

load_pp(as_dataset=False, backend='padel')

Loads pour point data; target values given in Celsius

Parameters:

Name Type Description Default
as_dataset bool

if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values

False
backend str

any in ['padel', 'alvadesc']

'padel'

Returns:

Type Description
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile]

Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile

Source code in ecnet/datasets/load_data.py
def load_pp(
    as_dataset: bool = False, backend: str = 'padel'
) -> Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Fetch the bundled pour point data; target values are in Celsius

    Args:
        as_dataset (bool, optional): when True, the data is returned as a QSPRDatasetFromFile;
            when False (default), the (smiles, target values) tuple is returned
        backend (str, optional): one of ['padel', 'alvadesc']; only used when `as_dataset`
            is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile when `as_dataset` is True
    """

    if as_dataset:
        return _load_set('pp', backend)
    return _get_file_data('pp')

load_ron(as_dataset=False, backend='padel')

Loads research octane number data; target values given in RON units

Parameters:

Name Type Description Default
as_dataset bool

if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values

False
backend str

any in ['padel', 'alvadesc']

'padel'

Returns:

Type Description
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile]

Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile

Source code in ecnet/datasets/load_data.py
def load_ron(
    as_dataset: bool = False, backend: str = 'padel'
) -> Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Fetch the bundled research octane number data; target values are in RON units

    Args:
        as_dataset (bool, optional): when True, the data is returned as a QSPRDatasetFromFile;
            when False (default), the (smiles, target values) tuple is returned
        backend (str, optional): one of ['padel', 'alvadesc']; only used when `as_dataset`
            is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile when `as_dataset` is True
    """

    if as_dataset:
        return _load_set('ron', backend)
    return _get_file_data('ron')

load_ysi(as_dataset=False, backend='padel')

Loads yield sooting index data; target values given in unified YSI units

Parameters:

Name Type Description Default
as_dataset bool

if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values

False
backend str

any in ['padel', 'alvadesc']

'padel'

Returns:

Type Description
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile]

Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile

Source code in ecnet/datasets/load_data.py
def load_ysi(
    as_dataset: bool = False, backend: str = 'padel'
) -> Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Fetch the bundled yield sooting index data; target values are in unified YSI units

    Args:
        as_dataset (bool, optional): when True, the data is returned as a QSPRDatasetFromFile;
            when False (default), the (smiles, target values) tuple is returned
        backend (str, optional): one of ['padel', 'alvadesc']; only used when `as_dataset`
            is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile when `as_dataset` is True
    """

    if as_dataset:
        return _load_set('ysi', backend)
    return _get_file_data('ysi')

structs

PyTorch-iterable/callable data structures

QSPRDataset

__getitem__(self, idx) special

Dictionary representation of compound at index idx

Parameters:

Name Type Description Default
idx int

compound to return

required
Source code in ecnet/datasets/structs.py
def __getitem__(self, idx: int):
    """
    Return the compound stored at position `idx` as a dictionary

    Args:
        idx (int): position of the compound to return

    Returns:
        dict: keys 'smiles', 'target_val' and 'desc_vals' hold the entries at `idx`;
            'desc_names' holds the dataset's full descriptor name list
    """

    # The descriptor names are shared by every sample, so they are not indexed
    return {
        'smiles': self.smiles[idx],
        'target_val': self.target_vals[idx],
        'desc_vals': self.desc_vals[idx],
        'desc_names': self.desc_names,
    }

__init__(self, smiles, target_vals, backend='padel') special

QSPRDataset: creates a torch.utils.data.Dataset from SMILES strings and target values

Parameters:

Name Type Description Default
smiles List[str]

SMILES strings

required
target_vals Iterable[Iterable[float]]

target values of shape (n_samples, n_targets)

required
backend str

backend for QSPR generation, ['padel', 'alvadesc']

'padel'
Source code in ecnet/datasets/structs.py
def __init__(self, smiles: List[str], target_vals: Iterable[Iterable[float]],
             backend: str = 'padel'):
    """
    QSPRDataset: builds a torch.utils.data.Dataset from SMILES strings and their target values

    Descriptors are generated for every SMILES string via `smi_to_qspr`; both the target
    values and the descriptor values are stored as tensors.

    Args:
        smiles (list[str]): SMILES strings
        target_vals (Iterable[Iterable[float]]): target values of shape (n_samples, n_targets)
        backend (str, optional): backend for QSPR generation, ['padel', 'alvadesc']
    """

    self.smiles = smiles
    self.target_vals = torch.as_tensor(target_vals)
    generated_vals, generated_names = self.smi_to_qspr(smiles, backend)
    self.desc_names = generated_names
    self.desc_vals = torch.as_tensor(generated_vals)

set_desc_index(self, index)

Reduce the number of features per sample; features retained given by supplied indices

Parameters:

Name Type Description Default
index List[int]

indices of the features to retain, all others are removed

required
Source code in ecnet/datasets/structs.py
def set_desc_index(self, index: List[int]):
    """
    Reduce the number of features per sample; features retained given by supplied indices

    Columns of `desc_vals` and entries of `desc_names` are kept in the order given by
    `index`; everything else is dropped.

    Args:
        index (list[int]): indices of the features to retain, all others are removed
    """

    index = list(index)
    # Select columns with tensor indexing instead of rebuilding the tensor element by
    # element from nested Python lists: this keeps dtype and the 2-D shape (also when
    # there are zero samples) and avoids a Python-level loop over every value.
    columns = torch.as_tensor(index, dtype=torch.long)
    self.desc_vals = self.desc_vals[:, columns]
    self.desc_names = [self.desc_names[i] for i in index]

set_index(self, index)

Reduce the number of samples in the dataset; samples retained given by supplied indices

Parameters:

Name Type Description Default
index List[int]

indices of the dataset to retain, all others are removed

required
Source code in ecnet/datasets/structs.py
def set_index(self, index: List[int]):
    """
    Reduce the number of samples in the dataset; samples retained given by supplied indices

    `smiles`, `target_vals` and `desc_vals` are subset in the order given by `index`;
    `desc_names` is not touched.

    Args:
        index (list[int]): indices of the dataset to retain, all others are removed
    """

    index = list(index)
    # Index the tensors directly rather than converting every row to NumPy and back:
    # this avoids building a tensor from a list of ndarrays, does not depend on
    # `.numpy()` being available for the tensor, and keeps the trailing dimension
    # when `index` is empty.
    rows = torch.as_tensor(index, dtype=torch.long)
    self.smiles = [self.smiles[i] for i in index]
    self.target_vals = self.target_vals[rows]
    self.desc_vals = self.desc_vals[rows]

smi_to_qspr(smiles, backend) staticmethod

Generate QSPR descriptors for each supplied SMILES string

Parameters:

Name Type Description Default
smiles List[str]

SMILES strings

required
backend str

backend for QSPR generation, ['padel', 'alvadesc']

required

Returns:

Type Description
Tuple[List[List[float]], List[str]]

tuple[list[list[float]], list[str]]: (descriptor values, descriptor names)

Source code in ecnet/datasets/structs.py
@staticmethod
def smi_to_qspr(smiles: List[str], backend: str) -> Tuple[List[List[float]], List[str]]:
    """
    Compute QSPR descriptors for the given SMILES strings with the chosen backend

    Args:
        smiles (list[str]): SMILES strings
        backend (str): backend for QSPR generation, ['padel', 'alvadesc']

    Returns:
        tuple[list[list[float]], list[str]]: (descriptor values, descriptor names)

    Raises:
        ValueError: if `backend` is neither 'padel' nor 'alvadesc'
    """

    if backend == 'padel':
        return _qspr_from_padel(smiles)
    if backend == 'alvadesc':
        return _qspr_from_alvadesc(smiles)
    message = 'Unknown backend software: {}'.format(backend)
    raise ValueError(message)

QSPRDatasetFromFile

__init__(self, smiles_fn, target_vals, backend='padel') special

QSPRDatasetFromFile: creates a torch.utils.data.Dataset given target values and a supplied filename/path to a SMILES file

Parameters:

Name Type Description Default
smiles_fn str

filename/path of SMILES file

required
target_vals Iterable[Iterable[float]]

target values of shape (n_samples, n_targets)

required
backend str

backend for QSPR generation, ['padel', 'alvadesc']

'padel'
Source code in ecnet/datasets/structs.py
def __init__(self, smiles_fn: str, target_vals: Iterable[Iterable[float]],
             backend: str = 'padel'):
    """
    QSPRDatasetFromFile: creates a torch.utils.data.Dataset given target values and a supplied
    filename/path to a SMILES file

    Args:
        smiles_fn (str): filename/path of SMILES file
        target_vals (Iterable[Iterable[float]]): target values of shape (n_samples, n_targets)
        backend (str, optional): backend for QSPR generation, ['padel', 'alvadesc']

    Raises:
        ValueError: if `backend` is neither 'padel' nor 'alvadesc'
    """

    # Reject unknown backends before doing any work; otherwise the object would be left
    # without `desc_vals`/`desc_names` and fail later with an unrelated AttributeError.
    if backend not in ('padel', 'alvadesc'):
        raise ValueError('Unknown backend software: {}'.format(backend))

    self.smiles = self._open_smiles_file(smiles_fn)
    self.target_vals = torch.as_tensor(target_vals)
    if backend == 'padel':
        desc_vals, desc_names = self.smi_to_qspr(self.smiles, backend)
    else:
        # alvadesc is given the SMILES file path rather than the strings read from it
        desc_vals, desc_names = _qspr_from_alvadesc_smifile(smiles_fn)
    self.desc_names = desc_names
    self.desc_vals = torch.as_tensor(desc_vals)

QSPRDatasetFromValues

__init__(self, desc_vals, target_vals) special

QSPRDatasetFromValues: creates a torch.utils.data.Dataset given supplied descriptor values, supplied target values

Parameters:

Name Type Description Default
desc_vals Iterable[Iterable[float]]

descriptor values, shape (n_samples, n_features)

required
target_vals Iterable[Iterable[float]]

target values, shape (n_samples, n_targets)

required
Source code in ecnet/datasets/structs.py
def __init__(self, desc_vals: Iterable[Iterable[float]],
             target_vals: Iterable[Iterable[float]]):
    """
    QSPRDatasetFromValues: creates a torch.utils.data.Dataset given supplied descriptor values,
    supplied target values

    No SMILES strings or descriptor names are available in this case, so `smiles` and
    `desc_names` are filled with empty-string placeholders (one per sample and one per
    feature respectively).

    Args:
        desc_vals (Iterable[Iterable[float]]): descriptor values, shape (n_samples, n_features)
        target_vals (Iterable[Iterable[float]]): target values, shape (n_samples, n_targets)
    """

    self.desc_vals = torch.as_tensor(desc_vals)
    self.target_vals = torch.as_tensor(target_vals)
    # Take the sizes from the converted tensors rather than from `desc_vals[0]`, which
    # raises IndexError when the dataset is empty.
    n_samples = len(self.target_vals)
    n_features = self.desc_vals.shape[1] if self.desc_vals.dim() > 1 else 0
    self.smiles = [''] * n_samples
    self.desc_names = [''] * n_features

utils

Utility functions for generating QSPR descriptors