ecnet.datasets
load_data
Pre-bundled data interface
load_bp(as_dataset=False, backend='padel')
Loads boiling point data; target values given in Celsius
Parameters:
Name | Type | Description | Default |
---|---|---|---|
as_dataset |
bool |
if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values |
False |
backend |
str |
any in ['padel', 'alvadesc'] |
'padel' |
Returns:
Type | Description |
---|---|
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile] |
Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile |
Source code in ecnet/datasets/load_data.py
def load_bp(as_dataset: bool = False, backend: str = 'padel') -> Union[
        Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Load the bundled boiling point data set; target values are in Celsius

    Args:
        as_dataset (bool, optional): when True the data is returned as a QSPRDatasetFromFile;
            when False (default) a tuple of smiles and target values is returned
        backend (str, optional): any in ['padel', 'alvadesc']; only passed on when
            `as_dataset` is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile
    """
    if as_dataset:
        return _load_set('bp', backend)
    return _get_file_data('bp')
load_cn(as_dataset=False, backend='padel')
Loads cetane number data; target values given in CN units
Parameters:
Name | Type | Description | Default |
---|---|---|---|
as_dataset |
bool |
if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values |
False |
backend |
str |
any in ['padel', 'alvadesc'] |
'padel' |
Returns:
Type | Description |
---|---|
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile] |
Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile |
Source code in ecnet/datasets/load_data.py
def load_cn(as_dataset: bool = False, backend: str = 'padel') -> Union[
        Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Load the bundled cetane number data set; target values are in CN units

    Args:
        as_dataset (bool, optional): when True the data is returned as a QSPRDatasetFromFile;
            when False (default) a tuple of smiles and target values is returned
        backend (str, optional): any in ['padel', 'alvadesc']; only passed on when
            `as_dataset` is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile
    """
    if as_dataset:
        return _load_set('cn', backend)
    return _get_file_data('cn')
load_cp(as_dataset=False, backend='padel')
Loads cloud point data; target values given in Celsius
Parameters:
Name | Type | Description | Default |
---|---|---|---|
as_dataset |
bool |
if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values |
False |
backend |
str |
any in ['padel', 'alvadesc'] |
'padel' |
Returns:
Type | Description |
---|---|
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile] |
Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile |
Source code in ecnet/datasets/load_data.py
def load_cp(as_dataset: bool = False, backend: str = 'padel') -> Union[
        Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Load the bundled cloud point data set; target values are in Celsius

    Args:
        as_dataset (bool, optional): when True the data is returned as a QSPRDatasetFromFile;
            when False (default) a tuple of smiles and target values is returned
        backend (str, optional): any in ['padel', 'alvadesc']; only passed on when
            `as_dataset` is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile
    """
    if as_dataset:
        return _load_set('cp', backend)
    return _get_file_data('cp')
load_kv(as_dataset=False, backend='padel')
Loads kinematic viscosity data; target values given in mm^2/s (cSt) at 313 K
Parameters:
Name | Type | Description | Default |
---|---|---|---|
as_dataset |
bool |
if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values |
False |
backend |
str |
any in ['padel', 'alvadesc'] |
'padel' |
Returns:
Type | Description |
---|---|
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile] |
Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile |
Source code in ecnet/datasets/load_data.py
def load_kv(as_dataset: bool = False, backend: str = 'padel') -> Union[
        Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Load the bundled kinematic viscosity data set; target values are in mm^2/s (cSt) at 313 K

    Args:
        as_dataset (bool, optional): when True the data is returned as a QSPRDatasetFromFile;
            when False (default) a tuple of smiles and target values is returned
        backend (str, optional): any in ['padel', 'alvadesc']; only passed on when
            `as_dataset` is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile
    """
    if as_dataset:
        return _load_set('kv', backend)
    return _get_file_data('kv')
load_lhv(as_dataset=False, backend='padel')
Loads lower heating value data; target values given in MJ/kg = kJ/g
Parameters:
Name | Type | Description | Default |
---|---|---|---|
as_dataset |
bool |
if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values |
False |
backend |
str |
any in ['padel', 'alvadesc'] |
'padel' |
Returns:
Type | Description |
---|---|
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile] |
Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile |
Source code in ecnet/datasets/load_data.py
def load_lhv(as_dataset: bool = False, backend: str = 'padel') -> Union[
        Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Load the bundled lower heating value data set; target values are in MJ/kg = kJ/g

    Args:
        as_dataset (bool, optional): when True the data is returned as a QSPRDatasetFromFile;
            when False (default) a tuple of smiles and target values is returned
        backend (str, optional): any in ['padel', 'alvadesc']; only passed on when
            `as_dataset` is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile
    """
    if as_dataset:
        return _load_set('lhv', backend)
    return _get_file_data('lhv')
load_mon(as_dataset=False, backend='padel')
Loads motor octane number data; target values given in MON units
Parameters:
Name | Type | Description | Default |
---|---|---|---|
as_dataset |
bool |
if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values |
False |
backend |
str |
any in ['padel', 'alvadesc'] |
'padel' |
Returns:
Type | Description |
---|---|
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile] |
Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile |
Source code in ecnet/datasets/load_data.py
def load_mon(as_dataset: bool = False, backend: str = 'padel') -> Union[
        Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Load the bundled motor octane number data set; target values are in MON units

    Args:
        as_dataset (bool, optional): when True the data is returned as a QSPRDatasetFromFile;
            when False (default) a tuple of smiles and target values is returned
        backend (str, optional): any in ['padel', 'alvadesc']; only passed on when
            `as_dataset` is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile
    """
    if as_dataset:
        return _load_set('mon', backend)
    return _get_file_data('mon')
load_mp(as_dataset=False, backend='padel')
Loads melting point data; target values given in Celsius
Parameters:
Name | Type | Description | Default |
---|---|---|---|
as_dataset |
bool |
if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values |
False |
backend |
str |
any in ['padel', 'alvadesc'] |
'padel' |
Returns:
Type | Description |
---|---|
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile] |
Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile |
Source code in ecnet/datasets/load_data.py
def load_mp(as_dataset: bool = False, backend: str = 'padel') -> Union[
        Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Load the bundled melting point data set; target values are in Celsius

    Args:
        as_dataset (bool, optional): when True the data is returned as a QSPRDatasetFromFile;
            when False (default) a tuple of smiles and target values is returned
        backend (str, optional): any in ['padel', 'alvadesc']; only passed on when
            `as_dataset` is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile
    """
    if as_dataset:
        return _load_set('mp', backend)
    return _get_file_data('mp')
load_pp(as_dataset=False, backend='padel')
Loads pour point data; target values given in Celsius
Parameters:
Name | Type | Description | Default |
---|---|---|---|
as_dataset |
bool |
if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values |
False |
backend |
str |
any in ['padel', 'alvadesc'] |
'padel' |
Returns:
Type | Description |
---|---|
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile] |
Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile |
Source code in ecnet/datasets/load_data.py
def load_pp(as_dataset: bool = False, backend: str = 'padel') -> Union[
        Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Load the bundled pour point data set; target values are in Celsius

    Args:
        as_dataset (bool, optional): when True the data is returned as a QSPRDatasetFromFile;
            when False (default) a tuple of smiles and target values is returned
        backend (str, optional): any in ['padel', 'alvadesc']; only passed on when
            `as_dataset` is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile
    """
    if as_dataset:
        return _load_set('pp', backend)
    return _get_file_data('pp')
load_ron(as_dataset=False, backend='padel')
Loads research octane number data; target values given in RON units
Parameters:
Name | Type | Description | Default |
---|---|---|---|
as_dataset |
bool |
if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values |
False |
backend |
str |
any in ['padel', 'alvadesc'] |
'padel' |
Returns:
Type | Description |
---|---|
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile] |
Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile |
Source code in ecnet/datasets/load_data.py
def load_ron(as_dataset: bool = False, backend: str = 'padel') -> Union[
        Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Load the bundled research octane number data set; target values are in RON units

    Args:
        as_dataset (bool, optional): when True the data is returned as a QSPRDatasetFromFile;
            when False (default) a tuple of smiles and target values is returned
        backend (str, optional): any in ['padel', 'alvadesc']; only passed on when
            `as_dataset` is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile
    """
    if as_dataset:
        return _load_set('ron', backend)
    return _get_file_data('ron')
load_ysi(as_dataset=False, backend='padel')
Loads yield sooting index data; target values given in unified YSI units
Parameters:
Name | Type | Description | Default |
---|---|---|---|
as_dataset |
bool |
if True, return QSPRDatasetFromFile object housing data; otherwise, return tuple of smiles and target values |
False |
backend |
str |
any in ['padel', 'alvadesc'] |
'padel' |
Returns:
Type | Description |
---|---|
Union[Tuple[List[str], List[List[float]]], ecnet.datasets.structs.QSPRDatasetFromFile] |
Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: either tuple of (smiles, target vals) or QSPRDatasetFromFile |
Source code in ecnet/datasets/load_data.py
def load_ysi(as_dataset: bool = False, backend: str = 'padel') -> Union[
        Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]:
    """
    Load the bundled yield sooting index data set; target values are in unified YSI units

    Args:
        as_dataset (bool, optional): when True the data is returned as a QSPRDatasetFromFile;
            when False (default) a tuple of smiles and target values is returned
        backend (str, optional): any in ['padel', 'alvadesc']; only passed on when
            `as_dataset` is True

    Returns:
        Union[Tuple[List[str], List[List[float]]], QSPRDatasetFromFile]: tuple of (smiles,
            target vals), or a QSPRDatasetFromFile
    """
    if as_dataset:
        return _load_set('ysi', backend)
    return _get_file_data('ysi')
structs
PyTorch-iterable/callable data structures
QSPRDataset
__getitem__(self, idx)
special
Dictionary representation of compound at index idx
Parameters:
Name | Type | Description | Default |
---|---|---|---|
idx |
int |
compound to return |
required |
Source code in ecnet/datasets/structs.py
def __getitem__(self, idx: int):
    """
    Return the compound stored at position `idx` as a dictionary

    The dictionary holds the compound's entry from `self.smiles`, `self.target_vals` and
    `self.desc_vals` under the keys 'smiles', 'target_val' and 'desc_vals', together with
    the full `self.desc_names` list under 'desc_names'.

    Args:
        idx (int): compound to return
    """
    # Per-compound fields are looked up in the order smiles -> target -> descriptors;
    # the descriptor names are shared by every compound and returned as-is.
    return {
        'smiles': self.smiles[idx],
        'target_val': self.target_vals[idx],
        'desc_vals': self.desc_vals[idx],
        'desc_names': self.desc_names,
    }
__init__(self, smiles, target_vals, backend='padel')
special
QSPRDataset: creates a torch.utils.data.Dataset from SMILES strings and target values
Parameters:
Name | Type | Description | Default |
---|---|---|---|
smiles |
List[str] |
SMILES strings |
required |
target_vals |
Iterable[Iterable[float]] |
target values of shape (n_samples, n_targets) |
required |
backend |
str |
backend for QSPR generation, ['padel', 'alvadesc'] |
'padel' |
Source code in ecnet/datasets/structs.py
def __init__(self, smiles: List[str], target_vals: Iterable[Iterable[float]],
             backend: str = 'padel'):
    """
    QSPRDataset: builds a torch.utils.data.Dataset out of SMILES strings and target values

    Descriptors are generated for every SMILES string via `self.smi_to_qspr`; both the
    descriptor values and the target values are stored as tensors.

    Args:
        smiles (list[str]): SMILES strings
        target_vals (Iterable[Iterable[float]]): target values of shape (n_samples, n_targets)
        backend (str, optional): backend for QSPR generation, ['padel', 'alvadesc']
    """
    self.smiles = smiles
    self.target_vals = torch.as_tensor(target_vals)
    # smi_to_qspr hands back (descriptor values, descriptor names)
    generated_vals, generated_names = self.smi_to_qspr(smiles, backend)
    self.desc_names = generated_names
    self.desc_vals = torch.as_tensor(generated_vals)
set_desc_index(self, index)
Reduce the number of features per sample; features retained given by supplied indices
Parameters:
Name | Type | Description | Default |
---|---|---|---|
index |
List[int] |
indices of the features to retain, all others are removed |
required |
Source code in ecnet/datasets/structs.py
def set_desc_index(self, index: List[int]):
    """
    Reduce the number of features per sample; features retained given by supplied indices

    Columns of `self.desc_vals` and entries of `self.desc_names` are kept in the order in
    which they appear in `index`.

    Args:
        index (list[int]): indices of the features to retain, all others are removed
    """
    # One column-indexing operation replaces the per-element Python loop: it keeps the
    # tensor's dtype and stays 2-D even when the dataset currently holds zero samples.
    columns = torch.as_tensor(index, dtype=torch.long)
    self.desc_vals = self.desc_vals[:, columns]
    self.desc_names = [self.desc_names[i] for i in index]
set_index(self, index)
Reduce the number of samples in the dataset; samples retained given by supplied indices
Parameters:
Name | Type | Description | Default |
---|---|---|---|
index |
List[int] |
indices of the dataset to retain, all others are removed |
required |
Source code in ecnet/datasets/structs.py
def set_index(self, index: List[int]):
    """
    Reduce the number of samples in the dataset; samples retained given by supplied indices

    `self.smiles`, `self.target_vals` and `self.desc_vals` are all filtered with the same
    indices, in the order in which they appear in `index`.

    Args:
        index (list[int]): indices of the dataset to retain, all others are removed
    """
    self.smiles = [self.smiles[i] for i in index]
    # Index the tensors directly instead of rebuilding them from per-row NumPy arrays:
    # this avoids the slow list-of-ndarray conversion, does not require the tensors to be
    # convertible with .numpy(), and keeps the trailing dimensions for an empty selection.
    rows = torch.as_tensor(index, dtype=torch.long)
    self.target_vals = self.target_vals[rows]
    self.desc_vals = self.desc_vals[rows]
smi_to_qspr(smiles, backend)
staticmethod
Generate QSPR descriptors for each supplied SMILES string
Parameters:
Name | Type | Description | Default |
---|---|---|---|
smiles |
List[str] |
SMILES strings |
required |
backend |
str |
backend for QSPR generation, ['padel', 'alvadesc'] |
required |
Returns:
Type | Description |
---|---|
Tuple[List[List[float]], List[str]] |
tuple of (descriptor values, descriptor names): tuple[list[list[float]], list[str]] |
Source code in ecnet/datasets/structs.py
@staticmethod
def smi_to_qspr(smiles: List[str], backend: str) -> Tuple[List[List[float]], List[str]]:
    """
    Produce QSPR descriptors for the supplied SMILES strings using the chosen backend

    Args:
        smiles (list[str]): SMILES strings
        backend (str): backend for QSPR generation, ['padel', 'alvadesc']

    Returns:
        tuple[list[list[float]], list[str]]

    Raises:
        ValueError: if `backend` is neither 'padel' nor 'alvadesc'
    """
    # Reject anything that is not a known backend before picking a generator
    if backend not in ('padel', 'alvadesc'):
        raise ValueError('Unknown backend software: {}'.format(backend))
    generate = _qspr_from_padel if backend == 'padel' else _qspr_from_alvadesc
    return generate(smiles)
QSPRDatasetFromFile
__init__(self, smiles_fn, target_vals, backend='padel')
special
QSPRDatasetFromFile: creates a torch.utils.data.Dataset given target values and a supplied filename/path to a SMILES file
Parameters:
Name | Type | Description | Default |
---|---|---|---|
smiles_fn |
str |
filename/path of SMILES file |
required |
target_vals |
Iterable[Iterable[float]] |
target values of shape (n_samples, n_targets) |
required |
backend |
str |
backend for QSPR generation, ['padel', 'alvadesc'] |
'padel' |
Source code in ecnet/datasets/structs.py
def __init__(self, smiles_fn: str, target_vals: Iterable[Iterable[float]],
             backend: str = 'padel'):
    """
    QSPRDatasetFromFile: creates a torch.utils.data.Dataset given target values and a supplied
    filename/path to a SMILES file

    Args:
        smiles_fn (str): filename/path of SMILES file
        target_vals (Iterable[Iterable[float]]): target values of shape (n_samples, n_targets)
        backend (str, optional): backend for QSPR generation, ['padel', 'alvadesc']

    Raises:
        ValueError: if `backend` is neither 'padel' nor 'alvadesc'
    """
    # Fail fast on an unsupported backend; previously the object was left without
    # `desc_vals`/`desc_names` and the problem only surfaced on later attribute access.
    if backend not in ('padel', 'alvadesc'):
        raise ValueError('Unknown backend software: {}'.format(backend))
    self.smiles = self._open_smiles_file(smiles_fn)
    self.target_vals = torch.as_tensor(target_vals)
    if backend == 'padel':
        # PaDEL descriptors are generated from the SMILES strings read above
        desc_vals, self.desc_names = self.smi_to_qspr(self.smiles, backend)
    else:
        # alvaDesc is handed the SMILES file itself rather than the parsed strings
        desc_vals, self.desc_names = _qspr_from_alvadesc_smifile(smiles_fn)
    self.desc_vals = torch.as_tensor(desc_vals)
QSPRDatasetFromValues
__init__(self, desc_vals, target_vals)
special
QSPRDatasetFromValues: creates a torch.utils.data.Dataset given supplied descriptor values, supplied target values
Parameters:
Name | Type | Description | Default |
---|---|---|---|
desc_vals |
Iterable[Iterable[float]] |
descriptor values, shape (n_samples, n_features) |
required |
target_vals |
Iterable[Iterable[float]] |
target values, shape (n_samples, n_targets) |
required |
Source code in ecnet/datasets/structs.py
def __init__(self, desc_vals: Iterable[Iterable[float]],
             target_vals: Iterable[Iterable[float]]):
    """
    QSPRDatasetFromValues: creates a torch.utils.data.Dataset given supplied descriptor values,
    supplied target values

    No SMILES strings or descriptor names are available in this case, so `self.smiles` and
    `self.desc_names` are filled with empty-string placeholders (one per sample and one per
    feature respectively).

    Args:
        desc_vals (Iterable[Iterable[float]]): descriptor values, shape (n_samples, n_features)
        target_vals (Iterable[Iterable[float]]): target values, shape (n_samples, n_targets)
    """
    self.desc_vals = torch.as_tensor(desc_vals)
    self.target_vals = torch.as_tensor(target_vals)
    # Take the counts from the converted tensors rather than from `desc_vals[0]`, which
    # raised IndexError when no samples were supplied.
    n_samples = len(self.target_vals)
    n_features = self.desc_vals.shape[1] if self.desc_vals.dim() > 1 else 0
    self.smiles = [''] * n_samples
    self.desc_names = [''] * n_features
utils
Utility functions for generating QSPR descriptors