ecnet.ECNet

Bases: nn.Module

Source code in ecnet/model.py
class ECNet(nn.Module):

    def __init__(self, input_dim: int, output_dim: int, hidden_dim: int, n_hidden: int,
                 dropout: float = 0.0, device: str = 'cpu'):
        """
        ECNet, child of torch.nn.Module: handles data preprocessing, multilayer perceptron training,
        stores multilayer perceptron layers/weights for continued usage/saving

        Args:
            input_dim (int): dimensionality of input data
            output_dim (int): dimensionality of output data
            hidden_dim (int): number of neurons in hidden layer(s)
            n_hidden (int): number of hidden layers between input and output
            dropout (float, optional): neuron dropout probability, default 0.0
            device (str, optional): device to run tensor ops on, default cpu
        """

        super(ECNet, self).__init__()
        self._input_dim = input_dim
        self._output_dim = output_dim
        self._hidden_dim = hidden_dim
        self._n_hidden = n_hidden
        self._dropout = dropout
        self.model = nn.ModuleList()
        self._construct()
        self.to(torch.device(device))

    def _construct(self):
        """
        _construct: given supplied architecture params, construct multilayer perceptron
        """

        self.model = nn.ModuleList()
        self.model.append(nn.Linear(self._input_dim, self._hidden_dim))
        for _ in range(self._n_hidden):
            self.model.append(nn.Linear(self._hidden_dim, self._hidden_dim))
        self.model.append(nn.Linear(self._hidden_dim, self._output_dim))

    def fit(self, smiles: List[str] = None, target_vals: List[List[float]] = None,
            dataset: QSPRDataset = None, backend: str = 'padel', batch_size: int = 32,
            epochs: int = 100, lr_decay: float = 0.0, valid_size: float = 0.0,
            valid_eval_iter: int = 1, patience: int = 16, verbose: int = 0,
            random_state: int = None, shuffle: bool = False,
            **kwargs) -> Tuple[List[float], List[float]]:
        """
        fit: fits ECNet to either (1) SMILES and target values, or (2) a pre-loaded QSPRDataset;
        the training process utilizes the Adam optimization algorithm, MSE loss, ReLU activation
        functions between fully-connected layers, and optionally (1) a decaying learning rate, and
        (2) periodic validation during regression; periodic validation is used to determine when
        training ends (i.e. when a new minimum validation loss is not achieved after N epochs)

        Args:
            smiles (list[str], optional): if `dataset` not supplied, generates QSPR descriptors
                using these SMILES strings for use as input data
            target_vals (list[list[float]], optional): if `dataset` not supplied, this data is
                used for regression; should be shape (n_samples, n_targets)
            dataset (QSPRDataset, optional): pre-loaded dataset with descriptors + target values
            backend (str, optional): if using SMILES strings and target values, specifies backend
                software to use for QSPR generation; either 'padel' or 'alvadesc', default 'padel'
            batch_size (int, optional): training batch size; default = 32
            epochs (int, optional): number of training epochs; default = 100
            lr_decay (float, optional): linear rate of decay for learning rate; default = 0.0
            valid_size (float, optional): supply >0.0 to utilize periodic validation; value
                specifies proportion of supplied data to be used for validation
            valid_eval_iter (int, optional): validation set is evaluated every `this` epochs;
                default = 1 (evaluated every epoch)
            patience (int, optional): if new lowest validation loss not found after `this` many
                epochs, terminate training, set model parameters to those observed @ lowest
                validation loss
            verbose (int, optional): if > 0, will print every `this` epochs; default = 0
            random_state (int, optional): random_state used by sklearn.model_selection.
                train_test_split; default = None
            shuffle (bool, optional): if True, shuffles training/validation data between epochs;
                default = False; random_state should be None
            **kwargs: arguments accepted by torch.optim.Adam (i.e. learning rate, beta values)

        Returns:
            Tuple[List[float], List[Union[float, None]]]: (training losses, validation losses); if
                valid_size == 0.0, (training losses, [0, ..., 0])
        """

        # Data preparation
        if dataset is None:
            dataset = QSPRDataset(smiles, target_vals, backend)
        if valid_size > 0.0:
            index_train, index_valid = train_test_split(
                [i for i in range(len(dataset))], test_size=valid_size,
                random_state=random_state
            )
            dataloader_train = DataLoader(
                Subset(dataset, index_train), batch_size=batch_size, shuffle=True
            )
            dataloader_valid = DataLoader(
                Subset(dataset, index_valid), batch_size=len(index_valid), shuffle=True
            )
        else:
            dataloader_train = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Adam optimization algorithm
        optimizer = torch.optim.Adam(self.model.parameters(), **kwargs)

        # Set up callbacks
        CBO = CallbackOperator()
        if 'lr' in kwargs:
            _lr = kwargs.get('lr')
            _lrdecay = LRDecayLinear(_lr, lr_decay, optimizer)
            CBO.add_cb(_lrdecay)
        if valid_size > 0.0:
            _validator = Validator(dataloader_valid, self, valid_eval_iter, patience)
            CBO.add_cb(_validator)

        train_losses, valid_losses = [], []
        # TRAIN BEGIN
        CBO.on_train_begin()
        for epoch in range(epochs):

            # EPOCH BEGIN
            if not CBO.on_epoch_begin(epoch):
                break

            if shuffle:
                index_train, index_valid = train_test_split(
                    [i for i in range(len(dataset))], test_size=valid_size,
                    random_state=random_state
                )
                dataloader_train = DataLoader(
                    Subset(dataset, index_train), batch_size=batch_size, shuffle=True
                )
                dataloader_valid = DataLoader(
                    Subset(dataset, index_valid), batch_size=len(index_valid), shuffle=True
                )

            train_loss = 0.0
            self.train()

            for b_idx, batch in enumerate(dataloader_train):

                # BATCH BEGIN
                if not CBO.on_batch_begin(b_idx):
                    break

                optimizer.zero_grad()
                pred = self(batch['desc_vals'])
                target = batch['target_val']

                # BATCH END, LOSS BEGIN
                if not CBO.on_batch_end(b_idx):
                    break
                if not CBO.on_loss_begin(b_idx):
                    break

                loss = self.loss(pred, target)
                loss.backward()

                # LOSS END, STEP BEGIN
                if not CBO.on_loss_end(b_idx):
                    break
                if not CBO.on_step_begin(b_idx):
                    break

                optimizer.step()
                train_loss += loss.detach().item() * len(batch['target_val'])

                # STEP END
                if not CBO.on_step_end(b_idx):
                    break

            # Determine epoch loss for training, validation data
            train_loss /= len(dataloader_train.dataset)
            if valid_size > 0.0:
                valid_loss = _validator._most_recent_loss
            else:
                valid_loss = 0.0
            train_losses.append(train_loss)
            valid_losses.append(valid_loss)

            # Print losses if verbose
            if verbose:
                if epoch % verbose == 0:
                    print('Epoch: {} | Train loss: {} | Valid loss: {}'.format(
                        epoch, train_loss, valid_loss
                    ))

            # EPOCH END
            if not CBO.on_epoch_end(epoch):
                break

        # TRAIN END
        CBO.on_train_end()
        return (train_losses, valid_losses)

    def forward(self, x: torch.tensor) -> torch.tensor:
        """
        Forward propagation of data through multilayer perceptron

        Args:
            x (torch.tensor): input data to feed forward

        Returns:
            torch.tensor: output of final model layer
        """

        for i in range(len(self.model) - 1):
            x = self.model[i](x)
            x = F.relu(x)
            x = F.dropout(x, p=self._dropout, training=self.training)
        return self.model[-1](x)

    def loss(self, pred: torch.tensor, target: torch.tensor) -> torch.tensor:
        r"""
        Computes mean squared error between predicted values, target values

        Args:
            pred (torch.tensor): predicted values, shape (n_samples, n_features)
            target (torch.tensor): real values, shape (n_samples, n_features)

        Returns:
            torch.tensor: MSE loss (0-dimensional tensor; mean over all elements)
        """

        return F.mse_loss(pred, target)

    def save(self, model_filename: str):
        """
        Saves the model for later use

        Args:
            model_filename (str): filename/path to save model
        """

        if _TORCH_MODEL_FN.match(model_filename) is None:
            raise ValueError('Models must be saved with a `.pt` extension')
        torch.save(self, model_filename)

__init__(input_dim, output_dim, hidden_dim, n_hidden, dropout=0.0, device='cpu')

ECNet, child of torch.nn.Module: handles data preprocessing, multilayer perceptron training, stores multilayer perceptron layers/weights for continued usage/saving

Parameters:

    input_dim (int, required): dimensionality of input data
    output_dim (int, required): dimensionality of output data
    hidden_dim (int, required): number of neurons in hidden layer(s)
    n_hidden (int, required): number of hidden layers between input and output
    dropout (float, default 0.0): neuron dropout probability
    device (str, default 'cpu'): device to run tensor ops on
Source code in ecnet/model.py
def __init__(self, input_dim: int, output_dim: int, hidden_dim: int, n_hidden: int,
             dropout: float = 0.0, device: str = 'cpu'):
    """
    ECNet, child of torch.nn.Module: handles data preprocessing, multilayer perceptron training,
    stores multilayer perceptron layers/weights for continued usage/saving

    Args:
        input_dim (int): dimensionality of input data
        output_dim (int): dimensionality of output data
        hidden_dim (int): number of neurons in hidden layer(s)
        n_hidden (int): number of hidden layers between input and output
        dropout (float, optional): neuron dropout probability, default 0.0
        device (str, optional): device to run tensor ops on, default cpu
    """

    super(ECNet, self).__init__()
    self._input_dim = input_dim
    self._output_dim = output_dim
    self._hidden_dim = hidden_dim
    self._n_hidden = n_hidden
    self._dropout = dropout
    self.model = nn.ModuleList()
    self._construct()
    self.to(torch.device(device))
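
A minimal construction sketch (the dimensions below are placeholders; input_dim should match the number of QSPR descriptors generated for your data, and ECNet is assumed to be importable from the ecnet package as the page title suggests):

    from ecnet import ECNet

    # hypothetical architecture: 200 descriptors in, 1 target out,
    # two hidden layers of 128 neurons, 10% dropout, tensors kept on the CPU
    model = ECNet(input_dim=200, output_dim=1, hidden_dim=128,
                  n_hidden=2, dropout=0.1, device='cpu')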

fit(smiles=None, target_vals=None, dataset=None, backend='padel', batch_size=32, epochs=100, lr_decay=0.0, valid_size=0.0, valid_eval_iter=1, patience=16, verbose=0, random_state=None, shuffle=False, **kwargs)

fit: fits ECNet to either (1) SMILES and target values, or (2) a pre-loaded QSPRDataset; the training process utilizes the Adam optimization algorithm, MSE loss, ReLU activation functions between fully-connected layers, and optionally (1) a decaying learning rate, and (2) periodic validation during regression; periodic validation is used to determine when training ends (i.e. when a new minimum validation loss is not achieved after N epochs)

Parameters:

    smiles (list[str], default None): if dataset is not supplied, QSPR descriptors are generated from these SMILES strings for use as input data
    target_vals (list[list[float]], default None): if dataset is not supplied, these values are used as regression targets; shape (n_samples, n_targets)
    dataset (QSPRDataset, default None): pre-loaded dataset with descriptors + target values
    backend (str, default 'padel'): if using SMILES strings and target values, specifies the backend software used for QSPR descriptor generation; either 'padel' or 'alvadesc'
    batch_size (int, default 32): training batch size
    epochs (int, default 100): number of training epochs
    lr_decay (float, default 0.0): linear rate of decay for the learning rate
    valid_size (float, default 0.0): supply > 0.0 to use periodic validation; value specifies the proportion of supplied data used for validation
    valid_eval_iter (int, default 1): the validation set is evaluated every valid_eval_iter epochs
    patience (int, default 16): if a new lowest validation loss is not found after patience epochs, training terminates and model parameters are set to those observed at the lowest validation loss
    verbose (int, default 0): if > 0, losses are printed every verbose epochs
    random_state (int, default None): random_state used by sklearn.model_selection.train_test_split
    shuffle (bool, default False): if True, training/validation data are reshuffled between epochs; random_state should be None in this case
    **kwargs: arguments accepted by torch.optim.Adam (e.g. learning rate, beta values)

Returns:

    Tuple[List[float], List[float]]: (training losses, validation losses); if valid_size == 0.0, (training losses, [0, ..., 0])

Source code in ecnet/model.py
def fit(self, smiles: List[str] = None, target_vals: List[List[float]] = None,
        dataset: QSPRDataset = None, backend: str = 'padel', batch_size: int = 32,
        epochs: int = 100, lr_decay: float = 0.0, valid_size: float = 0.0,
        valid_eval_iter: int = 1, patience: int = 16, verbose: int = 0,
        random_state: int = None, shuffle: bool = False,
        **kwargs) -> Tuple[List[float], List[float]]:
    """
    fit: fits ECNet to either (1) SMILES and target values, or (2) a pre-loaded QSPRDataset;
    the training process utilizes the Adam optimization algorithm, MSE loss, ReLU activation
    functions between fully-connected layers, and optionally (1) a decaying learning rate, and
    (2) periodic validation during regression; periodic validation is used to determine when
    training ends (i.e. when a new minimum validation loss is not achieved after N epochs)

    Args:
        smiles (list[str], optional): if `dataset` not supplied, generates QSPR descriptors
            using these SMILES strings for use as input data
        target_vals (list[list[float]], optional): if `dataset` not supplied, this data is
            used for regression; should be shape (n_samples, n_targets)
        dataset (QSPRDataset, optional): pre-loaded dataset with descriptors + target values
        backend (str, optional): if using SMILES strings and target values, specifies backend
            software to use for QSPR generation; either 'padel' or 'alvadesc', default 'padel'
        batch_size (int, optional): training batch size; default = 32
        epochs (int, optional): number of training epochs; default = 100
        lr_decay (float, optional): linear rate of decay for learning rate; default = 0.0
        valid_size (float, optional): supply >0.0 to utilize periodic validation; value
            specifies proportion of supplied data to be used for validation
        valid_eval_iter (int, optional): validation set is evaluated every `this` epochs;
            default = 1 (evaluated every epoch)
        patience (int, optional): if new lowest validation loss not found after `this` many
            epochs, terminate training, set model parameters to those observed @ lowest
            validation loss
        verbose (int, optional): if > 0, will print every `this` epochs; default = 0
        random_state (int, optional): random_state used by sklearn.model_selection.
            train_test_split; default = None
        shuffle (bool, optional): if True, shuffles training/validation data between epochs;
            default = False; random_state should be None
        **kwargs: arguments accepted by torch.optim.Adam (i.e. learning rate, beta values)

    Returns:
        Tuple[List[float], List[Union[float, None]]]: (training losses, validation losses); if
            valid_size == 0.0, (training losses, [0, ..., 0])
    """

    # Data preparation
    if dataset is None:
        dataset = QSPRDataset(smiles, target_vals, backend)
    if valid_size > 0.0:
        index_train, index_valid = train_test_split(
            [i for i in range(len(dataset))], test_size=valid_size,
            random_state=random_state
        )
        dataloader_train = DataLoader(
            Subset(dataset, index_train), batch_size=batch_size, shuffle=True
        )
        dataloader_valid = DataLoader(
            Subset(dataset, index_valid), batch_size=len(index_valid), shuffle=True
        )
    else:
        dataloader_train = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Adam optimization algorithm
    optimizer = torch.optim.Adam(self.model.parameters(), **kwargs)

    # Set up callbacks
    CBO = CallbackOperator()
    if 'lr' in kwargs:
        _lr = kwargs.get('lr')
        _lrdecay = LRDecayLinear(_lr, lr_decay, optimizer)
        CBO.add_cb(_lrdecay)
    if valid_size > 0.0:
        _validator = Validator(dataloader_valid, self, valid_eval_iter, patience)
        CBO.add_cb(_validator)

    train_losses, valid_losses = [], []
    # TRAIN BEGIN
    CBO.on_train_begin()
    for epoch in range(epochs):

        # EPOCH BEGIN
        if not CBO.on_epoch_begin(epoch):
            break

        if shuffle:
            index_train, index_valid = train_test_split(
                [i for i in range(len(dataset))], test_size=valid_size,
                random_state=random_state
            )
            dataloader_train = DataLoader(
                Subset(dataset, index_train), batch_size=batch_size, shuffle=True
            )
            dataloader_valid = DataLoader(
                Subset(dataset, index_valid), batch_size=len(index_valid), shuffle=True
            )

        train_loss = 0.0
        self.train()

        for b_idx, batch in enumerate(dataloader_train):

            # BATCH BEGIN
            if not CBO.on_batch_begin(b_idx):
                break

            optimizer.zero_grad()
            pred = self(batch['desc_vals'])
            target = batch['target_val']

            # BATCH END, LOSS BEGIN
            if not CBO.on_batch_end(b_idx):
                break
            if not CBO.on_loss_begin(b_idx):
                break

            loss = self.loss(pred, target)
            loss.backward()

            # LOSS END, STEP BEGIN
            if not CBO.on_loss_end(b_idx):
                break
            if not CBO.on_step_begin(b_idx):
                break

            optimizer.step()
            train_loss += loss.detach().item() * len(batch['target_val'])

            # STEP END
            if not CBO.on_step_end(b_idx):
                break

        # Determine epoch loss for training, validation data
        train_loss /= len(dataloader_train.dataset)
        if valid_size > 0.0:
            valid_loss = _validator._most_recent_loss
        else:
            valid_loss = 0.0
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        # Print losses if verbose
        if verbose:
            if epoch % verbose == 0:
                print('Epoch: {} | Train loss: {} | Valid loss: {}'.format(
                    epoch, train_loss, valid_loss
                ))

        # EPOCH END
        if not CBO.on_epoch_end(epoch):
            break

    # TRAIN END
    CBO.on_train_end()
    return (train_losses, valid_losses)
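
A hedged usage sketch (the SMILES strings and target values are placeholders, a working PaDEL installation is assumed for descriptor generation, and lr is forwarded to torch.optim.Adam):

    # placeholder data; a real dataset would contain many more samples
    smiles = ['CCO', 'CCCC', 'c1ccccc1', 'CC(=O)O']
    targets = [[78.4], [-0.5], [80.1], [118.0]]

    train_losses, valid_losses = model.fit(
        smiles=smiles, target_vals=targets, backend='padel',
        epochs=300, valid_size=0.25, valid_eval_iter=1, patience=16,
        verbose=10, random_state=42, lr=0.001
    )
    print(train_losses[-1], valid_losses[-1])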

forward(x)

Forward propagation of data through multilayer perceptron

Parameters:

    x (torch.tensor, required): input data to feed forward

Returns:

    torch.tensor: output of the final model layer

Source code in ecnet/model.py
def forward(self, x: torch.tensor) -> torch.tensor:
    """
    Forward propagation of data through multilayer perceptron

    Args:
        x (torch.tensor): input data to feed forward

    Returns:
        torch.tensor: output of final model layer
    """

    for i in range(len(self.model) - 1):
        x = self.model[i](x)
        x = F.relu(x)
        x = F.dropout(x, p=self._dropout, training=self.training)
    return self.model[-1](x)
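
An inference sketch (assumes a constructed/fitted model as above; the descriptor tensor is random placeholder data with width equal to input_dim):

    import torch

    model.eval()
    with torch.no_grad():
        x = torch.rand(4, 200)      # 4 samples of placeholder descriptors
        preds = model(x)            # forward() is invoked via nn.Module.__call__
    print(preds.shape)              # torch.Size([4, 1]) for output_dim = 1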

loss(pred, target)

Computes mean squared error between predicted values, target values

Parameters:

    pred (torch.tensor, required): predicted values, shape (n_samples, n_features)
    target (torch.tensor, required): real values, shape (n_samples, n_features)

Returns:

    torch.tensor: MSE loss (a 0-dimensional tensor; F.mse_loss averages over all elements by default)

Source code in ecnet/model.py
def loss(self, pred: torch.tensor, target: torch.tensor) -> torch.tensor:
    r"""
    Computes mean squared error between predicted values, target values

    Args:
        pred (torch.tensor): predicted values, shape (n_samples, n_features)
        target (torch.tensor): real values, shape (n_samples, n_features)

    Returns:
        torch.tensor: MSE loss (0-dimensional tensor; mean over all elements)
    """

    return F.mse_loss(pred, target)
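
For reference, F.mse_loss with its default reduction averages the squared errors over all elements:

    import torch

    pred = torch.tensor([[1.0], [2.0]])
    target = torch.tensor([[1.5], [2.5]])
    print(model.loss(pred, target))   # tensor(0.2500) = mean of (0.5**2, 0.5**2)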

save(model_filename)

Saves the model for later use

Parameters:

    model_filename (str, required): filename/path to save model (must use a .pt extension)
Source code in ecnet/model.py
def save(self, model_filename: str):
    """
    Saves the model for later use

    Args:
        model_filename (str): filename/path to save model
    """

    if _TORCH_MODEL_FN.match(model_filename) is None:
        raise ValueError('Models must be saved with a `.pt` extension')
    torch.save(self, model_filename)
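
A save/reload sketch (torch.save serializes the entire module here, so torch.load returns a ready-to-use ECNet; the filename is arbitrary):

    model.save('my_ecnet.pt')

    # later / in another process
    import torch
    reloaded = torch.load('my_ecnet.pt')   # PyTorch >= 2.6 may require weights_only=False
    reloaded.eval()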