Commit c1120241 by xlwang

add max grad clipping, learning rate decay and scheduled sampling

parent 9e69dbc2
@@ -19,12 +19,12 @@ class BaseTrainer:
         self.model = torch.nn.DataParallel(model, device_ids=device_ids)
         self.loss = loss
-        self.metrics = metrics
+        self.metrics = metrics  # a list of metric functions defined in metric.py
         self.optimizer = optimizer
         cfg_trainer = config['trainer']
         self.epochs = cfg_trainer['epochs']
-        self.save_period = cfg_trainer['save_period']
+        self.save_period = cfg_trainer['save_period']  # should be 1
         self.monitor = cfg_trainer.get('monitor', 'off')
         # configuration to monitor model performance and save best
@@ -64,7 +64,7 @@ class BaseTrainer:
         for epoch in range(self.start_epoch, self.epochs + 1):
             result = self._train_epoch(epoch)
-            # save logged informations into log dict
+            # save logged information into log dict
             log = {'epoch': epoch}
             for key, value in result.items():
                 if key == 'metrics':
@@ -74,7 +74,14 @@ class BaseTrainer:
                 else:
                     log[key] = value
-            # print logged informations to the screen
+            # There is a chance that the training loss will explode; the temporary workaround
+            # is to restart from the last saved model before the explosion, or to decrease
+            # the learning rate earlier in the learning rate schedule.
+            if log['loss'] > 1e5:
+                self.logger.warning('Gradient explosion detected. Ending...')
+                break
+            # print logged information to the screen
             for key, value in log.items():
                 self.logger.info(' {:15s}: {}'.format(str(key), value))
......
@@ -47,22 +47,21 @@
         "masked_mae_np", "masked_mape_np", "masked_rmse_np"
     ],
     "lr_scheduler": {
-        "type": "StepLR",
+        "type": "MultiStepLR",
         "args": {
-            "step_size": 50,
+            "milestones": [20, 30, 40, 50],
             "gamma": 0.1
         }
     },
     "trainer": {
         "epochs": 100,
+        "cl_decay_steps": 2000,
         "save_dir": "saved/",
         "save_period": 1,
         "verbosity": 2,
+        "max_grad_norm": 5,
         "monitor": "min val_loss",
         "early_stop": 10,
         "tensorboard": true
     }
 }
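For reference, a minimal sketch (illustrative, not from the commit; the base learning rate is an assumption since the optimizer block is not shown in this hunk) of how the new MultiStepLR schedule behaves: the learning rate is multiplied by gamma = 0.1 at epochs 20, 30, 40 and 50, instead of once every 50 epochs as with the previous StepLR.

import torch

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.Adam([param], lr=0.01)  # base lr is an assumption
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20, 30, 40, 50], gamma=0.1)

for epoch in range(1, 61):
    optimizer.step()
    scheduler.step()
    # lr: 1e-2 for epochs 1-20, 1e-3 for 21-30, 1e-4 for 31-40, 1e-5 for 41-50, 1e-6 afterwards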
 import numpy as np
-import tensorflow as tf
 import torch
......
@@ -128,30 +128,6 @@ class DCGRUDecoder(BaseModel):
         return outputs
-        # first input to the decoder is the GO Symbol
-        # current_inputs = inputs[0]  # (1, 50, 207*1)
-        # output_hidden = []  # the output hidden states, shape (num_layers, batch, outdim)
-        # for i_layer in range(self._num_rnn_layers-1):
-        #     hidden_state = initial_hidden_state[i_layer]
-        #     output_inner = []
-        #     current_inputs = inputs[0]
-        #     for t in range(0, seq_length-1):
-        #         output, hidden_state = self.decoding_cells[i_layer](current_inputs, hidden_state)  # (50, 207*64)
-        #         # output and hidden_state should be the same
-        #
-        #         output_inner.append(hidden_state)
-        #         teacher_force = random.random() < teacher_forcing_ratio
-        #         current_inputs = (inputs[t] if teacher_force else hidden_state)
-        #     output_hidden.append(hidden_state)
-        #     inputs = torch.stack(output_inner, dim=0)  # seq_len, B, ...
-        #
-        # # last layer. still need to go through all time steps
-        # hidden_state = initial_hidden_state[self._num_rnn_layers-1]
-        # for t in range(seq_length):
-        #     outputs, hidden_state = self.decoding_cells[self._num_rnn_layers-1](current_inputs[t, ...], hidden_state)
-        # # output shape should be (12, 50, 207*1) (seq, batch, num_nodes * outdim)
-        # return outputs
 class DCRNNModel(BaseModel):
     def __init__(self, adj_mat, batch_size, enc_input_dim, dec_input_dim, max_diffusion_step, num_nodes,
@@ -161,9 +137,6 @@ class DCRNNModel(BaseModel):
         # self._scaler = scaler
         self._batch_size = batch_size
-        # cl_decay_steps = int(model_kwargs.get('cl_decay_steps', 1000))
-        # filter_type = model_kwargs.get('filter_type', 'laplacian')
-        # self._horizon = horizon
         # max_grad_norm parameter is actually defined in data_kwargs
         self._num_nodes = num_nodes  # should be 207
         self._num_rnn_layers = num_rnn_layers  # should be 2
@@ -187,11 +160,8 @@ class DCRNNModel(BaseModel):
         assert self.encoder.hid_dim == self.decoder.hid_dim, \
             "Hidden dimensions of encoder and decoder must be equal!"
-    def forward(self, source, target, teacher_forcing_ratio=0.5):
+    def forward(self, source, target, teacher_forcing_ratio):
         # the size of source/target would be (50, 12, 207, 2)
-        seq_length = source.shape[1]
-        batch_size = source.shape[0]
         source = torch.transpose(source, dim0=0, dim1=1)
         target = torch.transpose(target[..., :self._output_dim], dim0=0, dim1=1)
         target = torch.cat([self.GO_Symbol, target], dim=0)
@@ -202,7 +172,7 @@ class DCRNNModel(BaseModel):
         # last hidden state of the encoder is the context
         context, _ = self.encoder(source, init_hidden_state)  # (num_layers, batch, outdim)
-        outputs = self.decoder(target, context, teacher_forcing_ratio=0.5)
+        outputs = self.decoder(target, context, teacher_forcing_ratio=teacher_forcing_ratio)
         return outputs  # (seq_length+1, batch_size, num_nodes*output_dim)  (13, 50, 207*1)
     @property
......
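With this change the scheduled-sampling ratio computed by the trainer actually reaches the decoder instead of the hard-coded 0.5. A minimal sketch of how such a ratio is typically used inside a decoder loop (the names here are illustrative; the real logic lives in DCGRUDecoder.forward):

import random
import torch

def decode(cell, go_symbol, targets, hidden, teacher_forcing_ratio):
    # targets: (seq_len, batch, num_nodes * output_dim), already prefixed with the GO symbol
    current_input = go_symbol
    outputs = []
    for t in range(targets.size(0) - 1):
        output, hidden = cell(current_input, hidden)
        outputs.append(output)
        # feed the ground truth with probability teacher_forcing_ratio, else the model's own prediction
        teacher_force = random.random() < teacher_forcing_ratio
        current_input = targets[t + 1] if teacher_force else output
    return torch.stack(outputs, dim=0), hidden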
 import argparse
 import collections
 import torch
-# import data_loader.data_loaders as module_data
-# import model.loss as module_loss
-##
-# import model.metric as module_metric
 import lib.metrics as module_metric
-# import model.model as module_arch
 import model.dcrnn_model as module_arch
 from parse_config import ConfigParser
 from trainer.dcrnn_trainer import DCRNNTrainer
......
@@ -6,11 +6,20 @@ from lib import utils
 from lib import metrics
 from lib.utils import load_graph_data, count_parameters
 from lib.metrics import masked_mae_loss
-from model.dcrnn_model import DCGRUModel
-# from model.dcrnn_supervisor import DCRNNSupervisor
+from model.dcrnn_model import DCRNNModel
 import time
 import math
 from tqdm import tqdm
+import yaml
+import argparse
+import collections
+import model.loss as module_loss
+import model.metric as module_metric
+import model.model as module_arch
+from parse_config import ConfigParser

 def train(model, train_loader, epoch, optimizer, criterion, clip):
@@ -66,6 +75,7 @@ def evaluate(model, val_loader, epoch, criterion):
     return epoch_loss / cnt

 def test(model, test_loader, scaler):
     model.eval()
     y_preds = torch.FloatTensor([])
@@ -105,8 +115,8 @@ def test(model, test_loader, scaler):
 if __name__ == '__main__':
     # Parameter setting
     # Data parameters
-    batch_size = 200
-    graph_pkl_filename = 'data/sensor_graph/adj_mx.pkl'
+    batch_size = 50
+    graph_pkl_filename = 'data/sensor_graph/adj_mx_unix.pkl'
     # Model parameters
     horizon = 12
     input_dim = 2
@@ -149,10 +159,10 @@ if __name__ == '__main__':
     test_data_loader = data['test_loader']
     # Initialize model
-    model = DCGRUModel(batch_size=batch_size, enc_input_dim=input_dim, dec_input_dim=output_dim,
+    model = DCRNNModel(batch_size=batch_size, enc_input_dim=input_dim, dec_input_dim=output_dim,
                        adj_mat=adj_mat, max_diffusion_step=max_diffusion_step,
                        num_nodes=num_nodes, num_rnn_layers=num_rnn_layers,
-                       rnn_units=rnn_units, seq_len=seq_len, input_dim=input_dim, output_dim=output_dim)
+                       rnn_units=rnn_units, seq_len=seq_len, output_dim=output_dim)
     # Count number of trainable parameters
     print(f'The model has {count_parameters(model):,} trainable parameters')
     # A GPU should be available
@@ -189,10 +199,4 @@ if __name__ == '__main__':
     res = test(model, test_data_loader, scaler=scaler)
     # serialize test data
     np.savez_compressed('data/results/dcrnn_predictions.npz', **res)
-    print('Predictions saved as {}.'.format('data/results/dcrnn_predictions.npz'))
+    print('Predictions saved as {}.'.format('saved/results/dcrnn_predictions.npz'))
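A short usage note, not part of the commit: the compressed archive written above can be read back with numpy's load (the path below matches the np.savez_compressed call; adjust it if the results directory differs).

import numpy as np

res = np.load('data/results/dcrnn_predictions.npz')
print(res.files)                # names of the arrays stored in the archive
for key in res.files:
    print(key, res[key].shape)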
 import numpy as np
 import torch
 from base import BaseTrainer
+import math
 # from lib.utils import inf_loop
@@ -15,11 +16,14 @@ class DCRNNTrainer(BaseTrainer):
         self.data_loader = data_loader
         self.len_epoch = len_epoch
         self.val_len_epoch = val_len_epoch
+        self.cl_decay_steps = config["trainer"]["cl_decay_steps"]
+        self.max_grad_norm = config["trainer"]["max_grad_norm"]
         self.valid_data_loader = valid_data_loader
         self.do_validation = self.valid_data_loader is not None
         self.lr_scheduler = lr_scheduler
-        self.log_step = int(np.sqrt(data_loader.batch_size))  # sqrt(128) sqrt(64)
+        self.log_step = int(20)
+        # self.log_step = int(np.sqrt(data_loader.batch_size))  # sqrt(128) sqrt(64)

     def _eval_metrics(self, output, target):
         acc_metrics = np.zeros(len(self.metrics))
@@ -55,13 +59,19 @@ class DCRNNTrainer(BaseTrainer):
             data, target = data.to(self.device), target.to(self.device)
             self.optimizer.zero_grad()
-            output = self.model(data, target)
+            # compute the sampling ratio, which gradually decays to 0 during training
+            global_step = (epoch - 1) * self.len_epoch + batch_idx
+            teacher_forcing_ratio = self._compute_sampling_threshold(global_step, self.cl_decay_steps)
+            output = self.model(data, target, teacher_forcing_ratio)
             output = torch.transpose(output[1:].view(12, self.model.batch_size, self.model.num_nodes,
                                                      self.model.output_dim), 0, 1)  # back to (50, 12, 207, 1)
             loss = self.loss(output.cpu(), label)  # loss is self-defined, need cpu input
             loss.backward()
-            # TODO: add grad norm clipping
+            # add max grad norm clipping
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
             self.optimizer.step()
             self.writer.set_step((epoch - 1) * self.len_epoch + batch_idx)
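A small self-contained illustration (not from the commit) of what the added clip_grad_norm_ call does: it rescales all gradients in place so that their combined L2 norm does not exceed the configured maximum.

import torch

w = torch.nn.Parameter(torch.tensor([3.0, 4.0]))
w.grad = torch.tensor([30.0, 40.0])                # gradient norm is 50
torch.nn.utils.clip_grad_norm_([w], max_norm=5.0)  # same max_grad_norm = 5 as in config.json
print(w.grad)                                      # approximately tensor([3., 4.]), i.e. rescaled to norm 5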
@@ -141,3 +151,13 @@ class DCRNNTrainer(BaseTrainer):
         current = batch_idx
         total = self.len_epoch
         return base.format(current, total, 100.0 * current / total)
+
+    @staticmethod
+    def _compute_sampling_threshold(global_step, k):
+        """
+        Compute the scheduled-sampling probability with an inverse sigmoid decay.
+        :param global_step: current global training step
+        :param k: cl_decay_steps, controlling how quickly the ratio decays towards 0
+        :return: probability of feeding the ground truth to the decoder at this step
+        """
+        return k / (k + math.exp(global_step / k))
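For intuition, a quick worked check of this inverse-sigmoid schedule (not part of the commit), using the cl_decay_steps = 2000 value added to config.json above: the teacher-forcing ratio stays close to 1 for roughly the first 10,000 steps and then falls off quickly.

import math

def sampling_threshold(global_step, k=2000):
    return k / (k + math.exp(global_step / k))

for step in (0, 5000, 10000, 15000, 20000, 25000):
    print(step, round(sampling_threshold(step), 3))
# approximately: 1.0, 0.994, 0.931, 0.525, 0.083, 0.007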