Commit c1120241 by xlwang

add max grad clipping, learning rate decay and scheduled sampling

parent 9e69dbc2
......@@ -19,12 +19,12 @@ class BaseTrainer:
self.model = torch.nn.DataParallel(model, device_ids=device_ids)
self.loss = loss
self.metrics = metrics
self.metrics = metrics # a list of metric functions defined in metric.py
self.optimizer = optimizer
cfg_trainer = config['trainer']
self.epochs = cfg_trainer['epochs']
self.save_period = cfg_trainer['save_period']
self.save_period = cfg_trainer['save_period'] # should be 1
self.monitor = cfg_trainer.get('monitor', 'off')
# configuration to monitor model performance and save best
......@@ -64,7 +64,7 @@ class BaseTrainer:
for epoch in range(self.start_epoch, self.epochs + 1):
result = self._train_epoch(epoch)
# save logged informations into log dict
# save logged information into log dict
log = {'epoch': epoch}
for key, value in result.items():
if key == 'metrics':
......@@ -74,7 +74,14 @@ class BaseTrainer:
else:
log[key] = value
# print logged informations to the screen
# There is a chance that the training loss will explode; the temporary workaround
# is to restart from the last saved model before the explosion, or to decrease
# the learning rate earlier in the learning rate schedule.
if log['loss'] > 1e5:
self.logger.warning('Gradient explosion detected. Ending...')
break
# print logged information to the screen
for key, value in log.items():
self.logger.info(' {:15s}: {}'.format(str(key), value))
......
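As an aside, the "restart from the last saved model" workaround mentioned in the comment above could look like the minimal sketch below. The checkpoint path and the 'state_dict'/'optimizer' keys are assumptions (following the usual pytorch-template checkpoint layout), not something this commit defines:

```python
import torch

def resume_from_checkpoint(model, optimizer, path='saved/checkpoint-latest.pth'):
    """Reload model/optimizer state from the last checkpoint saved before the explosion."""
    checkpoint = torch.load(path, map_location='cpu')   # hypothetical path
    model.load_state_dict(checkpoint['state_dict'])     # assumed checkpoint keys
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint.get('epoch', 0) + 1               # epoch to resume training from
```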
......@@ -47,22 +47,21 @@
"masked_mae_np", "masked_mape_np", "masked_rmse_np"
],
"lr_scheduler": {
"type": "StepLR",
"type": "MultiStepLR",
"args": {
"step_size": 50,
"milestones": [20, 30, 40, 50],
"gamma": 0.1
}
},
"trainer": {
"epochs": 100,
"cl_decay_steps": 2000,
"save_dir": "saved/",
"save_period": 1,
"verbosity": 2,
"max_grad_norm": 5,
"monitor": "min val_loss",
"early_stop": 10,
"tensorboard": true
}
}
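For reference, a minimal sketch of how the new "lr_scheduler" block above maps onto torch.optim.lr_scheduler.MultiStepLR; the model and optimizer here are stand-ins, not the project's:

```python
import torch
from torch.optim.lr_scheduler import MultiStepLR

model = torch.nn.Linear(2, 1)                                    # stand-in model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = MultiStepLR(optimizer, milestones=[20, 30, 40, 50], gamma=0.1)

for epoch in range(1, 101):                                      # "epochs": 100
    # ... train and validate one epoch ...
    scheduler.step()  # lr is multiplied by 0.1 after epochs 20, 30, 40 and 50
```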
import numpy as np
import tensorflow as tf
import torch
......
......@@ -128,30 +128,6 @@ class DCGRUDecoder(BaseModel):
return outputs
# first input to the decoder is the GO Symbol
# current_inputs = inputs[0] # (1, 50, 207*1)
# output_hidden = [] # the output hidden states, shape (num_layers, batch, outdim)
# for i_layer in range(self._num_rnn_layers-1):
# hidden_state = initial_hidden_state[i_layer]
# output_inner = []
# current_inputs = inputs[0]
# for t in range(0, seq_length-1):
# output, hidden_state = self.decoding_cells[i_layer](current_inputs, hidden_state) # (50, 207*64)
# # output and hidden_state should be the same
#
# output_inner.append(hidden_state)
# teacher_force = random.random() < teacher_forcing_ratio
# current_inputs = (inputs[t] if teacher_force else hidden_state)
# output_hidden.append(hidden_state)
# inputs = torch.stack(output_inner, dim=0) # seq_len, B, ...
#
# # last layer. still need to go through all time steps
# hidden_state = initial_hidden_state[self._num_rnn_layers-1]
# for t in range(seq_length):
# outputs, hidden_state = self.decoding_cells[self._num_rnn_layers-1](current_inputs[t, ...], hidden_state)
# # output shape should be (12, 50, 207*1) (seq, batch, num_nodes * outdim)
# return outputs
class DCRNNModel(BaseModel):
def __init__(self, adj_mat, batch_size, enc_input_dim, dec_input_dim, max_diffusion_step, num_nodes,
......@@ -161,9 +137,6 @@ class DCRNNModel(BaseModel):
# self._scaler = scaler
self._batch_size = batch_size
# cl_decay_steps = int(model_kwargs.get('cl_decay_steps', 1000))
# filter_type = model_kwargs.get('filter_type', 'laplacian')
# self._horizon = horizon
# max_grad_norm parameter is actually defined in data_kwargs
self._num_nodes = num_nodes # should be 207
self._num_rnn_layers = num_rnn_layers # should be 2
......@@ -187,11 +160,8 @@ class DCRNNModel(BaseModel):
assert self.encoder.hid_dim == self.decoder.hid_dim, \
"Hidden dimensions of encoder and decoder must be equal!"
def forward(self, source, target, teacher_forcing_ratio=0.5):
def forward(self, source, target, teacher_forcing_ratio):
# the size of source/target would be (50, 12, 207, 2)
seq_length = source.shape[1]
batch_size = source.shape[0]
source = torch.transpose(source, dim0=0, dim1=1)
target = torch.transpose(target[..., :self._output_dim], dim0=0, dim1=1)
target = torch.cat([self.GO_Symbol, target], dim=0)
......@@ -202,7 +172,7 @@ class DCRNNModel(BaseModel):
# last hidden state of the encoder is the context
context, _ = self.encoder(source, init_hidden_state) # (num_layers, batch, outdim)
outputs = self.decoder(target, context, teacher_forcing_ratio=0.5)
outputs = self.decoder(target, context, teacher_forcing_ratio=teacher_forcing_ratio)
return outputs # (seq_length+1, batch_size, num_nodes*output_dim) (13, 50, 207*1)
@property
......
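For context, the per-step decision that teacher_forcing_ratio controls looks roughly like the sketch below, mirroring the commented-out decoder loop above; the names are illustrative, not the project's API:

```python
import random

def next_decoder_input(ground_truth_t, previous_output, teacher_forcing_ratio):
    # With probability teacher_forcing_ratio, feed the ground truth for step t;
    # otherwise feed the decoder's own previous output.
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    return ground_truth_t if use_teacher_forcing else previous_output
```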
import argparse
import collections
import torch
# import data_loader.data_loaders as module_data
# import model.loss as module_loss
##
# import model.metric as module_metric
import lib.metrics as module_metric
# import model.model as module_arch
import model.dcrnn_model as module_arch
from parse_config import ConfigParser
from trainer.dcrnn_trainer import DCRNNTrainer
......
......@@ -6,11 +6,20 @@ from lib import utils
from lib import metrics
from lib.utils import load_graph_data, count_parameters
from lib.metrics import masked_mae_loss
from model.dcrnn_model import DCGRUModel
from model.dcrnn_model import DCRNNModel
# from model.dcrnn_supervisor import DCRNNSupervisor
import time
import math
from tqdm import tqdm
import yaml
import argparse
import collections
import model.loss as module_loss
import model.metric as module_metric
import model.model as module_arch
from parse_config import ConfigParser
def train(model, train_loader, epoch, optimizer, criterion, clip):
......@@ -66,6 +75,7 @@ def evaluate(model, val_loader, epoch, criterion):
return epoch_loss / cnt
def test(model, test_loader, scaler):
model.eval()
y_preds = torch.FloatTensor([])
......@@ -105,8 +115,8 @@ def test(model, test_loader, scaler):
if __name__ == '__main__':
# Parameter setting
# Data parameters
batch_size = 200
graph_pkl_filename = 'data/sensor_graph/adj_mx.pkl'
batch_size = 50
graph_pkl_filename = 'data/sensor_graph/adj_mx_unix.pkl'
# Model parameters
horizon = 12
input_dim = 2
......@@ -149,10 +159,10 @@ if __name__ == '__main__':
test_data_loader = data['test_loader']
# Initialize model
model = DCGRUModel(batch_size=batch_size, enc_input_dim=input_dim, dec_input_dim=output_dim,
model = DCRNNModel(batch_size=batch_size, enc_input_dim=input_dim, dec_input_dim=output_dim,
adj_mat=adj_mat, max_diffusion_step=max_diffusion_step,
num_nodes=num_nodes, num_rnn_layers=num_rnn_layers,
rnn_units=rnn_units, seq_len=seq_len, input_dim=input_dim, output_dim=output_dim)
rnn_units=rnn_units, seq_len=seq_len, output_dim=output_dim)
# Count number of trainable parameters
print(f'The model has {count_parameters(model):,} trainable parameters')
# A GPU should be available
......@@ -189,10 +199,4 @@ if __name__ == '__main__':
res = test(model, test_data_loader, scaler=scaler)
# serialize test data
np.savez_compressed('data/results/dcrnn_predictions.npz', **res)
print('Predictions saved as {}.'.format('data/results/dcrnn_predictions.npz'))
print('Predictions saved as {}.'.format('saved/results/dcrnn_predictions.npz'))
import numpy as np
import torch
from base import BaseTrainer
import math
# from lib.utils import inf_loop
......@@ -15,11 +16,14 @@ class DCRNNTrainer(BaseTrainer):
self.data_loader = data_loader
self.len_epoch = len_epoch
self.val_len_epoch = val_len_epoch
self.cl_decay_steps = config["trainer"]["cl_decay_steps"]
self.max_grad_norm = config["trainer"]["max_grad_norm"]
self.valid_data_loader = valid_data_loader
self.do_validation = self.valid_data_loader is not None
self.lr_scheduler = lr_scheduler
self.log_step = int(np.sqrt(data_loader.batch_size)) # sqrt(128) sqrt(64)
self.log_step = int(20)
# self.log_step = int(np.sqrt(data_loader.batch_size)) # sqrt(128) sqrt(64)
def _eval_metrics(self, output, target):
acc_metrics = np.zeros(len(self.metrics))
......@@ -55,13 +59,19 @@ class DCRNNTrainer(BaseTrainer):
data, target = data.to(self.device), target.to(self.device)
self.optimizer.zero_grad()
output = self.model(data, target)
# compute the sampling ratio, which gradually decays to 0 during training
global_step = (epoch - 1) * self.len_epoch + batch_idx
teacher_forcing_ratio = self._compute_sampling_threshold(global_step, self.cl_decay_steps)
output = self.model(data, target, teacher_forcing_ratio)
output = torch.transpose(output[1:].view(12, self.model.batch_size, self.model.num_nodes,
self.model.output_dim), 0, 1) # back to (50, 12, 207, 1)
loss = self.loss(output.cpu(), label) # loss is self-defined, need cpu input
loss.backward()
# TODO: add grad norm clipping
# add max grad clipping
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
self.optimizer.step()
self.writer.set_step((epoch - 1) * self.len_epoch + batch_idx)
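A note on the clipping call added above: torch.nn.utils.clip_grad_norm_ rescales the gradients in place so that their total norm does not exceed max_grad_norm, and it returns the norm measured before clipping. A self-contained sketch with a stand-in model (not the project's trainer):

```python
import torch

model = torch.nn.Linear(2, 1)                       # stand-in model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
max_grad_norm = 5                                   # "max_grad_norm": 5 in the config

loss = model(torch.randn(4, 2)).mean()
loss.backward()
# rescale gradients in place; returns the total norm before clipping
total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
optimizer.step()
```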
......@@ -141,3 +151,13 @@ class DCRNNTrainer(BaseTrainer):
current = batch_idx
total = self.len_epoch
return base.format(current, total, 100.0 * current / total)
@staticmethod
def _compute_sampling_threshold(global_step, k):
"""
Computes the sampling probability for scheduled sampling using inverse sigmoid.
:param global_step:
:param k:
:return:
"""
return k / (k + math.exp(global_step / k))
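To illustrate the decay: with cl_decay_steps = 2000, the value added to the config above, the probability of feeding the ground truth stays close to 1 for roughly the first 10k global steps and then falls off. A quick check:

```python
import math

def compute_sampling_threshold(global_step, k):
    # same inverse sigmoid decay as the staticmethod above
    return k / (k + math.exp(global_step / k))

for step in (0, 10000, 15000, 20000, 25000):
    print(step, round(compute_sampling_threshold(step, 2000), 3))
# approx: 1.0, 0.931, 0.525, 0.083, 0.007
```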