yan / DCRNN · Commit c1120241
Authored Aug 08, 2019 by xlwang
Parent: 9e69dbc2

add max grad clipping, learning rate decay and scheduled sampling

Showing 7 changed files with 56 additions and 62 deletions (+56 -62)
base/base_trainer.py        +11  -4
config.json                  +4  -5
lib/metrics.py               +0  -1
model/dcrnn_model.py         +2  -32
train.py                     +0  -5
train_DCRNN.py              +16  -12
trainer/dcrnn_trainer.py    +23  -3
base/base_trainer.py
@@ -19,12 +19,12 @@ class BaseTrainer:
         self.model = torch.nn.DataParallel(model, device_ids=device_ids)
         self.loss = loss
-        self.metrics = metrics
+        self.metrics = metrics  # a list of metric functions defined in metric.py
         self.optimizer = optimizer

         cfg_trainer = config['trainer']
         self.epochs = cfg_trainer['epochs']
-        self.save_period = cfg_trainer['save_period']
+        self.save_period = cfg_trainer['save_period']  # should be 1
         self.monitor = cfg_trainer.get('monitor', 'off')  # configuration to monitor model performance and save best
@@ -64,7 +64,7 @@ class BaseTrainer:
         for epoch in range(self.start_epoch, self.epochs + 1):
             result = self._train_epoch(epoch)

-            # save logged informations into log dict
+            # save logged information into log dict
             log = {'epoch': epoch}
             for key, value in result.items():
                 if key == 'metrics':
@@ -74,7 +74,14 @@ class BaseTrainer:
                 else:
                     log[key] = value

-            # print logged informations to the screen
+            # There is a chance that the training loss will explode, the temporary workaround
+            # is to restart from the last saved model before the explosion, or to decrease
+            # the learning rate earlier in the learning rate schedule.
+            if log['loss'] > 1e5:
+                self.logger.warning('Gradient explosion detected. Ending...')
+                break
+
+            # print logged information to the screen
             for key, value in log.items():
                 self.logger.info('    {:15s}: {}'.format(str(key), value))
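The new comment suggests restarting from the last saved model as a workaround when the loss explodes. A minimal, self-contained sketch of what such a restart typically looks like in a PyTorch project (the checkpoint filename and dictionary keys below are assumptions for illustration, not taken from this repository):

import torch

# Hypothetical checkpoint round-trip: save after a good epoch, reload to resume after an explosion.
model = torch.nn.Linear(4, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
torch.save({'epoch': 42,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()}, 'checkpoint-epoch42.pth')

checkpoint = torch.load('checkpoint-epoch42.pth', map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
start_epoch = checkpoint['epoch'] + 1  # continue training from the epoch after the checkpoint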
config.json
@@ -47,22 +47,21 @@
        "masked_mae_np",
        "masked_mape_np",
        "masked_rmse_np"
    ],
    "lr_scheduler": {
-        "type": "StepLR",
+        "type": "MultiStepLR",
        "args": {
-            "step_size": 50,
+            "milestones": [20, 30, 40, 50],
            "gamma": 0.1
        }
    },
    "trainer": {
        "epochs": 100,
+        "cl_decay_steps": 2000,
        "save_dir": "saved/",
        "save_period": 1,
        "verbosity": 2,
+        "max_grad_norm": 5,
        "monitor": "min val_loss",
        "early_stop": 10,
        "tensorboard": true
    }
 }
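For reference, the switch from StepLR to MultiStepLR means the learning rate is multiplied by gamma at the listed milestone epochs rather than every fixed number of epochs. A minimal sketch of how this block maps onto torch.optim.lr_scheduler.MultiStepLR (the placeholder model and base learning rate are assumptions; the code that wires the config to the scheduler is not part of this diff):

import torch

model = torch.nn.Linear(4, 1)                                   # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20, 30, 40, 50], gamma=0.1)

for epoch in range(1, 101):
    # ... one epoch of training would go here ...
    optimizer.step()
    scheduler.step()  # lr: 0.01 -> 0.001 after epoch 20 -> 1e-4 after 30 -> 1e-5 after 40 -> 1e-6 after 50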
lib/metrics.py
 import numpy as np
-import tensorflow as tf
 import torch
model/dcrnn_model.py
@@ -128,30 +128,6 @@ class DCGRUDecoder(BaseModel):
         return outputs

-        # first input to the decoder is the GO Symbol
-        # current_inputs = inputs[0]  # (1, 50, 207*1)
-        # output_hidden = []  # the output hidden states, shape (num_layers, batch, outdim)
-        # for i_layer in range(self._num_rnn_layers-1):
-        #     hidden_state = initial_hidden_state[i_layer]
-        #     output_inner = []
-        #     current_inputs = inputs[0]
-        #     for t in range(0, seq_length-1):
-        #         output, hidden_state = self.decoding_cells[i_layer](current_inputs, hidden_state)  # (50, 207*64)
-        #         # output and hidden_state should be the same
-        #
-        #         output_inner.append(hidden_state)
-        #         teacher_force = random.random() < teacher_forcing_ratio
-        #         current_inputs = (inputs[t] if teacher_force else hidden_state)
-        #     output_hidden.append(hidden_state)
-        #     inputs = torch.stack(output_inner, dim=0)  # seq_len, B, ...
-        #
-        # # last layer. still need to go through all time steps
-        # hidden_state = initial_hidden_state[self._num_rnn_layers-1]
-        # for t in range(seq_length):
-        #     outputs, hidden_state = self.decoding_cells[self._num_rnn_layers-1](current_inputs[t, ...], hidden_state)
-        #     # output shape should be (12, 50, 207*1)  (seq, batch, num_nodes * outdim)
-        # return outputs

 class DCRNNModel(BaseModel):
     def __init__(self, adj_mat, batch_size, enc_input_dim, dec_input_dim, max_diffusion_step, num_nodes,
@@ -161,9 +137,6 @@ class DCRNNModel(BaseModel):
         # self._scaler = scaler
         self._batch_size = batch_size
-        # cl_decay_steps = int(model_kwargs.get('cl_decay_steps', 1000))
-        # filter_type = model_kwargs.get('filter_type', 'laplacian')
-        # self._horizon = horizon
         # max_grad_norm parameter is actually defined in data_kwargs
         self._num_nodes = num_nodes  # should be 207
         self._num_rnn_layers = num_rnn_layers  # should be 2
@@ -187,11 +160,8 @@ class DCRNNModel(BaseModel):
         assert self.encoder.hid_dim == self.decoder.hid_dim, \
             "Hidden dimensions of encoder and decoder must be equal!"

-    def forward(self, source, target, teacher_forcing_ratio=0.5):
+    def forward(self, source, target, teacher_forcing_ratio):
         # the size of source/target would be (50, 12, 207, 2)
         seq_length = source.shape[1]
         batch_size = source.shape[0]
         source = torch.transpose(source, dim0=0, dim1=1)
         target = torch.transpose(target[..., :self._output_dim], dim0=0, dim1=1)
         target = torch.cat([self.GO_Symbol, target], dim=0)
@@ -202,7 +172,7 @@ class DCRNNModel(BaseModel):
         # last hidden state of the encoder is the context
         context, _ = self.encoder(source, init_hidden_state)  # (num_layers, batch, outdim)

-        outputs = self.decoder(target, context, teacher_forcing_ratio=0.5)
+        outputs = self.decoder(target, context, teacher_forcing_ratio=teacher_forcing_ratio)
         return outputs  # (seq_length+1, batch_size, num_nodes*output_dim)  (13, 50, 207*1)

     @property
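The forward pass now takes teacher_forcing_ratio from the caller instead of hard-coding 0.5. As a generic sketch of how such a ratio is consumed inside a seq2seq decoder loop (this mirrors the commented-out decoder code deleted above, but uses a plain GRUCell as a stand-in for the DCGRU cell, so the shapes and names are illustrative only):

import random
import torch

seq_len, batch, dim = 12, 8, 16
cell = torch.nn.GRUCell(dim, dim)                # stand-in for a DCGRU cell
targets = torch.randn(seq_len, batch, dim)       # ground-truth sequence, time-major
current_input = torch.zeros(batch, dim)          # GO symbol
hidden = torch.zeros(batch, dim)
teacher_forcing_ratio = 0.5

outputs = []
for t in range(seq_len):
    hidden = cell(current_input, hidden)
    outputs.append(hidden)
    # feed ground truth with probability teacher_forcing_ratio, otherwise feed the model's own output
    use_teacher = random.random() < teacher_forcing_ratio
    current_input = targets[t] if use_teacher else hidden
outputs = torch.stack(outputs, dim=0)            # (seq_len, batch, dim)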
train.py
 import argparse
 import collections
 import torch
-# import data_loader.data_loaders as module_data
-# import model.loss as module_loss
-##
-# import model.metric as module_metric
 import lib.metrics as module_metric
-# import model.model as module_arch
 import model.dcrnn_model as module_arch
 from parse_config import ConfigParser
 from trainer.dcrnn_trainer import DCRNNTrainer
train_DCRNN.py
@@ -6,11 +6,20 @@ from lib import utils
 from lib import metrics
 from lib.utils import load_graph_data, count_parameters
 from lib.metrics import masked_mae_loss
-from model.dcrnn_model import DCGRUModel
+from model.dcrnn_model import DCRNNModel
 # from model.dcrnn_supervisor import DCRNNSupervisor
 import time
 import math
 from tqdm import tqdm
 import yaml
+import argparse
+import collections
+import model.loss as module_loss
+import model.metric as module_metric
+import model.model as module_arch
+from parse_config import ConfigParser


 def train(model, train_loader, epoch, optimizer, criterion, clip):
@@ -66,6 +75,7 @@ def evaluate(model, val_loader, epoch, criterion):
     return epoch_loss / cnt


 def test(model, test_loader, scaler):
     model.eval()
     y_preds = torch.FloatTensor([])
@@ -105,8 +115,8 @@ def test(model, test_loader, scaler):
 if __name__ == '__main__':
     # Parameter setting
     # Data parameters
-    batch_size = 200
-    graph_pkl_filename = 'data/sensor_graph/adj_mx.pkl'
+    batch_size = 50
+    graph_pkl_filename = 'data/sensor_graph/adj_mx_unix.pkl'
     # Model parameters
     horizon = 12
     input_dim = 2
@@ -149,10 +159,10 @@ if __name__ == '__main__':
     test_data_loader = data['test_loader']

     # Initialize model
-    model = DCGRUModel(batch_size=batch_size, enc_input_dim=input_dim, dec_input_dim=output_dim,
+    model = DCRNNModel(batch_size=batch_size, enc_input_dim=input_dim, dec_input_dim=output_dim,
                        adj_mat=adj_mat, max_diffusion_step=max_diffusion_step, num_nodes=num_nodes, num_rnn_layers=num_rnn_layers,
-                       rnn_units=rnn_units, seq_len=seq_len, input_dim=input_dim, output_dim=output_dim)
+                       rnn_units=rnn_units, seq_len=seq_len, output_dim=output_dim)

     # Count number of trainable parameters
     print(f'The model has {count_parameters(model):,} trainable parameters')

     # A GPU should be available
@@ -189,10 +199,4 @@ if __name__ == '__main__':
     res = test(model, test_data_loader, scaler=scaler)
     # serialize test data
     np.savez_compressed('data/results/dcrnn_predictions.npz', **res)
-    print('Predictions saved as {}.'.format('data/results/dcrnn_predictions.npz'))
+    print('Predictions saved as {}.'.format('saved/results/dcrnn_predictions.npz'))
trainer/dcrnn_trainer.py
 import numpy as np
 import torch
 from base import BaseTrainer
+import math
 # from lib.utils import inf_loop
@@ -15,11 +16,14 @@ class DCRNNTrainer(BaseTrainer):
         self.data_loader = data_loader
         self.len_epoch = len_epoch
         self.val_len_epoch = val_len_epoch
+        self.cl_decay_steps = config["trainer"]["cl_decay_steps"]
+        self.max_grad_norm = config["trainer"]["max_grad_norm"]
         self.valid_data_loader = valid_data_loader
         self.do_validation = self.valid_data_loader is not None
         self.lr_scheduler = lr_scheduler
-        self.log_step = int(np.sqrt(data_loader.batch_size))  # sqrt(128) sqrt(64)
+        self.log_step = int(20)
+        # self.log_step = int(np.sqrt(data_loader.batch_size))  # sqrt(128) sqrt(64)

     def _eval_metrics(self, output, target):
         acc_metrics = np.zeros(len(self.metrics))
@@ -55,13 +59,19 @@ class DCRNNTrainer(BaseTrainer):
             data, target = data.to(self.device), target.to(self.device)

             self.optimizer.zero_grad()
-            output = self.model(data, target)
+            # compute sampling ratio, which gradually decay to 0 during training
+            global_step = (epoch - 1) * self.len_epoch + batch_idx
+            teacher_forcing_ratio = self._compute_sampling_threshold(global_step, self.cl_decay_steps)
+            output = self.model(data, target, teacher_forcing_ratio)
             output = torch.transpose(output[1:].view(12, self.model.batch_size, self.model.num_nodes,
                                                      self.model.output_dim), 0, 1)  # back to (50, 12, 207, 1)
             loss = self.loss(output.cpu(), label)  # loss is self-defined, need cpu input
             loss.backward()
-            # TODO: add grad norm clipping
+            # add max grad clipping
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
             self.optimizer.step()

             self.writer.set_step((epoch - 1) * self.len_epoch + batch_idx)
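One note on the new clipping call: torch.nn.utils.clip_grad_norm_ rescales the parameter gradients in place so their total norm does not exceed max_norm, and it returns the total norm computed before clipping, which can be useful to log. A small standalone example (the toy model here is an assumption, not code from this repository):

import torch

model = torch.nn.Linear(4, 1)
loss = model(torch.randn(8, 4)).sum()
loss.backward()

# gradients are modified in place; the returned value is the pre-clipping total norm
total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
print(f'gradient norm before clipping: {float(total_norm):.4f}')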
@@ -141,3 +151,13 @@ class DCRNNTrainer(BaseTrainer):
             current = batch_idx
             total = self.len_epoch
         return base.format(current, total, 100.0 * current / total)
+
+    @staticmethod
+    def _compute_sampling_threshold(global_step, k):
+        """
+        Computes the sampling probability for scheduled sampling using inverse sigmoid.
+        :param global_step:
+        :param k:
+        :return:
+        """
+        return k / (k + math.exp(global_step / k))
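As a quick sanity check of this inverse-sigmoid schedule with the cl_decay_steps value of 2000 set in config.json, the sampling probability stays close to 1 for the first several thousand global steps and then decays toward 0:

import math

k = 2000  # cl_decay_steps from config.json
for step in (0, 2000, 10000, 20000, 30000):
    print(step, round(k / (k + math.exp(step / k)), 4))
# 0      -> 0.9995  (decoder is almost always fed ground truth early in training)
# 2000   -> 0.9986
# 10000  -> 0.9309
# 20000  -> 0.0832
# 30000  -> 0.0006  (decoder almost always feeds back its own predictions)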