diff --git a/train.lua b/train.lua
index b6fd5dc3..9aae4ec8 100644
--- a/train.lua
+++ b/train.lua
@@ -294,8 +294,8 @@ function feval(x)
     end
     ------------------------ misc ----------------------
     -- transfer final state to initial state (BPTT)
-    init_state_global = rnn_state[#rnn_state] -- NOTE: I don't think this needs to be a clone, right?
-    -- grad_params:div(opt.seq_length) -- this line should be here but since we use rmsprop it would have no effect. Removing for efficiency
+    -- NOTE: the line below actually needs a clone. Otherwise, at t=1 during backpropagation, rnn_state[0] will be equal to the init_state_global of the next batch, which differs from the one used in the forward pass.
+    init_state_global = clone_list(rnn_state[#rnn_state])
     -- clip gradient element-wise
     grad_params:clamp(-opt.grad_clip, opt.grad_clip)
     return loss, grad_params
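
For context on the fix, here is a minimal Torch sketch of the aliasing that the clone avoids. It is illustrative only, not code from train.lua; the tensor names are made up and it assumes the torch package is available:

-- Plain assignment keeps a reference to the same tensor storage, so when the
-- next batch's forward pass overwrites the hidden state in place, the
-- "initial state" consulted at t=1 of the backward pass has already changed.
local torch = require 'torch'

local final_state = torch.zeros(3)        -- stands in for rnn_state[#rnn_state]
local init_alias  = final_state           -- old code: alias, shares storage
local init_copy   = final_state:clone()   -- fixed code: independent copy

final_state:fill(1)                       -- next forward pass mutates the state in place
print(init_alias)                         -- 1 1 1  -> silently changed
print(init_copy)                          -- 0 0 0  -> preserved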