diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua index 3a604c1..2319594 100644 --- a/examples/noise-contrastive-estimate.lua +++ b/examples/noise-contrastive-estimate.lua @@ -221,7 +221,6 @@ while opt.maxepoch <= 0 or epoch <= opt.maxepoch do sumErr = sumErr + err -- backward local gradOutputs = criterion:backward(outputs, targets) - local a = torch.Timer() lm:zeroGradParameters() lm:backward(inputs, gradOutputs) diff --git a/examples/recurrent-language-model.lua b/examples/recurrent-language-model.lua index 7044aa2..09b3b42 100644 --- a/examples/recurrent-language-model.lua +++ b/examples/recurrent-language-model.lua @@ -94,20 +94,19 @@ for i,hiddensize in ipairs(opt.hiddensize) do if opt.gru then -- Gated Recurrent Units rnn = nn.GRU(inputsize, hiddensize, nil, opt.dropout/2) elseif opt.lstm then -- Long Short Term Memory units - require 'nngraph' - nn.FastLSTM.usenngraph = true -- faster - nn.FastLSTM.bn = opt.bn - rnn = nn.FastLSTM(inputsize, hiddensize) + rnn = nn.RecLSTM(inputsize, hiddensize) elseif opt.mfru then -- Multi Function Recurrent Unit rnn = nn.MuFuRu(inputsize, hiddensize) - else -- simple recurrent neural network + elseif i == 1 then -- simple recurrent neural network local rm = nn.Sequential() -- input is {x[t], h[t-1]} :add(nn.ParallelTable() - :add(i==1 and nn.Identity() or nn.Linear(inputsize, hiddensize)) -- input layer + :add(nn.Identity()) -- input layer :add(nn.Linear(hiddensize, hiddensize))) -- recurrent layer :add(nn.CAddTable()) -- merge :add(nn.Sigmoid()) -- transfer rnn = nn.Recurrence(rm, hiddensize, 1) + else + rnn = nn.LinearRNN(hiddensize, hiddensize) end stepmodule:add(rnn) diff --git a/examples/recurrent-time-series.lua b/examples/recurrent-time-series.lua index 4f47993..d8221e4 100644 --- a/examples/recurrent-time-series.lua +++ b/examples/recurrent-time-series.lua @@ -1,11 +1,11 @@ --- Multi-variate time-series example +-- Multi-variate time-series example require 'rnn' cmd = torch.CmdLine() cmd:text() cmd:text('Train a multivariate time-series model using RNN') -cmd:option('--rho', 5, 'maximum number of time steps for back-propagate through time (BPTT)') +cmd:option('--seqlen', 5, 'maximum number of time steps for back-propagate through time (BPTT)') cmd:option('--multiSize', 6, 'number of random variables as input and output') cmd:option('--hiddenSize', 10, 'number of hidden units used at output of the recurrent layer') cmd:option('--dataSize', 100, 'total number of time-steps in dataset') @@ -53,20 +53,12 @@ print('Sequence:'); print(sequence) offsets = torch.LongTensor(opt.batchSize):random(1,opt.dataSize) --- RNN -r = nn.Recurrent( - opt.hiddenSize, -- size of output - nn.Linear(opt.multiSize, opt.hiddenSize), -- input layer - nn.Linear(opt.hiddenSize, opt.hiddenSize), -- recurrent layer - nn.Sigmoid(), -- transfer function - opt.rho -) - +-- Simple RNN rnn = nn.Sequential() - :add(r) + :add(nn.LinearRNN(opt.multiSize, opt.hiddenSize)) :add(nn.Linear(opt.hiddenSize, opt.multiSize)) -criterion = nn.MSECriterion() +criterion = nn.MSECriterion() -- use Sequencer for better data handling rnn = nn.Sequencer(rnn) @@ -79,12 +71,12 @@ print(rnn) minErr = opt.multiSize -- report min error minK = 0 avgErrs = torch.Tensor(opt.nIterations):fill(0) -for k = 1, opt.nIterations do +for k = 1, opt.nIterations do + + -- 1. create a sequence of seqlen time-steps - -- 1. create a sequence of rho time-steps - local inputs, targets = {}, {} - for step = 1, opt.rho do + for step = 1, opt.seqlen do -- batch of inputs inputs[step] = inputs[step] or sequence.new() inputs[step]:index(sequence, 1, offsets) @@ -99,10 +91,10 @@ for k = 1, opt.nIterations do local outputs = rnn:forward(inputs) local err = criterion:forward(outputs, targets) - + -- report errors - - print('Iter: ' .. k .. ' Err: ' .. err) + + print('Iter: ' .. k .. ' Err: ' .. err) if opt.plot then logger:add{['Err'] = err} logger:style{['Err'] = '-'} @@ -116,14 +108,14 @@ for k = 1, opt.nIterations do end -- 3. backward sequence through rnn (i.e. backprop through time) - + rnn:zeroGradParameters() - + local gradOutputs = criterion:backward(outputs, targets) local gradInputs = rnn:backward(inputs, gradOutputs) -- 4. updates parameters - + rnn:updateParameters(opt.learningRate) end diff --git a/examples/recurrent-visual-attention.lua b/examples/recurrent-visual-attention.lua index 75e0746..b9bff24 100644 --- a/examples/recurrent-visual-attention.lua +++ b/examples/recurrent-visual-attention.lua @@ -1,13 +1,11 @@ -require 'dp' +local dl = require 'dataload' require 'rnn' +require 'optim' -- References : -- A. http://papers.nips.cc/paper/5542-recurrent-models-of-visual-attention.pdf -- B. http://incompleteideas.net/sutton/williams-92.pdf - -version = 12 - --[[command line arguments]]-- cmd = torch.CmdLine() cmd:text() @@ -15,57 +13,65 @@ cmd:text('Train a Recurrent Model for Visual Attention') cmd:text('Example:') cmd:text('$> th rnn-visual-attention.lua > results.txt') cmd:text('Options:') -cmd:option('--xpPath', '/path/to/saved_model.dat', 'path to a previously saved model') -cmd:option('--learningRate', 0.01, 'learning rate at t=0') -cmd:option('--minLR', 0.00001, 'minimum learning rate') -cmd:option('--saturateEpoch', 800, 'epoch at which linear decayed LR will reach minLR') -cmd:option('--momentum', 0.9, 'momentum') -cmd:option('--maxOutNorm', -1, 'max norm each layers output neuron weights') -cmd:option('--cutoffNorm', -1, 'max l2-norm of contatenation of all gradParam tensors') -cmd:option('--batchSize', 20, 'number of examples per batch') -cmd:option('--cuda', false, 'use CUDA') -cmd:option('--useDevice', 1, 'sets the device (GPU) to use') -cmd:option('--maxEpoch', 2000, 'maximum number of epochs to run') -cmd:option('--maxTries', 100, 'maximum number of epochs to try to find a better local minima for early-stopping') -cmd:option('--transfer', 'ReLU', 'activation function') -cmd:option('--uniform', 0.1, 'initialize parameters using uniform distribution between -uniform and uniform. -1 means default initialization') -cmd:option('--progress', false, 'print progress bar') -cmd:option('--silent', false, 'dont print anything to stdout') +cmd:option('-startlr', 0.01, 'learning rate at t=0') +cmd:option('-minlr', 0.00001, 'minimum learning rate') +cmd:option('-saturate', 800, 'epoch at which linear decayed LR will reach minLR') +cmd:option('-momentum', 0.9, 'momentum') +cmd:option('-maxnormout', -1, 'max norm each layers output neuron weights') +cmd:option('-cutoff', -1, 'max l2-norm of contatenation of all gradParam tensors') +cmd:option('-batchsize', 20, 'number of examples per batch') +cmd:option('-cuda', false, 'use CUDA') +cmd:option('-device', 1, 'sets the device (GPU) to use') +cmd:option('-maxepoch', 2000, 'maximum number of epochs to run') +cmd:option('-earlystop', 200, 'maximum number of epochs to try to find a better local minima for early-stopping') +cmd:option('-transfer', 'ReLU', 'activation function') +cmd:option('-uniform', 0.1, 'initialize parameters using uniform distribution between -uniform and uniform. -1 means default initialization') +cmd:option('-progress', false, 'print progress bar') +cmd:option('-silent', false, 'dont print anything to stdout') --[[ reinforce ]]-- -cmd:option('--rewardScale', 1, "scale of positive reward (negative is 0)") -cmd:option('--unitPixels', 13, "the locator unit (1,1) maps to pixels (13,13), or (-1,-1) maps to (-13,-13)") -cmd:option('--locatorStd', 0.11, 'stdev of gaussian location sampler (between 0 and 1) (low values may cause NaNs)') -cmd:option('--stochastic', false, 'Reinforce modules forward inputs stochastically during evaluation') +cmd:option('-rewardScale', 1, "scale of positive reward (negative is 0)") +cmd:option('-unitPixels', 13, "the locator unit (1,1) maps to pixels (13,13), or (-1,-1) maps to (-13,-13)") +cmd:option('-locatorStd', 0.11, 'stdev of gaussian location sampler (between 0 and 1) (low values may cause NaNs)') +cmd:option('-stochastic', false, 'Reinforce modules forward inputs stochastically during evaluation') --[[ glimpse layer ]]-- -cmd:option('--glimpseHiddenSize', 128, 'size of glimpse hidden layer') -cmd:option('--glimpsePatchSize', 8, 'size of glimpse patch at highest res (height = width)') -cmd:option('--glimpseScale', 2, 'scale of successive patches w.r.t. original input image') -cmd:option('--glimpseDepth', 1, 'number of concatenated downscaled patches') -cmd:option('--locatorHiddenSize', 128, 'size of locator hidden layer') -cmd:option('--imageHiddenSize', 256, 'size of hidden layer combining glimpse and locator hiddens') +cmd:option('-glimpseHiddenSize', 128, 'size of glimpse hidden layer') +cmd:option('-glimpsePatchSize', 8, 'size of glimpse patch at highest res (height = width)') +cmd:option('-glimpseScale', 2, 'scale of successive patches w.r.t. original input image') +cmd:option('-glimpseDepth', 1, 'number of concatenated downscaled patches') +cmd:option('-locatorHiddenSize', 128, 'size of locator hidden layer') +cmd:option('-imageHiddenSize', 256, 'size of hidden layer combining glimpse and locator hiddens') --[[ recurrent layer ]]-- -cmd:option('--rho', 7, 'back-propagate through time (BPTT) for rho time-steps') -cmd:option('--hiddenSize', 256, 'number of hidden units used in Simple RNN.') -cmd:option('--LSTM', false, 'use LSTM instead of linear layer') +cmd:option('-seqlen', 7, 'back-propagate through time (BPTT) for seqlen time-steps') +cmd:option('-hiddenSize', 256, 'number of hidden units used in Simple RNN.') +cmd:option('-lstm', false, 'use LSTM instead of linear layer') --[[ data ]]-- -cmd:option('--dataset', 'Mnist', 'which dataset to use : Mnist | TranslattedMnist | etc') -cmd:option('--trainEpochSize', -1, 'number of train examples seen between each epoch') -cmd:option('--validEpochSize', -1, 'number of valid examples used for early stopping and cross-validation') -cmd:option('--noTest', false, 'dont propagate through the test set') -cmd:option('--overwrite', false, 'overwrite checkpoint') +cmd:option('-dataset', 'Mnist', 'which dataset to use : Mnist | TranslattedMnist | etc') +cmd:option('-trainsize', -1, 'number of train examples seen between each epoch') +cmd:option('-validsize', -1, 'number of valid examples used for early stopping and cross-validation') +cmd:option('-noTest', false, 'dont propagate through the test set') +cmd:option('-overwrite', false, 'overwrite checkpoint') +cmd:option('-savepath', paths.concat(dl.SAVE_PATH, 'rmva'), 'path to directory where experiment log (includes model) will be saved') +cmd:option('-id', '', 'id string of this experiment (used to name output file) (defaults to a unique id)') cmd:text() local opt = cmd:parse(arg or {}) +opt.version = 13 +opt.id = opt.id == '' and ('ptb' .. ':' .. dl.uniqueid()) or opt.id if not opt.silent then table.print(opt) end +if opt.cuda then + require 'cunn' + cutorch.setDevice(opt.device) +end + --[[data]]-- -if opt.dataset == 'TranslatedMnist' then +--[[if opt.dataset == 'TranslatedMnist' then ds = torch.checkpoint( paths.concat(dp.DATA_DIR, 'checkpoint/dp.TranslatedMnist.t7'), function() return dp[opt.dataset]() end, @@ -73,186 +79,224 @@ if opt.dataset == 'TranslatedMnist' then ) else ds = dp[opt.dataset]() -end +end--]] + +assert(opt.dataset == 'Mnist') +trainset, validset, testset = dl.loadMNIST() --[[Model]]-- -if opt.xpPath ~= '' then - assert(paths.filep(opt.xpPath), opt.xpPath..' does not exist') - - if opt.cuda then - require 'cunn' - require 'optim' - cutorch.setDevice(opt.useDevice) - end - - xp = torch.load(opt.xpPath) - agent = xp:model() - local checksum = agent:parameters()[1]:sum() - xp.opt.progress = opt.progress - opt = xp.opt + +-- glimpse network (rnn input layer) +locationSensor = nn.Sequential() +locationSensor:add(nn.SelectTable(2)) +locationSensor:add(nn.Linear(2, opt.locatorHiddenSize)) +locationSensor:add(nn[opt.transfer]()) + +glimpseSensor = nn.Sequential() +glimpseSensor:add(nn.SpatialGlimpse(opt.glimpsePatchSize, opt.glimpseDepth, opt.glimpseScale):float()) +glimpseSensor:add(nn.Collapse(3)) +glimpseSensor:add(nn.Linear(trainset:isize()[1]*(opt.glimpsePatchSize^2)*opt.glimpseDepth, opt.glimpseHiddenSize)) +glimpseSensor:add(nn[opt.transfer]()) + +glimpse = nn.Sequential() +glimpse:add(nn.ConcatTable():add(locationSensor):add(glimpseSensor)) +glimpse:add(nn.JoinTable(1,1)) +glimpse:add(nn.Linear(opt.glimpseHiddenSize+opt.locatorHiddenSize, opt.imageHiddenSize)) +glimpse:add(nn[opt.transfer]()) + +-- RNN layer +if opt.lstm then + glimpse:add(nn.RecLSTM(opt.imageHiddenSize, opt.hiddenSize)) else + glimpse:add(nn.LinearRNN(opt.imageHiddenSize, opt.hiddenSize, nn[opt.transfer]())) +end - -- glimpse network (rnn input layer) - locationSensor = nn.Sequential() - locationSensor:add(nn.SelectTable(2)) - locationSensor:add(nn.Linear(2, opt.locatorHiddenSize)) - locationSensor:add(nn[opt.transfer]()) - - glimpseSensor = nn.Sequential() - glimpseSensor:add(nn.SpatialGlimpse(opt.glimpsePatchSize, opt.glimpseDepth, opt.glimpseScale):float()) - glimpseSensor:add(nn.Collapse(3)) - glimpseSensor:add(nn.Linear(ds:imageSize('c')*(opt.glimpsePatchSize^2)*opt.glimpseDepth, opt.glimpseHiddenSize)) - glimpseSensor:add(nn[opt.transfer]()) - - glimpse = nn.Sequential() - glimpse:add(nn.ConcatTable():add(locationSensor):add(glimpseSensor)) - glimpse:add(nn.JoinTable(1,1)) - glimpse:add(nn.Linear(opt.glimpseHiddenSize+opt.locatorHiddenSize, opt.imageHiddenSize)) - glimpse:add(nn[opt.transfer]()) - glimpse:add(nn.Linear(opt.imageHiddenSize, opt.hiddenSize)) - - -- rnn recurrent layer - if opt.LSTM then - recurrent = nn.RecLSTM(opt.hiddenSize, opt.hiddenSize) - else - recurrent = nn.Linear(opt.hiddenSize, opt.hiddenSize) +imageSize = trainset:isize() +assert(imageSize[2] == imageSize[3]) + +-- actions (locator) +locator = nn.Sequential() +locator:add(nn.Linear(opt.hiddenSize, 2)) +locator:add(nn.HardTanh()) -- bounds mean between -1 and 1 +locator:add(nn.ReinforceNormal(2*opt.locatorStd, opt.stochastic)) -- sample from normal, uses REINFORCE learning rule +assert(locator:get(3).stochastic == opt.stochastic, "Please update the dpnn package : luarocks install dpnn") +locator:add(nn.HardTanh()) -- bounds sample between -1 and 1 +locator:add(nn.MulConstant(opt.unitPixels*2/imageSize[2])) + +attention = nn.RecurrentAttention(glimpse, locator, opt.seqlen, {opt.hiddenSize}) + +-- model is a reinforcement learning agent +agent = nn.Sequential() +agent:add(nn.Convert()) +agent:add(attention) + +-- classifier : +agent:add(nn.SelectTable(-1)) +agent:add(nn.Linear(opt.hiddenSize, #testset.classes)) +agent:add(nn.LogSoftMax()) + +-- add the baseline reward predictor +seq = nn.Sequential() +seq:add(nn.Constant(1,1)) +seq:add(nn.Add(1)) +concat = nn.ConcatTable():add(nn.Identity()):add(seq) +concat2 = nn.ConcatTable():add(nn.Identity()):add(concat) + +-- output will be : {classpred, {classpred, basereward}} +agent:add(concat2) + +if opt.uniform > 0 then + for k,param in ipairs(agent:parameters()) do + param:uniform(-opt.uniform, opt.uniform) end +end +print("Recurrent visual attention model:") +print(agent) - -- recurrent neural network - rnn = nn.Recurrent(opt.hiddenSize, glimpse, recurrent, nn[opt.transfer](), 99999) +-- [[Criterion]] - imageSize = ds:imageSize('h') - assert(ds:imageSize('h') == ds:imageSize('w')) +criterion = nn.ParallelCriterion(true) + :add(nn.ClassNLLCriterion()) -- BACKPROP + :add(nn.VRClassReward(agent, opt.rewardScale)) -- REINFORCE - -- actions (locator) - locator = nn.Sequential() - locator:add(nn.Linear(opt.hiddenSize, 2)) - locator:add(nn.HardTanh()) -- bounds mean between -1 and 1 - locator:add(nn.ReinforceNormal(2*opt.locatorStd, opt.stochastic)) -- sample from normal, uses REINFORCE learning rule - assert(locator:get(3).stochastic == opt.stochastic, "Please update the dpnn package : luarocks install dpnn") - locator:add(nn.HardTanh()) -- bounds sample between -1 and 1 - locator:add(nn.MulConstant(opt.unitPixels*2/ds:imageSize("h"))) +targetmodule = nn.Convert() - attention = nn.RecurrentAttention(rnn, locator, opt.rho, {opt.hiddenSize}) - -- model is a reinforcement learning agent - agent = nn.Sequential() - agent:add(nn.Convert(ds:ioShapes(), 'bchw')) - agent:add(attention) +--[[ CUDA ]]-- - -- classifier : - agent:add(nn.SelectTable(-1)) - agent:add(nn.Linear(opt.hiddenSize, #ds:classes())) - agent:add(nn.LogSoftMax()) +if opt.cuda then + agent:cuda() + criterion:cuda() + targetmodule:cuda() +else + agent:float() + criterion:float() + targetmodule:float() +end - -- add the baseline reward predictor - seq = nn.Sequential() - seq:add(nn.Constant(1,1)) - seq:add(nn.Add(1)) - concat = nn.ConcatTable():add(nn.Identity()):add(seq) - concat2 = nn.ConcatTable():add(nn.Identity()):add(concat) +--[[ experiment log ]]-- + +-- is saved to file every time a new validation minima is found +local xplog = {} +xplog.opt = opt -- save all hyper-parameters and such +-- will only serialize params +xplog.model = nn.Serial(agent) +xplog.model:mediumSerial() +xplog.criterion = criterion +xplog.targetmodule = targetmodule +-- keep a log of NLL for each epoch +xplog.traincm = {} +xplog.validcm = {} +-- will be used for early-stopping +xplog.minvaliderr = 99999999 +xplog.epoch = 0 + +--[[ training loop ]]-- + +local ntrial = 0 +paths.mkdir(opt.savepath) + +local epoch = 1 +opt.lr = opt.startlr +opt.trainsize = opt.trainsize == -1 and trainset:size() or opt.trainsize +opt.validsize = opt.validsize == -1 and validset:size() or opt.validsize +while opt.maxepoch <= 0 or epoch <= opt.maxepoch do + print("") + print("Epoch #"..epoch.." :") + + local traincm = optim.ConfusionMatrix(10) + + -- 1. training + + local a = torch.Timer() + agent:training() + for i, input, target in trainset:sampleiter(opt.batchsize, opt.trainsize) do + target = targetmodule:forward(target) + -- forward + local output = agent:forward(input) + local err = criterion:forward(output, target) + traincm:batchAdd(output[1], target) + + -- backward + local gradOutput = criterion:backward(output, target) + agent:zeroGradParameters() + agent:backward(input, gradOutput) + + -- update + if opt.cutoff > 0 then + local norm = agent:gradParamClip(opt.cutoff) -- affects gradParams + opt.meanNorm = opt.meanNorm and (opt.meanNorm*0.9 + norm*0.1) or norm + end + agent:updateGradParameters(opt.momentum) -- affects gradParams + agent:updateParameters(opt.lr) -- affects params + agent:maxParamNorm(opt.maxnormout) -- affects params - -- output will be : {classpred, {classpred, basereward}} - agent:add(concat2) + if opt.progress then + xlua.progress(i, opt.trainsize) + end - if opt.uniform > 0 then - for k,param in ipairs(agent:parameters()) do - param:uniform(-opt.uniform, opt.uniform) + if i % 1000 == 0 then + collectgarbage() end + end -end ---[[Propagators]]-- -opt.decayFactor = (opt.minLR - opt.learningRate)/opt.saturateEpoch - -train = dp.Optimizer{ - loss = nn.ParallelCriterion(true) - :add(nn.ModuleCriterion(nn.ClassNLLCriterion(), nil, nn.Convert())) -- BACKPROP - :add(nn.ModuleCriterion(nn.VRClassReward(agent, opt.rewardScale), nil, nn.Convert())) -- REINFORCE - , - epoch_callback = function(model, report) -- called every epoch - if report.epoch > 0 then - opt.learningRate = opt.learningRate + opt.decayFactor - opt.learningRate = math.max(opt.minLR, opt.learningRate) - if not opt.silent then - print("learningRate", opt.learningRate) - end - end - end, - callback = function(model, report) - if opt.cutoffNorm > 0 then - local norm = model:gradParamClip(opt.cutoffNorm) -- affects gradParams - opt.meanNorm = opt.meanNorm and (opt.meanNorm*0.9 + norm*0.1) or norm - if opt.lastEpoch < report.epoch and not opt.silent then - print("mean gradParam norm", opt.meanNorm) - end + -- learning rate decay + opt.lr = opt.lr + (opt.minlr - opt.startlr)/opt.saturate + opt.lr = math.max(opt.minlr, opt.lr) + + if not opt.silent then + print("learning rate", opt.lr) + if opt.meanNorm then + print("mean gradParam norm", opt.meanNorm) end - model:updateGradParameters(opt.momentum) -- affects gradParams - model:updateParameters(opt.learningRate) -- affects params - model:maxParamNorm(opt.maxOutNorm) -- affects params - model:zeroGradParameters() -- affects gradParams - end, - feedback = dp.Confusion{output_module=nn.SelectTable(1)}, - sampler = dp.ShuffleSampler{ - epoch_size = opt.trainEpochSize, batch_size = opt.batchSize - }, - progress = opt.progress -} - - -valid = dp.Evaluator{ - feedback = dp.Confusion{output_module=nn.SelectTable(1)}, - sampler = dp.Sampler{epoch_size = opt.validEpochSize, batch_size = opt.batchSize}, - progress = opt.progress -} -if not opt.noTest then - tester = dp.Evaluator{ - feedback = dp.Confusion{output_module=nn.SelectTable(1)}, - sampler = dp.Sampler{batch_size = opt.batchSize} - } -end + end ---[[Experiment]]-- - -xp = dp.Experiment{ - model = agent, - optimizer = train, - validator = valid, - tester = tester, - observer = { - ad, - dp.FileLogger(), - dp.EarlyStopper{ - max_epochs = opt.maxTries, - error_report={'validator','feedback','confusion','accuracy'}, - maximize = true - } - }, - random_seed = os.time(), - max_epoch = opt.maxEpoch -} - ---[[GPU or CPU]]-- -if opt.cuda then - print"Using CUDA" - require 'cutorch' - require 'cunn' - cutorch.setDevice(opt.useDevice) - xp:cuda() -else - xp:float() -end + if cutorch then cutorch.synchronize() end + local speed = a:time().real/opt.trainsize + print(string.format("Speed : %f sec/batch ", speed)) -xp:verbose(not opt.silent) -if not opt.silent then - print"Agent :" - print(agent) -end + traincm:updateValids() + print("Training error : "..((1 - traincm.totalValid)*100).."%") + + xplog.traincm[epoch] = traincm + + -- 2. cross-validation + + agent:evaluate() + local validcm = optim.ConfusionMatrix(10) + for i, input, target in validset:subiter(opt.batchsize, opt.validsize) do + target = targetmodule:forward(target) + local output = agent:forward(input) + validcm:batchAdd(output[1], target) + end + + validcm:updateValids() + local validerr = 1 - validcm.totalValid + print("Validation error : "..(validerr*100).."%") + + xplog.validcm[epoch] = validcm + ntrial = ntrial + 1 + + -- early-stopping + if validerr < xplog.minvaliderr then + -- save best version of model + xplog.minvaliderr = validerr + xplog.epoch = epoch + local filename = paths.concat(opt.savepath, opt.id..'.t7') + print("Found new minima. Saving to "..filename) + torch.save(filename, xplog) + ntrial = 0 + elseif ntrial >= opt.earlystop then + print("No new minima found after "..ntrial.." epochs.") + print("Stopping experiment.") + break + end -xp.opt = opt -if checksum then - assert(math.abs(xp:model():parameters()[1]:sum() - checksum) < 0.0001, "Loaded model parameters were changed???") + collectgarbage() + epoch = epoch + 1 end -xp:run(ds) +print("Evaluate model using : ") +print("th scripts/evaluate-rva.lua -xplogpath "..paths.concat(opt.savepath, opt.id..'.t7')..(opt.cuda and ' -cuda' or '')..' -evaltest') diff --git a/examples/sequence-to-one.lua b/examples/sequence-to-one.lua index 04413b9..8200c86 100644 --- a/examples/sequence-to-one.lua +++ b/examples/sequence-to-one.lua @@ -1,8 +1,8 @@ require 'rnn' --- hyper-parameters +-- hyper-parameters batchSize = 8 -rho = 10 -- sequence length +seqlen = 10 -- sequence length hiddenSize = 100 nIndex = 100 -- input words nClass = 7 -- output classes @@ -10,16 +10,19 @@ lr = 0.1 -- build simple recurrent neural network -r = nn.Recurrent( - hiddenSize, nn.Identity(), - nn.Linear(hiddenSize, hiddenSize), nn.Sigmoid(), - rho -) +rec = nn.Recurrence( + nn.Sequential() + :add(nn.ParallelTable() + :add(nn.Identity()) + :add(nn.Linear(hiddenSize, hiddenSize))) + :add(nn.CAddTable()) + :add(nn.Sigmoid()) + , hiddenSize, 1) rnn = nn.Sequential() :add(nn.LookupTable(nIndex, hiddenSize)) :add(nn.SplitTable(1,2)) - :add(nn.Sequencer(r)) + :add(nn.Sequencer(rec)) :add(nn.SelectTable(-1)) -- this selects the last time-step of the rnn output sequence :add(nn.Linear(hiddenSize, nClass)) :add(nn.LogSoftMax()) @@ -28,21 +31,21 @@ rnn = nn.Sequential() criterion = nn.ClassNLLCriterion() --- build dummy dataset (task is to predict class given rho words) +-- build dummy dataset (task is to predict class given seqlen words) -- similar to sentiment analysis datasets ds = {} ds.size = 1000 -ds.input = torch.LongTensor(ds.size,rho) +ds.input = torch.LongTensor(ds.size,seqlen) ds.target = torch.LongTensor(ds.size):random(nClass) -- this will make the inputs somewhat correlate with the targets, -- such that the reduction in training error should be more obvious -local correlate = torch.LongTensor(nClass, rho*3):random(nClass) -local indices = torch.LongTensor(rho) +local correlate = torch.LongTensor(nClass, seqlen*3):random(nClass) +local indices = torch.LongTensor(seqlen) local buffer = torch.LongTensor() local sortVal, sortIdx = torch.LongTensor(), torch.LongTensor() for i=1,ds.size do - indices:random(1,rho*3) + indices:random(1,seqlen*3) buffer:index(correlate[ds.target[i]], 1, indices) sortVal:sort(sortIdx, buffer, 1) ds.input[i]:copy(sortVal:view(-1)) @@ -54,27 +57,27 @@ indices:resize(batchSize) -- training local inputs, targets = torch.LongTensor(), torch.LongTensor() for iteration = 1, 1000 do - -- 1. create a sequence of rho time-steps - + -- 1. create a sequence of seqlen time-steps + indices:random(1,ds.size) -- choose some random samples inputs:index(ds.input, 1,indices) targets:index(ds.target, 1,indices) - + -- 2. forward sequence through rnn - - rnn:zeroGradParameters() - + + rnn:zeroGradParameters() + local outputs = rnn:forward(inputs) local err = criterion:forward(outputs, targets) - + print(string.format("Iteration %d ; NLL err = %f ", iteration, err)) -- 3. backward sequence through rnn (i.e. backprop through time) - + local gradOutputs = criterion:backward(outputs, targets) local gradInputs = rnn:backward(inputs, gradOutputs) - + -- 4. update - + rnn:updateParameters(lr) end diff --git a/examples/simple-bisequencer-network-variable.lua b/examples/simple-bisequencer-network-variable.lua index 3426329..389ad27 100644 --- a/examples/simple-bisequencer-network-variable.lua +++ b/examples/simple-bisequencer-network-variable.lua @@ -6,7 +6,7 @@ math.randomseed(0) -- hyper-parameters batchSize = 8 -rho = 10 -- sequence length +seqlen = 10 -- sequence length hiddenSize = 5 nIndex = 10 lr = 0.1 @@ -63,16 +63,16 @@ maxStep = {} for i=1,batchSize do table.insert(offsets, math.ceil(math.random()*sequence:size(1))) -- variable length for each sample - table.insert(maxStep, math.random(rho)) + table.insert(maxStep, math.random(seqlen)) end offsets = torch.LongTensor(offsets) -- training for iteration = 1, maxIter do - -- 1. create a sequence of rho time-steps + -- 1. create a sequence of seqlen time-steps local inputs, inputs_rev, targets = {}, {}, {} - for step=1,rho do + for step=1,seqlen do -- a batch of inputs inputs[step] = sequence:index(1, offsets) -- increment indices @@ -93,7 +93,7 @@ for iteration = 1, maxIter do end -- reverse - for step=1,rho do + for step=1,seqlen do inputs_rev[step] = torch.LongTensor(batchSize) for j=1,batchSize do if step <= maxStep[j] then @@ -113,7 +113,7 @@ for iteration = 1, maxIter do local correct = 0 local total = 0 - for step=1,rho do + for step=1,seqlen do probs = outputs[step] _, preds = probs:max(2) for j=1,batchSize do diff --git a/examples/simple-bisequencer-network.lua b/examples/simple-bisequencer-network.lua index ac86405..2d87004 100644 --- a/examples/simple-bisequencer-network.lua +++ b/examples/simple-bisequencer-network.lua @@ -1,8 +1,8 @@ require 'rnn' --- hyper-parameters +-- hyper-parameters batchSize = 8 -rho = 5 -- sequence length +seqlen = 5 -- sequence length hiddenSize = 7 nIndex = 10 lr = 0.1 @@ -11,9 +11,9 @@ lr = 0.1 -- forward rnn -- build simple recurrent neural network local fwd = nn.Recurrent( - hiddenSize, nn.LookupTable(nIndex, hiddenSize), - nn.Linear(hiddenSize, hiddenSize), nn.Sigmoid(), - rho + hiddenSize, nn.LookupTable(nIndex, hiddenSize), + nn.Linear(hiddenSize, hiddenSize), nn.Sigmoid(), + seqlen ) -- backward rnn (will be applied in reverse order of input sequence) @@ -22,7 +22,7 @@ bwd:reset() -- reinitializes parameters -- merges the output of one time-step of fwd and bwd rnns. -- You could also try nn.AddTable(), nn.Identity(), etc. -local merge = nn.JoinTable(1, 1) +local merge = nn.JoinTable(1, 1) -- we use BiSequencerLM because this is a language model (previous and next words to predict current word). -- If we used BiSequencer, x[t] would be used to predict y[t] = x[t] (which is cheating). @@ -30,7 +30,7 @@ local merge = nn.JoinTable(1, 1) local brnn = nn.BiSequencerLM(fwd, bwd, merge) local rnn = nn.Sequential() - :add(brnn) + :add(brnn) :add(nn.Sequencer(nn.Linear(hiddenSize*2, nIndex))) -- times two due to JoinTable :add(nn.Sequencer(nn.LogSoftMax())) @@ -54,10 +54,10 @@ offsets = torch.LongTensor(offsets) -- training local iteration = 1 while true do - -- 1. create a sequence of rho time-steps - + -- 1. create a sequence of seqlen time-steps + local inputs, targets = {}, {} - for step=1,rho do + for step=1,seqlen do -- a batch of inputs inputs[step] = sequence:index(1, offsets) -- incement indices @@ -69,24 +69,24 @@ while true do end targets[step] = sequence:index(1, offsets) end - + -- 2. forward sequence through rnn - - rnn:zeroGradParameters() - + + rnn:zeroGradParameters() + local outputs = rnn:forward(inputs) local err = criterion:forward(outputs, targets) - + print(string.format("Iteration %d ; NLL err = %f ", iteration, err)) -- 3. backward sequence through rnn (i.e. backprop through time) - + local gradOutputs = criterion:backward(outputs, targets) local gradInputs = rnn:backward(inputs, gradOutputs) - + -- 4. update - + rnn:updateParameters(lr) - + iteration = iteration + 1 end diff --git a/examples/simple-recurrence-network.lua b/examples/simple-recurrence-network.lua index 607ca7f..c6d376e 100644 --- a/examples/simple-recurrence-network.lua +++ b/examples/simple-recurrence-network.lua @@ -1,15 +1,15 @@ -- example use of nn.Recurrence require 'rnn' --- hyper-parameters +-- hyper-parameters batchSize = 8 -rho = 5 -- sequence length +seqlen = 5 -- sequence length hiddenSize = 7 nIndex = 10 lr = 0.1 --- the internal recurrentModule used by Recurrence -local rm = nn.Sequential() -- input is {x[t], h[t-1]} +-- the internal step module used by Recurrence +local stepmodule = nn.Sequential() -- input is {x[t], h[t-1]} :add(nn.ParallelTable() :add(nn.LookupTable(nIndex, hiddenSize)) -- input layer :add(nn.Linear(hiddenSize, hiddenSize))) -- recurrent layer @@ -17,7 +17,7 @@ local rm = nn.Sequential() -- input is {x[t], h[t-1]} :add(nn.Sigmoid()) -- transfer local rnn = nn.Sequential() - :add(nn.Recurrence(rm, hiddenSize, 0)) -- similar to nn.Recurrent, but more general, and no startModule + :add(nn.Recurrence(stepmodule, hiddenSize, 0)) -- essentially the same as nn.LookupRNN :add(nn.Linear(hiddenSize, nIndex)) :add(nn.LogSoftMax()) @@ -45,10 +45,10 @@ offsets = torch.LongTensor(offsets) -- training local iteration = 1 while true do - -- 1. create a sequence of rho time-steps - + -- 1. create a sequence of seqlen time-steps + local inputs, targets = {}, {} - for step=1,rho do + for step=1,seqlen do -- a batch of inputs inputs[step] = sequence:index(1, offsets) -- incement indices @@ -60,24 +60,24 @@ while true do end targets[step] = sequence:index(1, offsets) end - + -- 2. forward sequence through rnn - - rnn:zeroGradParameters() - + + rnn:zeroGradParameters() + local outputs = rnn:forward(inputs) local err = criterion:forward(outputs, targets) - + print(string.format("Iteration %d ; NLL err = %f ", iteration, err)) -- 3. backward sequence through rnn (i.e. backprop through time) - + local gradOutputs = criterion:backward(outputs, targets) local gradInputs = rnn:backward(inputs, gradOutputs) - + -- 4. update - + rnn:updateParameters(lr) - + iteration = iteration + 1 end diff --git a/examples/simple-recurrent-network.lua b/examples/simple-recurrent-network.lua index bc3305f..614ad69 100644 --- a/examples/simple-recurrent-network.lua +++ b/examples/simple-recurrent-network.lua @@ -1,29 +1,21 @@ require 'rnn' --- hyper-parameters +-- hyper-parameters batchSize = 8 -rho = 5 -- sequence length +seqlen = 5 -- sequence length hiddenSize = 7 nIndex = 10 lr = 0.1 - --- build simple recurrent neural network -local r = nn.Recurrent( - hiddenSize, nn.LookupTable(nIndex, hiddenSize), - nn.Linear(hiddenSize, hiddenSize), nn.Sigmoid(), - rho -) - local rnn = nn.Sequential() - :add(r) + :add(nn.LookupRNN(nIndex, hiddenSize)) :add(nn.Linear(hiddenSize, nIndex)) :add(nn.LogSoftMax()) -- wrap the non-recurrent module (Sequential) in Recursor. -- This makes it a recurrent module -- i.e. Recursor is an AbstractRecurrent instance -rnn = nn.Recursor(rnn, rho) +rnn = nn.Recursor(rnn, seqlen) print(rnn) @@ -45,10 +37,10 @@ offsets = torch.LongTensor(offsets) -- training local iteration = 1 while true do - -- 1. create a sequence of rho time-steps - + -- 1. create a sequence of seqlen time-steps + local inputs, targets = {}, {} - for step=1,rho do + for step=1,seqlen do -- a batch of inputs inputs[step] = sequence:index(1, offsets) -- incement indices @@ -60,31 +52,31 @@ while true do end targets[step] = sequence:index(1, offsets) end - + -- 2. forward sequence through rnn - - rnn:zeroGradParameters() + + rnn:zeroGradParameters() rnn:forget() -- forget all past time-steps - + local outputs, err = {}, 0 - for step=1,rho do + for step=1,seqlen do outputs[step] = rnn:forward(inputs[step]) err = err + criterion:forward(outputs[step], targets[step]) end - + print(string.format("Iteration %d ; NLL err = %f ", iteration, err)) -- 3. backward sequence through rnn (i.e. backprop through time) - + local gradOutputs, gradInputs = {}, {} - for step=rho,1,-1 do -- reverse order of forward calls + for step=seqlen,1,-1 do -- reverse order of forward calls gradOutputs[step] = criterion:backward(outputs[step], targets[step]) gradInputs[step] = rnn:backward(inputs[step], gradOutputs[step]) end - + -- 4. update - + rnn:updateParameters(lr) - + iteration = iteration + 1 end diff --git a/examples/simple-sequencer-network.lua b/examples/simple-sequencer-network.lua index 09add26..9f2c9c7 100644 --- a/examples/simple-sequencer-network.lua +++ b/examples/simple-sequencer-network.lua @@ -1,22 +1,15 @@ require 'rnn' --- hyper-parameters +-- hyper-parameters batchSize = 8 -rho = 5 -- sequence length +seqlen = 5 -- sequence length hiddenSize = 7 nIndex = 10 lr = 0.1 --- build simple recurrent neural network -local r = nn.Recurrent( - hiddenSize, nn.LookupTable(nIndex, hiddenSize), - nn.Linear(hiddenSize, hiddenSize), nn.Sigmoid(), - rho -) - local rnn = nn.Sequential() - :add(r) + :add(nn.LookupRNN(nIndex, hiddenSize)) :add(nn.Linear(hiddenSize, nIndex)) :add(nn.LogSoftMax()) @@ -43,10 +36,10 @@ offsets = torch.LongTensor(offsets) -- training local iteration = 1 while true do - -- 1. create a sequence of rho time-steps - + -- 1. create a sequence of seqlen time-steps + local inputs, targets = {}, {} - for step=1,rho do + for step=1,seqlen do -- a batch of inputs inputs[step] = sequence:index(1, offsets) -- incement indices @@ -58,24 +51,24 @@ while true do end targets[step] = sequence:index(1, offsets) end - + -- 2. forward sequence through rnn - - rnn:zeroGradParameters() - + + rnn:zeroGradParameters() + local outputs = rnn:forward(inputs) local err = criterion:forward(outputs, targets) - + print(string.format("Iteration %d ; NLL err = %f ", iteration, err)) -- 3. backward sequence through rnn (i.e. backprop through time) - + local gradOutputs = criterion:backward(outputs, targets) local gradInputs = rnn:backward(inputs, gradOutputs) - + -- 4. update - + rnn:updateParameters(lr) - + iteration = iteration + 1 end diff --git a/scripts/evaluate-rva.lua b/scripts/evaluate-rva.lua index 884dc33..10c0d04 100644 --- a/scripts/evaluate-rva.lua +++ b/scripts/evaluate-rva.lua @@ -1,4 +1,4 @@ -require 'dp' +local dl = require 'dataload' require 'rnn' require 'optim' @@ -11,45 +11,45 @@ cmd = torch.CmdLine() cmd:text() cmd:text('Evaluate a Recurrent Model for Visual Attention') cmd:text('Options:') -cmd:option('--xpPath', '', 'path to a previously saved model') -cmd:option('--cuda', false, 'model was saved with cuda') -cmd:option('--evalTest', false, 'model was saved with cuda') -cmd:option('--stochastic', false, 'evaluate the model stochatically. Generate glimpses stochastically') -cmd:option('--dataset', 'Mnist', 'which dataset to use : Mnist | TranslattedMnist | etc') -cmd:option('--overwrite', false, 'overwrite checkpoint') +cmd:option('-xplogpath', '', 'path to an xplog generated with examples/recurrent-visual-attention.lua') +cmd:option('-cuda', false, 'model was saved with cuda') +cmd:option('-evaltest', false, 'evaluate performance on test set') +cmd:option('-stochastic', false, 'evaluate the model stochatically. Generate glimpses stochastically') +cmd:option('-dataset', 'Mnist', 'which dataset to use : Mnist | TranslattedMnist | etc') +cmd:option('-overwrite', false, 'overwrite checkpoint') cmd:text() local opt = cmd:parse(arg or {}) -- check that saved model exists -assert(paths.filep(opt.xpPath), opt.xpPath..' does not exist') +assert(paths.filep(opt.xplogpath), opt.xplogpath..' does not exist') if opt.cuda then require 'cunn' end -xp = torch.load(opt.xpPath) -model = xp:model().module -tester = xp:tester() or xp:validator() -- dp.Evaluator -tester:sampler()._epoch_size = nil -conf = tester:feedback() -- dp.Confusion -cm = conf._cm -- optim.ConfusionMatrix +xplog = torch.load(opt.xplogpath) +model = torch.type(xplog.model) == 'nn.Serial' and xplog.model.modules[1] or xplog.model -print("Last evaluation of "..(xp:tester() and 'test' or 'valid').." set :") -print(cm) +print("Last evaluation of validation set") +print(xplog.validcm[#xplog.validcm]) +--[[ if opt.dataset == 'TranslatedMnist' then ds = torch.checkpoint( paths.concat(dp.DATA_DIR, 'checkpoint/dp.TranslatedMnist_test.t7'), - function() - local ds = dp[opt.dataset]{load_all=false} + function() + local ds = dp[opt.dataset]{load_all=false} ds:loadTest() return ds - end, + end, opt.overwrite ) else ds = dp[opt.dataset]() end +--]] +assert(opt.dataset == 'Mnist') +trainset, validset, testset = dl.loadMNIST() ra = model:findModules('nn.RecurrentAttention')[1] sg = model:findModules('nn.SpatialGlimpse')[1] @@ -60,18 +60,21 @@ for i=1,#ra.actions do rn.stochastic = opt.stochastic end -if opt.evalTest then - conf:reset() - tester:propagateEpoch(ds:testSet()) +local testcm = optim.ConfusionMatrix(10) +if opt.evaltest then + model:evaluate() + for i, input, target in testset:subiter(opt.batchsize) do + target = xplog.targetmodule:forward(target) + local output = model:forward(input) + testcm:batchAdd(output[1], target) + end - print((opt.stochastic and "Stochastic" or "Deterministic") .. "evaluation of test set :") - print(cm) + print((opt.stochastic and "Stochastic" or "Deterministic") .. " evaluation of test set :") + print(testcm) end -inputs = ds:get('test','inputs') -targets = ds:get('test','targets', 'b') -input = inputs:narrow(1,1,10) +input = testset.inputs:narrow(1,1,10) model:training() -- otherwise the rnn doesn't save intermediate time-step states if not opt.stochastic then for i=1,#ra.actions do @@ -105,8 +108,6 @@ function drawBox(img, bbox, channel) end locations = ra.actions - -input = nn.Convert(ds:ioShapes(),'bchw'):forward(input) glimpses = {} patches = {} @@ -118,7 +119,7 @@ for i=1,input:size(1) do glimpses[j] = glimpse local patch = patches[j] or {} patches[j] = patch - + local xy = location[i] -- (-1,-1) top left corner, (1,1) bottom right corner of image local x, y = xy:select(1,1), xy:select(1,2) @@ -126,7 +127,7 @@ for i=1,input:size(1) do x, y = (x+1)/2, (y+1)/2 -- (1,1), (input:size(3), input:size(4)) x, y = x*(input:size(3)-1)+1, y*(input:size(4)-1)+1 - + local gimg = img:clone() for d=1,sg.depth do local size = sg.height*(sg.scale^(d-1)) @@ -134,15 +135,10 @@ for i=1,input:size(1) do drawBox(gimg, bbox, 1) end glimpse[i] = gimg - - local sg_, ps - if j == 1 then - sg_ = ra.rnn.initialModule:findModules('nn.SpatialGlimpse')[1] - else - sg_ = ra.rnn.sharedClones[j]:findModules('nn.SpatialGlimpse')[1] - end + + local sg_ = ra.rnn.sharedClones[j]:findModules('nn.SpatialGlimpse')[1] patch[i] = image.scale(img:clone():float(), sg_.output[i]:narrow(1,1,1):float()) - + collectgarbage() end end