diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua
index 3a604c1..2319594 100644
--- a/examples/noise-contrastive-estimate.lua
+++ b/examples/noise-contrastive-estimate.lua
@@ -221,7 +221,6 @@ while opt.maxepoch <= 0 or epoch <= opt.maxepoch do
       sumErr = sumErr + err
       -- backward
       local gradOutputs = criterion:backward(outputs, targets)
-      local a = torch.Timer()
       lm:zeroGradParameters()
       lm:backward(inputs, gradOutputs)
 
diff --git a/examples/recurrent-language-model.lua b/examples/recurrent-language-model.lua
index 7044aa2..09b3b42 100644
--- a/examples/recurrent-language-model.lua
+++ b/examples/recurrent-language-model.lua
@@ -94,20 +94,19 @@ for i,hiddensize in ipairs(opt.hiddensize) do
    if opt.gru then -- Gated Recurrent Units
       rnn = nn.GRU(inputsize, hiddensize, nil, opt.dropout/2)
    elseif opt.lstm then -- Long Short Term Memory units
-      require 'nngraph'
-      nn.FastLSTM.usenngraph = true -- faster
-      nn.FastLSTM.bn = opt.bn
-      rnn = nn.FastLSTM(inputsize, hiddensize)
+      rnn = nn.RecLSTM(inputsize, hiddensize)
    elseif opt.mfru then -- Multi Function Recurrent Unit
       rnn = nn.MuFuRu(inputsize, hiddensize)
-   else -- simple recurrent neural network
+   elseif i == 1 then -- simple recurrent neural network
       local rm =  nn.Sequential() -- input is {x[t], h[t-1]}
          :add(nn.ParallelTable()
-            :add(i==1 and nn.Identity() or nn.Linear(inputsize, hiddensize)) -- input layer
+            :add(nn.Identity()) -- input layer
             :add(nn.Linear(hiddensize, hiddensize))) -- recurrent layer
          :add(nn.CAddTable()) -- merge
          :add(nn.Sigmoid()) -- transfer
       rnn = nn.Recurrence(rm, hiddensize, 1)
+   else -- stacked simple RNN layer (i > 1)
+      rnn = nn.LinearRNN(inputsize, hiddensize) -- project from inputsize, like the GRU/LSTM/MuFuRu branches above
    end
 
    stepmodule:add(rnn)
diff --git a/examples/recurrent-time-series.lua b/examples/recurrent-time-series.lua
index 4f47993..d8221e4 100644
--- a/examples/recurrent-time-series.lua
+++ b/examples/recurrent-time-series.lua
@@ -1,11 +1,11 @@
--- Multi-variate time-series example 
+-- Multi-variate time-series example
 
 require 'rnn'
 
 cmd = torch.CmdLine()
 cmd:text()
 cmd:text('Train a multivariate time-series model using RNN')
-cmd:option('--rho', 5, 'maximum number of time steps for back-propagate through time (BPTT)')
+cmd:option('--seqlen', 5, 'maximum number of time steps for back-propagate through time (BPTT)')
 cmd:option('--multiSize', 6, 'number of random variables as input and output')
 cmd:option('--hiddenSize', 10, 'number of hidden units used at output of the recurrent layer')
 cmd:option('--dataSize', 100, 'total number of time-steps in dataset')
@@ -53,20 +53,12 @@ print('Sequence:'); print(sequence)
 
 offsets = torch.LongTensor(opt.batchSize):random(1,opt.dataSize)
 
--- RNN
-r = nn.Recurrent(
-   opt.hiddenSize, -- size of output
-   nn.Linear(opt.multiSize, opt.hiddenSize), -- input layer
-   nn.Linear(opt.hiddenSize, opt.hiddenSize), -- recurrent layer
-   nn.Sigmoid(), -- transfer function
-   opt.rho
-)
-
+-- Simple RNN
 rnn = nn.Sequential()
-   :add(r)
+   :add(nn.LinearRNN(opt.multiSize, opt.hiddenSize))
    :add(nn.Linear(opt.hiddenSize, opt.multiSize))
 
-criterion = nn.MSECriterion() 
+criterion = nn.MSECriterion()
 
 -- use Sequencer for better data handling
 rnn = nn.Sequencer(rnn)
@@ -79,12 +71,12 @@ print(rnn)
 minErr = opt.multiSize -- report min error
 minK = 0
 avgErrs = torch.Tensor(opt.nIterations):fill(0)
-for k = 1, opt.nIterations do 
+for k = 1, opt.nIterations do
+
+   -- 1. create a sequence of seqlen time-steps
 
-   -- 1. create a sequence of rho time-steps
-   
    local inputs, targets = {}, {}
-   for step = 1, opt.rho do
+   for step = 1, opt.seqlen do
       -- batch of inputs
       inputs[step] = inputs[step] or sequence.new()
       inputs[step]:index(sequence, 1, offsets)
@@ -99,10 +91,10 @@ for k = 1, opt.nIterations do
 
    local outputs = rnn:forward(inputs)
    local err = criterion:forward(outputs, targets)
-   
+
    -- report errors
-   
-   print('Iter: ' .. k .. '   Err: ' .. err)   
+
+   print('Iter: ' .. k .. '   Err: ' .. err)
    if opt.plot then
       logger:add{['Err'] = err}
       logger:style{['Err'] = '-'}
@@ -116,14 +108,14 @@ for k = 1, opt.nIterations do
    end
 
    -- 3. backward sequence through rnn (i.e. backprop through time)
-   
+
    rnn:zeroGradParameters()
-   
+
    local gradOutputs = criterion:backward(outputs, targets)
    local gradInputs = rnn:backward(inputs, gradOutputs)
 
    -- 4. updates parameters
-   
+
    rnn:updateParameters(opt.learningRate)
 end
 
diff --git a/examples/recurrent-visual-attention.lua b/examples/recurrent-visual-attention.lua
index 75e0746..b9bff24 100644
--- a/examples/recurrent-visual-attention.lua
+++ b/examples/recurrent-visual-attention.lua
@@ -1,13 +1,11 @@
-require 'dp'
+local dl = require 'dataload'
 require 'rnn'
+require 'optim'
 
 -- References :
 -- A. http://papers.nips.cc/paper/5542-recurrent-models-of-visual-attention.pdf
 -- B. http://incompleteideas.net/sutton/williams-92.pdf
 
-
-version = 12
-
 --[[command line arguments]]--
 cmd = torch.CmdLine()
 cmd:text()
@@ -15,57 +13,65 @@ cmd:text('Train a Recurrent Model for Visual Attention')
 cmd:text('Example:')
 cmd:text('$> th rnn-visual-attention.lua > results.txt')
 cmd:text('Options:')
-cmd:option('--xpPath', '/path/to/saved_model.dat', 'path to a previously saved model')
-cmd:option('--learningRate', 0.01, 'learning rate at t=0')
-cmd:option('--minLR', 0.00001, 'minimum learning rate')
-cmd:option('--saturateEpoch', 800, 'epoch at which linear decayed LR will reach minLR')
-cmd:option('--momentum', 0.9, 'momentum')
-cmd:option('--maxOutNorm', -1, 'max norm each layers output neuron weights')
-cmd:option('--cutoffNorm', -1, 'max l2-norm of contatenation of all gradParam tensors')
-cmd:option('--batchSize', 20, 'number of examples per batch')
-cmd:option('--cuda', false, 'use CUDA')
-cmd:option('--useDevice', 1, 'sets the device (GPU) to use')
-cmd:option('--maxEpoch', 2000, 'maximum number of epochs to run')
-cmd:option('--maxTries', 100, 'maximum number of epochs to try to find a better local minima for early-stopping')
-cmd:option('--transfer', 'ReLU', 'activation function')
-cmd:option('--uniform', 0.1, 'initialize parameters using uniform distribution between -uniform and uniform. -1 means default initialization')
-cmd:option('--progress', false, 'print progress bar')
-cmd:option('--silent', false, 'dont print anything to stdout')
+cmd:option('-startlr', 0.01, 'learning rate at t=0')
+cmd:option('-minlr', 0.00001, 'minimum learning rate')
+cmd:option('-saturate', 800, 'epoch at which linear decayed LR will reach minLR')
+cmd:option('-momentum', 0.9, 'momentum')
+cmd:option('-maxnormout', -1, 'max norm each layers output neuron weights')
+cmd:option('-cutoff', -1, 'max l2-norm of contatenation of all gradParam tensors')
+cmd:option('-batchsize', 20, 'number of examples per batch')
+cmd:option('-cuda', false, 'use CUDA')
+cmd:option('-device', 1, 'sets the device (GPU) to use')
+cmd:option('-maxepoch', 2000, 'maximum number of epochs to run')
+cmd:option('-earlystop', 200, 'maximum number of epochs to try to find a better local minima for early-stopping')
+cmd:option('-transfer', 'ReLU', 'activation function')
+cmd:option('-uniform', 0.1, 'initialize parameters using uniform distribution between -uniform and uniform. -1 means default initialization')
+cmd:option('-progress', false, 'print progress bar')
+cmd:option('-silent', false, 'dont print anything to stdout')
 
 --[[ reinforce ]]--
-cmd:option('--rewardScale', 1, "scale of positive reward (negative is 0)")
-cmd:option('--unitPixels', 13, "the locator unit (1,1) maps to pixels (13,13), or (-1,-1) maps to (-13,-13)")
-cmd:option('--locatorStd', 0.11, 'stdev of gaussian location sampler (between 0 and 1) (low values may cause NaNs)')
-cmd:option('--stochastic', false, 'Reinforce modules forward inputs stochastically during evaluation')
+cmd:option('-rewardScale', 1, "scale of positive reward (negative is 0)")
+cmd:option('-unitPixels', 13, "the locator unit (1,1) maps to pixels (13,13), or (-1,-1) maps to (-13,-13)")
+cmd:option('-locatorStd', 0.11, 'stdev of gaussian location sampler (between 0 and 1) (low values may cause NaNs)')
+cmd:option('-stochastic', false, 'Reinforce modules forward inputs stochastically during evaluation')
 
 --[[ glimpse layer ]]--
-cmd:option('--glimpseHiddenSize', 128, 'size of glimpse hidden layer')
-cmd:option('--glimpsePatchSize', 8, 'size of glimpse patch at highest res (height = width)')
-cmd:option('--glimpseScale', 2, 'scale of successive patches w.r.t. original input image')
-cmd:option('--glimpseDepth', 1, 'number of concatenated downscaled patches')
-cmd:option('--locatorHiddenSize', 128, 'size of locator hidden layer')
-cmd:option('--imageHiddenSize', 256, 'size of hidden layer combining glimpse and locator hiddens')
+cmd:option('-glimpseHiddenSize', 128, 'size of glimpse hidden layer')
+cmd:option('-glimpsePatchSize', 8, 'size of glimpse patch at highest res (height = width)')
+cmd:option('-glimpseScale', 2, 'scale of successive patches w.r.t. original input image')
+cmd:option('-glimpseDepth', 1, 'number of concatenated downscaled patches')
+cmd:option('-locatorHiddenSize', 128, 'size of locator hidden layer')
+cmd:option('-imageHiddenSize', 256, 'size of hidden layer combining glimpse and locator hiddens')
 
 --[[ recurrent layer ]]--
-cmd:option('--rho', 7, 'back-propagate through time (BPTT) for rho time-steps')
-cmd:option('--hiddenSize', 256, 'number of hidden units used in Simple RNN.')
-cmd:option('--LSTM', false, 'use LSTM instead of linear layer')
+cmd:option('-seqlen', 7, 'back-propagate through time (BPTT) for seqlen time-steps')
+cmd:option('-hiddenSize', 256, 'number of hidden units used in Simple RNN.')
+cmd:option('-lstm', false, 'use LSTM instead of linear layer')
 
 --[[ data ]]--
-cmd:option('--dataset', 'Mnist', 'which dataset to use : Mnist | TranslattedMnist | etc')
-cmd:option('--trainEpochSize', -1, 'number of train examples seen between each epoch')
-cmd:option('--validEpochSize', -1, 'number of valid examples used for early stopping and cross-validation')
-cmd:option('--noTest', false, 'dont propagate through the test set')
-cmd:option('--overwrite', false, 'overwrite checkpoint')
+cmd:option('-dataset', 'Mnist', 'which dataset to use : Mnist | TranslattedMnist | etc')
+cmd:option('-trainsize', -1, 'number of train examples seen between each epoch')
+cmd:option('-validsize', -1, 'number of valid examples used for early stopping and cross-validation')
+cmd:option('-noTest', false, 'dont propagate through the test set')
+cmd:option('-overwrite', false, 'overwrite checkpoint')
+cmd:option('-savepath', paths.concat(dl.SAVE_PATH, 'rmva'), 'path to directory where experiment log (includes model) will be saved')
+cmd:option('-id', '', 'id string of this experiment (used to name output file) (defaults to a unique id)')
 
 cmd:text()
 local opt = cmd:parse(arg or {})
+opt.version = 13
+opt.id = opt.id == '' and ('rmva' .. ':' .. dl.uniqueid()) or opt.id -- default id prefix matches the 'rmva' savepath default
 if not opt.silent then
    table.print(opt)
 end
 
+if opt.cuda then
+   require 'cunn'
+   cutorch.setDevice(opt.device)
+end
+
 --[[data]]--
-if opt.dataset == 'TranslatedMnist' then
+--[[if opt.dataset == 'TranslatedMnist' then
    ds = torch.checkpoint(
       paths.concat(dp.DATA_DIR, 'checkpoint/dp.TranslatedMnist.t7'),
       function() return dp[opt.dataset]() end,
@@ -73,186 +79,224 @@ if opt.dataset == 'TranslatedMnist' then
    )
 else
    ds = dp[opt.dataset]()
-end
+end--]]
+
+assert(opt.dataset == 'Mnist', "this example currently only supports -dataset Mnist") -- dl.loadMNIST() below is the only loader wired up
+trainset, validset, testset = dl.loadMNIST()
 
 --[[Model]]--
-if opt.xpPath ~= '' then
-     assert(paths.filep(opt.xpPath), opt.xpPath..' does not exist')
-
-    if opt.cuda then
-        require 'cunn'
-        require 'optim'
-        cutorch.setDevice(opt.useDevice)
-    end
-
-    xp = torch.load(opt.xpPath)
-    agent = xp:model()
-    local checksum = agent:parameters()[1]:sum()
-    xp.opt.progress = opt.progress
-    opt = xp.opt
+
+-- glimpse network (rnn input layer)
+locationSensor = nn.Sequential()
+locationSensor:add(nn.SelectTable(2))
+locationSensor:add(nn.Linear(2, opt.locatorHiddenSize))
+locationSensor:add(nn[opt.transfer]())
+
+glimpseSensor = nn.Sequential()
+glimpseSensor:add(nn.SpatialGlimpse(opt.glimpsePatchSize, opt.glimpseDepth, opt.glimpseScale):float())
+glimpseSensor:add(nn.Collapse(3))
+glimpseSensor:add(nn.Linear(trainset:isize()[1]*(opt.glimpsePatchSize^2)*opt.glimpseDepth, opt.glimpseHiddenSize))
+glimpseSensor:add(nn[opt.transfer]())
+
+glimpse = nn.Sequential()
+glimpse:add(nn.ConcatTable():add(locationSensor):add(glimpseSensor))
+glimpse:add(nn.JoinTable(1,1))
+glimpse:add(nn.Linear(opt.glimpseHiddenSize+opt.locatorHiddenSize, opt.imageHiddenSize))
+glimpse:add(nn[opt.transfer]())
+
+-- RNN layer
+if opt.lstm then
+   glimpse:add(nn.RecLSTM(opt.imageHiddenSize, opt.hiddenSize))
 else
+   glimpse:add(nn.LinearRNN(opt.imageHiddenSize, opt.hiddenSize, nn[opt.transfer]()))
+end
 
-   -- glimpse network (rnn input layer)
-   locationSensor = nn.Sequential()
-   locationSensor:add(nn.SelectTable(2))
-   locationSensor:add(nn.Linear(2, opt.locatorHiddenSize))
-   locationSensor:add(nn[opt.transfer]())
-
-   glimpseSensor = nn.Sequential()
-   glimpseSensor:add(nn.SpatialGlimpse(opt.glimpsePatchSize, opt.glimpseDepth, opt.glimpseScale):float())
-   glimpseSensor:add(nn.Collapse(3))
-   glimpseSensor:add(nn.Linear(ds:imageSize('c')*(opt.glimpsePatchSize^2)*opt.glimpseDepth, opt.glimpseHiddenSize))
-   glimpseSensor:add(nn[opt.transfer]())
-
-   glimpse = nn.Sequential()
-   glimpse:add(nn.ConcatTable():add(locationSensor):add(glimpseSensor))
-   glimpse:add(nn.JoinTable(1,1))
-   glimpse:add(nn.Linear(opt.glimpseHiddenSize+opt.locatorHiddenSize, opt.imageHiddenSize))
-   glimpse:add(nn[opt.transfer]())
-   glimpse:add(nn.Linear(opt.imageHiddenSize, opt.hiddenSize))
-
-   -- rnn recurrent layer
-   if opt.LSTM then
-     recurrent = nn.RecLSTM(opt.hiddenSize, opt.hiddenSize)
-   else
-     recurrent = nn.Linear(opt.hiddenSize, opt.hiddenSize)
+imageSize = trainset:isize()
+assert(imageSize[2] == imageSize[3])
+
+-- actions (locator)
+locator = nn.Sequential()
+locator:add(nn.Linear(opt.hiddenSize, 2))
+locator:add(nn.HardTanh()) -- bounds mean between -1 and 1
+locator:add(nn.ReinforceNormal(2*opt.locatorStd, opt.stochastic)) -- sample from normal, uses REINFORCE learning rule
+assert(locator:get(3).stochastic == opt.stochastic, "Please update the dpnn package : luarocks install dpnn")
+locator:add(nn.HardTanh()) -- bounds sample between -1 and 1
+locator:add(nn.MulConstant(opt.unitPixels*2/imageSize[2]))
+
+attention = nn.RecurrentAttention(glimpse, locator, opt.seqlen, {opt.hiddenSize})
+
+-- model is a reinforcement learning agent
+agent = nn.Sequential()
+agent:add(nn.Convert())
+agent:add(attention)
+
+-- classifier :
+agent:add(nn.SelectTable(-1))
+agent:add(nn.Linear(opt.hiddenSize, #testset.classes))
+agent:add(nn.LogSoftMax())
+
+-- add the baseline reward predictor
+seq = nn.Sequential()
+seq:add(nn.Constant(1,1))
+seq:add(nn.Add(1))
+concat = nn.ConcatTable():add(nn.Identity()):add(seq)
+concat2 = nn.ConcatTable():add(nn.Identity()):add(concat)
+
+-- output will be : {classpred, {classpred, basereward}}
+agent:add(concat2)
+
+if opt.uniform > 0 then
+   for k,param in ipairs(agent:parameters()) do
+      param:uniform(-opt.uniform, opt.uniform)
    end
+end
 
+print("Recurrent visual attention model:")
+print(agent)
 
-   -- recurrent neural network
-   rnn = nn.Recurrent(opt.hiddenSize, glimpse, recurrent, nn[opt.transfer](), 99999)
+--[[ Criterion ]]--
 
-   imageSize = ds:imageSize('h')
-   assert(ds:imageSize('h') == ds:imageSize('w'))
+criterion = nn.ParallelCriterion(true)
+   :add(nn.ClassNLLCriterion()) -- BACKPROP
+   :add(nn.VRClassReward(agent, opt.rewardScale)) -- REINFORCE
 
-   -- actions (locator)
-   locator = nn.Sequential()
-   locator:add(nn.Linear(opt.hiddenSize, 2))
-   locator:add(nn.HardTanh()) -- bounds mean between -1 and 1
-   locator:add(nn.ReinforceNormal(2*opt.locatorStd, opt.stochastic)) -- sample from normal, uses REINFORCE learning rule
-   assert(locator:get(3).stochastic == opt.stochastic, "Please update the dpnn package : luarocks install dpnn")
-   locator:add(nn.HardTanh()) -- bounds sample between -1 and 1
-   locator:add(nn.MulConstant(opt.unitPixels*2/ds:imageSize("h")))
+targetmodule = nn.Convert()
 
-   attention = nn.RecurrentAttention(rnn, locator, opt.rho, {opt.hiddenSize})
 
-   -- model is a reinforcement learning agent
-   agent = nn.Sequential()
-   agent:add(nn.Convert(ds:ioShapes(), 'bchw'))
-   agent:add(attention)
+--[[ CUDA ]]--
 
-   -- classifier :
-   agent:add(nn.SelectTable(-1))
-   agent:add(nn.Linear(opt.hiddenSize, #ds:classes()))
-   agent:add(nn.LogSoftMax())
+if opt.cuda then
+   agent:cuda()
+   criterion:cuda()
+   targetmodule:cuda()
+else
+   agent:float()
+   criterion:float()
+   targetmodule:float()
+end
 
-   -- add the baseline reward predictor
-   seq = nn.Sequential()
-   seq:add(nn.Constant(1,1))
-   seq:add(nn.Add(1))
-   concat = nn.ConcatTable():add(nn.Identity()):add(seq)
-   concat2 = nn.ConcatTable():add(nn.Identity()):add(concat)
+--[[ experiment log ]]--
+
+-- is saved to file every time a new validation minima is found
+local xplog = {}
+xplog.opt = opt -- save all hyper-parameters and such
+-- will only serialize params
+xplog.model = nn.Serial(agent)
+xplog.model:mediumSerial()
+xplog.criterion = criterion
+xplog.targetmodule = targetmodule
+-- keep a log of NLL for each epoch
+xplog.traincm = {}
+xplog.validcm = {}
+-- will be used for early-stopping
+xplog.minvaliderr = 99999999
+xplog.epoch = 0
+
+--[[ training loop ]]--
+
+local ntrial = 0
+paths.mkdir(opt.savepath)
+
+local epoch = 1
+opt.lr = opt.startlr
+opt.trainsize = opt.trainsize == -1 and trainset:size() or opt.trainsize
+opt.validsize = opt.validsize == -1 and validset:size() or opt.validsize
+while opt.maxepoch <= 0 or epoch <= opt.maxepoch do
+   print("")
+   print("Epoch #"..epoch.." :")
+
+   local traincm = optim.ConfusionMatrix(#testset.classes) -- same class count the classifier layer is built with
+
+   -- 1. training
+
+   local a = torch.Timer()
+   agent:training()
+   for i, input, target in trainset:sampleiter(opt.batchsize, opt.trainsize) do
+      target = targetmodule:forward(target)
+      -- forward
+      local output = agent:forward(input)
+      local err = criterion:forward(output, target)
+      traincm:batchAdd(output[1], target)
+
+      -- backward
+      local gradOutput = criterion:backward(output, target)
+      agent:zeroGradParameters()
+      agent:backward(input, gradOutput)
+
+      -- update
+      if opt.cutoff > 0 then
+         local norm = agent:gradParamClip(opt.cutoff) -- affects gradParams
+         opt.meanNorm = opt.meanNorm and (opt.meanNorm*0.9 + norm*0.1) or norm
+      end
+      agent:updateGradParameters(opt.momentum) -- affects gradParams
+      agent:updateParameters(opt.lr) -- affects params
+      agent:maxParamNorm(opt.maxnormout) -- affects params
 
-   -- output will be : {classpred, {classpred, basereward}}
-   agent:add(concat2)
+      if opt.progress then
+         xlua.progress(i, opt.trainsize)
+      end
 
-   if opt.uniform > 0 then
-      for k,param in ipairs(agent:parameters()) do
-         param:uniform(-opt.uniform, opt.uniform)
+      if i % 1000 == 0 then
+         collectgarbage()
       end
+
    end
-end
 
---[[Propagators]]--
-opt.decayFactor = (opt.minLR - opt.learningRate)/opt.saturateEpoch
-
-train = dp.Optimizer{
-   loss = nn.ParallelCriterion(true)
-      :add(nn.ModuleCriterion(nn.ClassNLLCriterion(), nil, nn.Convert())) -- BACKPROP
-      :add(nn.ModuleCriterion(nn.VRClassReward(agent, opt.rewardScale), nil, nn.Convert())) -- REINFORCE
-   ,
-   epoch_callback = function(model, report) -- called every epoch
-      if report.epoch > 0 then
-         opt.learningRate = opt.learningRate + opt.decayFactor
-         opt.learningRate = math.max(opt.minLR, opt.learningRate)
-         if not opt.silent then
-            print("learningRate", opt.learningRate)
-         end
-      end
-   end,
-   callback = function(model, report)
-      if opt.cutoffNorm > 0 then
-         local norm = model:gradParamClip(opt.cutoffNorm) -- affects gradParams
-         opt.meanNorm = opt.meanNorm and (opt.meanNorm*0.9 + norm*0.1) or norm
-         if opt.lastEpoch < report.epoch and not opt.silent then
-            print("mean gradParam norm", opt.meanNorm)
-         end
+   -- learning rate decay
+   opt.lr = opt.lr + (opt.minlr - opt.startlr)/opt.saturate
+   opt.lr = math.max(opt.minlr, opt.lr)
+
+   if not opt.silent then
+      print("learning rate", opt.lr)
+      if opt.meanNorm then
+         print("mean gradParam norm", opt.meanNorm)
       end
-      model:updateGradParameters(opt.momentum) -- affects gradParams
-      model:updateParameters(opt.learningRate) -- affects params
-      model:maxParamNorm(opt.maxOutNorm) -- affects params
-      model:zeroGradParameters() -- affects gradParams
-   end,
-   feedback = dp.Confusion{output_module=nn.SelectTable(1)},
-   sampler = dp.ShuffleSampler{
-      epoch_size = opt.trainEpochSize, batch_size = opt.batchSize
-   },
-   progress = opt.progress
-}
-
-
-valid = dp.Evaluator{
-   feedback = dp.Confusion{output_module=nn.SelectTable(1)},
-   sampler = dp.Sampler{epoch_size = opt.validEpochSize, batch_size = opt.batchSize},
-   progress = opt.progress
-}
-if not opt.noTest then
-   tester = dp.Evaluator{
-      feedback = dp.Confusion{output_module=nn.SelectTable(1)},
-      sampler = dp.Sampler{batch_size = opt.batchSize}
-   }
-end
+   end
 
---[[Experiment]]--
-
-xp = dp.Experiment{
-   model = agent,
-   optimizer = train,
-   validator = valid,
-   tester = tester,
-   observer = {
-      ad,
-      dp.FileLogger(),
-      dp.EarlyStopper{
-         max_epochs = opt.maxTries,
-         error_report={'validator','feedback','confusion','accuracy'},
-         maximize = true
-      }
-   },
-   random_seed = os.time(),
-   max_epoch = opt.maxEpoch
-}
-
---[[GPU or CPU]]--
-if opt.cuda then
-   print"Using CUDA"
-   require 'cutorch'
-   require 'cunn'
-   cutorch.setDevice(opt.useDevice)
-   xp:cuda()
-else
-   xp:float()
-end
+   if cutorch then cutorch.synchronize() end
+   local speed = a:time().real/math.max(1, math.ceil(opt.trainsize/opt.batchsize)) -- trainsize counts examples; divide by batches to match the sec/batch label
+   print(string.format("Speed : %f sec/batch ", speed))
 
-xp:verbose(not opt.silent)
-if not opt.silent then
-   print"Agent :"
-   print(agent)
-end
+   traincm:updateValids()
+   print("Training error : "..((1 - traincm.totalValid)*100).."%")
+
+   xplog.traincm[epoch] = traincm
+
+   -- 2. cross-validation
+
+   agent:evaluate()
+   local validcm = optim.ConfusionMatrix(#testset.classes) -- same class count the classifier layer is built with
+   for i, input, target in validset:subiter(opt.batchsize, opt.validsize) do
+      target = targetmodule:forward(target)
+      local output = agent:forward(input)
+      validcm:batchAdd(output[1], target)
+   end
+
+   validcm:updateValids()
+   local validerr = 1 - validcm.totalValid
+   print("Validation error : "..(validerr*100).."%")
+
+   xplog.validcm[epoch] = validcm
+   ntrial = ntrial + 1
+
+   -- early-stopping
+   if validerr < xplog.minvaliderr then
+      -- save best version of model
+      xplog.minvaliderr = validerr
+      xplog.epoch = epoch
+      local filename = paths.concat(opt.savepath, opt.id..'.t7')
+      print("Found new minima. Saving to "..filename)
+      torch.save(filename, xplog)
+      ntrial = 0
+   elseif ntrial >= opt.earlystop then
+      print("No new minima found after "..ntrial.." epochs.")
+      print("Stopping experiment.")
+      break
+   end
 
-xp.opt = opt
 
-if checksum then
-   assert(math.abs(xp:model():parameters()[1]:sum() - checksum) < 0.0001, "Loaded model parameters were changed???")
+   collectgarbage()
+   epoch = epoch + 1
 end
-xp:run(ds)
+print("Evaluate model using : ")
+print("th scripts/evaluate-rva.lua -xplogpath "..paths.concat(opt.savepath, opt.id..'.t7')..(opt.cuda and ' -cuda' or '')..' -evaltest')
diff --git a/examples/sequence-to-one.lua b/examples/sequence-to-one.lua
index 04413b9..8200c86 100644
--- a/examples/sequence-to-one.lua
+++ b/examples/sequence-to-one.lua
@@ -1,8 +1,8 @@
 require 'rnn'
 
--- hyper-parameters 
+-- hyper-parameters
 batchSize = 8
-rho = 10 -- sequence length
+seqlen = 10 -- sequence length
 hiddenSize = 100
 nIndex = 100 -- input words
 nClass = 7 -- output classes
@@ -10,16 +10,19 @@ lr = 0.1
 
 
 -- build simple recurrent neural network
-r = nn.Recurrent(
-   hiddenSize, nn.Identity(), 
-   nn.Linear(hiddenSize, hiddenSize), nn.Sigmoid(), 
-   rho
-)
+rec = nn.Recurrence(
+   nn.Sequential()
+      :add(nn.ParallelTable()
+         :add(nn.Identity())
+         :add(nn.Linear(hiddenSize, hiddenSize)))
+      :add(nn.CAddTable())
+      :add(nn.Sigmoid())
+   , hiddenSize, 1)
 
 rnn = nn.Sequential()
    :add(nn.LookupTable(nIndex, hiddenSize))
    :add(nn.SplitTable(1,2))
-   :add(nn.Sequencer(r))
+   :add(nn.Sequencer(rec))
    :add(nn.SelectTable(-1)) -- this selects the last time-step of the rnn output sequence
    :add(nn.Linear(hiddenSize, nClass))
    :add(nn.LogSoftMax())
@@ -28,21 +31,21 @@ rnn = nn.Sequential()
 
 criterion = nn.ClassNLLCriterion()
 
--- build dummy dataset (task is to predict class given rho words)
+-- build dummy dataset (task is to predict class given seqlen words)
 -- similar to sentiment analysis datasets
 ds = {}
 ds.size = 1000
-ds.input = torch.LongTensor(ds.size,rho)
+ds.input = torch.LongTensor(ds.size,seqlen)
 ds.target = torch.LongTensor(ds.size):random(nClass)
 
 -- this will make the inputs somewhat correlate with the targets,
 -- such that the reduction in training error should be more obvious
-local correlate = torch.LongTensor(nClass, rho*3):random(nClass)
-local indices = torch.LongTensor(rho)
+local correlate = torch.LongTensor(nClass, seqlen*3):random(nClass)
+local indices = torch.LongTensor(seqlen)
 local buffer = torch.LongTensor()
 local sortVal, sortIdx = torch.LongTensor(), torch.LongTensor()
 for i=1,ds.size do
-   indices:random(1,rho*3)
+   indices:random(1,seqlen*3)
    buffer:index(correlate[ds.target[i]], 1, indices)
    sortVal:sort(sortIdx, buffer, 1)
    ds.input[i]:copy(sortVal:view(-1))
@@ -54,27 +57,27 @@ indices:resize(batchSize)
 -- training
 local inputs, targets = torch.LongTensor(), torch.LongTensor()
 for iteration = 1, 1000 do
-   -- 1. create a sequence of rho time-steps
-   
+   -- 1. create a sequence of seqlen time-steps
+
    indices:random(1,ds.size) -- choose some random samples
    inputs:index(ds.input, 1,indices)
    targets:index(ds.target, 1,indices)
-   
+
    -- 2. forward sequence through rnn
-   
-   rnn:zeroGradParameters() 
-   
+
+   rnn:zeroGradParameters()
+
    local outputs = rnn:forward(inputs)
    local err = criterion:forward(outputs, targets)
-   
+
    print(string.format("Iteration %d ; NLL err = %f ", iteration, err))
 
    -- 3. backward sequence through rnn (i.e. backprop through time)
-   
+
    local gradOutputs = criterion:backward(outputs, targets)
    local gradInputs = rnn:backward(inputs, gradOutputs)
-   
+
    -- 4. update
-   
+
    rnn:updateParameters(lr)
 end
diff --git a/examples/simple-bisequencer-network-variable.lua b/examples/simple-bisequencer-network-variable.lua
index 3426329..389ad27 100644
--- a/examples/simple-bisequencer-network-variable.lua
+++ b/examples/simple-bisequencer-network-variable.lua
@@ -6,7 +6,7 @@ math.randomseed(0)
 
 -- hyper-parameters
 batchSize = 8
-rho = 10 -- sequence length
+seqlen = 10 -- sequence length
 hiddenSize = 5
 nIndex = 10
 lr = 0.1
@@ -63,16 +63,16 @@ maxStep = {}
 for i=1,batchSize do
    table.insert(offsets, math.ceil(math.random()*sequence:size(1)))
    -- variable length for each sample
-   table.insert(maxStep, math.random(rho))
+   table.insert(maxStep, math.random(seqlen))
 end
 offsets = torch.LongTensor(offsets)
 
 -- training
 for iteration = 1, maxIter do
-   -- 1. create a sequence of rho time-steps
+   -- 1. create a sequence of seqlen time-steps
 
    local inputs, inputs_rev, targets = {}, {}, {}
-   for step=1,rho do
+   for step=1,seqlen do
       -- a batch of inputs
       inputs[step] = sequence:index(1, offsets)
       -- increment indices
@@ -93,7 +93,7 @@ for iteration = 1, maxIter do
    end
 
    -- reverse
-   for step=1,rho do
+   for step=1,seqlen do
       inputs_rev[step] = torch.LongTensor(batchSize)
       for j=1,batchSize do
          if step <= maxStep[j] then
@@ -113,7 +113,7 @@ for iteration = 1, maxIter do
 
    local correct = 0
    local total = 0
-   for step=1,rho do
+   for step=1,seqlen do
       probs = outputs[step]
       _, preds = probs:max(2)
       for j=1,batchSize do
diff --git a/examples/simple-bisequencer-network.lua b/examples/simple-bisequencer-network.lua
index ac86405..2d87004 100644
--- a/examples/simple-bisequencer-network.lua
+++ b/examples/simple-bisequencer-network.lua
@@ -1,8 +1,8 @@
 require 'rnn'
 
--- hyper-parameters 
+-- hyper-parameters
 batchSize = 8
-rho = 5 -- sequence length
+seqlen = 5 -- sequence length
 hiddenSize = 7
 nIndex = 10
 lr = 0.1
@@ -11,9 +11,9 @@ lr = 0.1
 -- forward rnn
 -- build simple recurrent neural network
 local fwd = nn.Recurrent(
-   hiddenSize, nn.LookupTable(nIndex, hiddenSize), 
-   nn.Linear(hiddenSize, hiddenSize), nn.Sigmoid(), 
-   rho
+   hiddenSize, nn.LookupTable(nIndex, hiddenSize),
+   nn.Linear(hiddenSize, hiddenSize), nn.Sigmoid(),
+   seqlen
 )
 
 -- backward rnn (will be applied in reverse order of input sequence)
@@ -22,7 +22,7 @@ bwd:reset() -- reinitializes parameters
 
 -- merges the output of one time-step of fwd and bwd rnns.
 -- You could also try nn.AddTable(), nn.Identity(), etc.
-local merge = nn.JoinTable(1, 1) 
+local merge = nn.JoinTable(1, 1)
 
 -- we use BiSequencerLM because this is a language model (previous and next words to predict current word).
 -- If we used BiSequencer, x[t] would be used to predict y[t] = x[t] (which is cheating).
@@ -30,7 +30,7 @@ local merge = nn.JoinTable(1, 1)
 local brnn = nn.BiSequencerLM(fwd, bwd, merge)
 
 local rnn = nn.Sequential()
-   :add(brnn) 
+   :add(brnn)
    :add(nn.Sequencer(nn.Linear(hiddenSize*2, nIndex))) -- times two due to JoinTable
    :add(nn.Sequencer(nn.LogSoftMax()))
 
@@ -54,10 +54,10 @@ offsets = torch.LongTensor(offsets)
 -- training
 local iteration = 1
 while true do
-   -- 1. create a sequence of rho time-steps
-   
+   -- 1. create a sequence of seqlen time-steps
+
    local inputs, targets = {}, {}
-   for step=1,rho do
+   for step=1,seqlen do
       -- a batch of inputs
       inputs[step] = sequence:index(1, offsets)
       -- incement indices
@@ -69,24 +69,24 @@ while true do
       end
       targets[step] = sequence:index(1, offsets)
    end
-   
+
    -- 2. forward sequence through rnn
-   
-   rnn:zeroGradParameters() 
-   
+
+   rnn:zeroGradParameters()
+
    local outputs = rnn:forward(inputs)
    local err = criterion:forward(outputs, targets)
-   
+
    print(string.format("Iteration %d ; NLL err = %f ", iteration, err))
 
    -- 3. backward sequence through rnn (i.e. backprop through time)
-   
+
    local gradOutputs = criterion:backward(outputs, targets)
    local gradInputs = rnn:backward(inputs, gradOutputs)
-   
+
    -- 4. update
-   
+
    rnn:updateParameters(lr)
-   
+
    iteration = iteration + 1
 end
diff --git a/examples/simple-recurrence-network.lua b/examples/simple-recurrence-network.lua
index 607ca7f..c6d376e 100644
--- a/examples/simple-recurrence-network.lua
+++ b/examples/simple-recurrence-network.lua
@@ -1,15 +1,15 @@
 -- example use of nn.Recurrence
 require 'rnn'
 
--- hyper-parameters 
+-- hyper-parameters
 batchSize = 8
-rho = 5 -- sequence length
+seqlen = 5 -- sequence length
 hiddenSize = 7
 nIndex = 10
 lr = 0.1
 
--- the internal recurrentModule used by Recurrence
-local rm = nn.Sequential() -- input is {x[t], h[t-1]}
+-- the internal step module used by Recurrence
+local stepmodule = nn.Sequential() -- input is {x[t], h[t-1]}
    :add(nn.ParallelTable()
       :add(nn.LookupTable(nIndex, hiddenSize)) -- input layer
       :add(nn.Linear(hiddenSize, hiddenSize))) -- recurrent layer
@@ -17,7 +17,7 @@ local rm = nn.Sequential() -- input is {x[t], h[t-1]}
    :add(nn.Sigmoid()) -- transfer
 
 local rnn = nn.Sequential()
-   :add(nn.Recurrence(rm, hiddenSize, 0)) -- similar to nn.Recurrent, but more general, and no startModule
+   :add(nn.Recurrence(stepmodule, hiddenSize, 0)) -- essentially the same as nn.LookupRNN
    :add(nn.Linear(hiddenSize, nIndex))
    :add(nn.LogSoftMax())
 
@@ -45,10 +45,10 @@ offsets = torch.LongTensor(offsets)
 -- training
 local iteration = 1
 while true do
-   -- 1. create a sequence of rho time-steps
-   
+   -- 1. create a sequence of seqlen time-steps
+
    local inputs, targets = {}, {}
-   for step=1,rho do
+   for step=1,seqlen do
       -- a batch of inputs
       inputs[step] = sequence:index(1, offsets)
       -- incement indices
@@ -60,24 +60,24 @@ while true do
       end
       targets[step] = sequence:index(1, offsets)
    end
-   
+
    -- 2. forward sequence through rnn
-   
-   rnn:zeroGradParameters() 
-   
+
+   rnn:zeroGradParameters()
+
    local outputs = rnn:forward(inputs)
    local err = criterion:forward(outputs, targets)
-   
+
    print(string.format("Iteration %d ; NLL err = %f ", iteration, err))
 
    -- 3. backward sequence through rnn (i.e. backprop through time)
-   
+
    local gradOutputs = criterion:backward(outputs, targets)
    local gradInputs = rnn:backward(inputs, gradOutputs)
-   
+
    -- 4. update
-   
+
    rnn:updateParameters(lr)
-   
+
    iteration = iteration + 1
 end
diff --git a/examples/simple-recurrent-network.lua b/examples/simple-recurrent-network.lua
index bc3305f..614ad69 100644
--- a/examples/simple-recurrent-network.lua
+++ b/examples/simple-recurrent-network.lua
@@ -1,29 +1,21 @@
 require 'rnn'
 
--- hyper-parameters 
+-- hyper-parameters
 batchSize = 8
-rho = 5 -- sequence length
+seqlen = 5 -- sequence length
 hiddenSize = 7
 nIndex = 10
 lr = 0.1
 
-
--- build simple recurrent neural network
-local r = nn.Recurrent(
-   hiddenSize, nn.LookupTable(nIndex, hiddenSize), 
-   nn.Linear(hiddenSize, hiddenSize), nn.Sigmoid(), 
-   rho
-)
-
 local rnn = nn.Sequential()
-   :add(r)
+   :add(nn.LookupRNN(nIndex, hiddenSize))
    :add(nn.Linear(hiddenSize, nIndex))
    :add(nn.LogSoftMax())
 
 -- wrap the non-recurrent module (Sequential) in Recursor.
 -- This makes it a recurrent module
 -- i.e. Recursor is an AbstractRecurrent instance
-rnn = nn.Recursor(rnn, rho)
+rnn = nn.Recursor(rnn, seqlen)
 
 print(rnn)
 
@@ -45,10 +37,10 @@ offsets = torch.LongTensor(offsets)
 -- training
 local iteration = 1
 while true do
-   -- 1. create a sequence of rho time-steps
-   
+   -- 1. create a sequence of seqlen time-steps
+
    local inputs, targets = {}, {}
-   for step=1,rho do
+   for step=1,seqlen do
       -- a batch of inputs
       inputs[step] = sequence:index(1, offsets)
       -- incement indices
@@ -60,31 +52,31 @@ while true do
       end
       targets[step] = sequence:index(1, offsets)
    end
-   
+
    -- 2. forward sequence through rnn
-   
-   rnn:zeroGradParameters() 
+
+   rnn:zeroGradParameters()
    rnn:forget() -- forget all past time-steps
-   
+
    local outputs, err = {}, 0
-   for step=1,rho do
+   for step=1,seqlen do
       outputs[step] = rnn:forward(inputs[step])
       err = err + criterion:forward(outputs[step], targets[step])
    end
-   
+
    print(string.format("Iteration %d ; NLL err = %f ", iteration, err))
 
    -- 3. backward sequence through rnn (i.e. backprop through time)
-   
+
    local gradOutputs, gradInputs = {}, {}
-   for step=rho,1,-1 do -- reverse order of forward calls
+   for step=seqlen,1,-1 do -- reverse order of forward calls
       gradOutputs[step] = criterion:backward(outputs[step], targets[step])
       gradInputs[step] = rnn:backward(inputs[step], gradOutputs[step])
    end
-   
+
    -- 4. update
-   
+
    rnn:updateParameters(lr)
-   
+
    iteration = iteration + 1
 end
diff --git a/examples/simple-sequencer-network.lua b/examples/simple-sequencer-network.lua
index 09add26..9f2c9c7 100644
--- a/examples/simple-sequencer-network.lua
+++ b/examples/simple-sequencer-network.lua
@@ -1,22 +1,15 @@
 require 'rnn'
 
--- hyper-parameters 
+-- hyper-parameters
 batchSize = 8
-rho = 5 -- sequence length
+seqlen = 5 -- sequence length
 hiddenSize = 7
 nIndex = 10
 lr = 0.1
 
 
--- build simple recurrent neural network
-local r = nn.Recurrent(
-   hiddenSize, nn.LookupTable(nIndex, hiddenSize), 
-   nn.Linear(hiddenSize, hiddenSize), nn.Sigmoid(), 
-   rho
-)
-
 local rnn = nn.Sequential()
-   :add(r)
+   :add(nn.LookupRNN(nIndex, hiddenSize))
    :add(nn.Linear(hiddenSize, nIndex))
    :add(nn.LogSoftMax())
 
@@ -43,10 +36,10 @@ offsets = torch.LongTensor(offsets)
 -- training
 local iteration = 1
 while true do
-   -- 1. create a sequence of rho time-steps
-   
+   -- 1. create a sequence of seqlen time-steps
+
    local inputs, targets = {}, {}
-   for step=1,rho do
+   for step=1,seqlen do
       -- a batch of inputs
       inputs[step] = sequence:index(1, offsets)
       -- incement indices
@@ -58,24 +51,24 @@ while true do
       end
       targets[step] = sequence:index(1, offsets)
    end
-   
+
    -- 2. forward sequence through rnn
-   
-   rnn:zeroGradParameters() 
-   
+
+   rnn:zeroGradParameters()
+
    local outputs = rnn:forward(inputs)
    local err = criterion:forward(outputs, targets)
-   
+
    print(string.format("Iteration %d ; NLL err = %f ", iteration, err))
 
    -- 3. backward sequence through rnn (i.e. backprop through time)
-   
+
    local gradOutputs = criterion:backward(outputs, targets)
    local gradInputs = rnn:backward(inputs, gradOutputs)
-   
+
    -- 4. update
-   
+
    rnn:updateParameters(lr)
-   
+
    iteration = iteration + 1
 end
diff --git a/scripts/evaluate-rva.lua b/scripts/evaluate-rva.lua
index 884dc33..10c0d04 100644
--- a/scripts/evaluate-rva.lua
+++ b/scripts/evaluate-rva.lua
@@ -1,4 +1,4 @@
-require 'dp'
+local dl = require 'dataload'
 require 'rnn'
 require 'optim'
 
@@ -11,45 +11,45 @@ cmd = torch.CmdLine()
 cmd:text()
 cmd:text('Evaluate a Recurrent Model for Visual Attention')
 cmd:text('Options:')
-cmd:option('--xpPath', '', 'path to a previously saved model')
-cmd:option('--cuda', false, 'model was saved with cuda')
-cmd:option('--evalTest', false, 'model was saved with cuda')
-cmd:option('--stochastic', false, 'evaluate the model stochatically. Generate glimpses stochastically')
-cmd:option('--dataset', 'Mnist', 'which dataset to use : Mnist | TranslattedMnist | etc')
-cmd:option('--overwrite', false, 'overwrite checkpoint')
+cmd:option('-xplogpath', '', 'path to an xplog generated with examples/recurrent-visual-attention.lua')
+cmd:option('-cuda', false, 'model was saved with cuda')
+cmd:option('-evaltest', false, 'evaluate performance on test set')
+cmd:option('-stochastic', false, 'evaluate the model stochastically. Generate glimpses stochastically')
+cmd:option('-dataset', 'Mnist', 'which dataset to use : Mnist | TranslatedMnist | etc')
+cmd:option('-overwrite', false, 'overwrite checkpoint')
 cmd:text()
 local opt = cmd:parse(arg or {})
 
 -- check that saved model exists
-assert(paths.filep(opt.xpPath), opt.xpPath..' does not exist')
+assert(paths.filep(opt.xplogpath), opt.xplogpath..' does not exist')
 
 if opt.cuda then
    require 'cunn'
 end
 
-xp = torch.load(opt.xpPath)
-model = xp:model().module 
-tester = xp:tester() or xp:validator() -- dp.Evaluator
-tester:sampler()._epoch_size = nil
-conf = tester:feedback() -- dp.Confusion
-cm = conf._cm -- optim.ConfusionMatrix
+xplog = torch.load(opt.xplogpath)
+model = torch.type(xplog.model) == 'nn.Serial' and xplog.model.modules[1] or xplog.model
 
-print("Last evaluation of "..(xp:tester() and 'test' or 'valid').." set :")
-print(cm)
+print("Last evaluation of validation set")
+print(xplog.validcm[#xplog.validcm])
 
+--[[
 if opt.dataset == 'TranslatedMnist' then
    ds = torch.checkpoint(
       paths.concat(dp.DATA_DIR, 'checkpoint/dp.TranslatedMnist_test.t7'),
-      function() 
-         local ds = dp[opt.dataset]{load_all=false} 
+      function()
+         local ds = dp[opt.dataset]{load_all=false}
          ds:loadTest()
          return ds
-         end, 
+         end,
       opt.overwrite
    )
 else
    ds = dp[opt.dataset]()
 end
+--]]
+assert(opt.dataset == 'Mnist')
+trainset, validset, testset = dl.loadMNIST()
 
 ra = model:findModules('nn.RecurrentAttention')[1]
 sg = model:findModules('nn.SpatialGlimpse')[1]
@@ -60,18 +60,21 @@ for i=1,#ra.actions do
    rn.stochastic = opt.stochastic
 end
 
-if opt.evalTest then
-   conf:reset()
-   tester:propagateEpoch(ds:testSet())
+local testcm = optim.ConfusionMatrix(10)
+if opt.evaltest then
+   model:evaluate()
+   for i, input, target in testset:subiter(opt.batchsize) do
+      target = xplog.targetmodule:forward(target)
+      local output = model:forward(input)
+      testcm:batchAdd(output[1], target)
+   end
 
-   print((opt.stochastic and "Stochastic" or "Deterministic") .. "evaluation of test set :")
-   print(cm)
+   print((opt.stochastic and "Stochastic" or "Deterministic") .. " evaluation of test set :")
+   print(testcm)
 end
 
-inputs = ds:get('test','inputs')
-targets = ds:get('test','targets', 'b')
 
-input = inputs:narrow(1,1,10)
+input = testset.inputs:narrow(1,1,10)
 model:training() -- otherwise the rnn doesn't save intermediate time-step states
 if not opt.stochastic then
    for i=1,#ra.actions do
@@ -105,8 +108,6 @@ function drawBox(img, bbox, channel)
 end
 
 locations = ra.actions
-
-input = nn.Convert(ds:ioShapes(),'bchw'):forward(input)
 glimpses = {}
 patches = {}
 
@@ -118,7 +119,7 @@ for i=1,input:size(1) do
       glimpses[j] = glimpse
       local patch = patches[j] or {}
       patches[j] = patch
-      
+
       local xy = location[i]
       -- (-1,-1) top left corner, (1,1) bottom right corner of image
       local x, y = xy:select(1,1), xy:select(1,2)
@@ -126,7 +127,7 @@ for i=1,input:size(1) do
       x, y = (x+1)/2, (y+1)/2
       -- (1,1), (input:size(3), input:size(4))
       x, y = x*(input:size(3)-1)+1, y*(input:size(4)-1)+1
-      
+
       local gimg = img:clone()
       for d=1,sg.depth do
          local size = sg.height*(sg.scale^(d-1))
@@ -134,15 +135,10 @@ for i=1,input:size(1) do
          drawBox(gimg, bbox, 1)
       end
       glimpse[i] = gimg
-      
-      local sg_, ps
-      if j == 1 then
-         sg_ = ra.rnn.initialModule:findModules('nn.SpatialGlimpse')[1]
-      else
-         sg_ = ra.rnn.sharedClones[j]:findModules('nn.SpatialGlimpse')[1]
-      end
+
+      local sg_ = ra.rnn.sharedClones[j]:findModules('nn.SpatialGlimpse')[1]
       patch[i] = image.scale(img:clone():float(), sg_.output[i]:narrow(1,1,1):float())
-      
+
       collectgarbage()
    end
 end