diff --git a/VolumetricDilatedMaxPooling.lua b/VolumetricDilatedMaxPooling.lua
new file mode 100644
index 000000000..050e2c917
--- /dev/null
+++ b/VolumetricDilatedMaxPooling.lua
@@ -0,0 +1,64 @@
+local THNN = require 'nn.THNN'
+local VolumetricDilatedMaxPooling, parent = torch.class('nn.VolumetricDilatedMaxPooling', 'nn.VolumetricMaxPooling')
+
+function VolumetricDilatedMaxPooling:__init(kT, kW, kH, dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH)
+   parent.__init(self, kT, kW, kH, dT, dW, dH, padT, padW, padH)
+
+   self.dilationT = dilationT or 1
+   self.dilationW = dilationW or 1
+   self.dilationH = dilationH or 1
+
+end
+
+function VolumetricDilatedMaxPooling:updateOutput(input)
+   local dims = input:dim()
+   self.itime = input:size(dims-2)
+   self.iheight = input:size(dims-1)
+   self.iwidth = input:size(dims)
+
+   self.indices = self.indices or input.new()
+   input.THNN.VolumetricDilatedMaxPooling_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.indices:cdata(),
+      self.kT, self.kW, self.kH,
+      self.dT, self.dW, self.dH,
+      self.padT, self.padW, self.padH,
+      self.dilationT, self.dilationW, self.dilationH,
+      self.ceil_mode
+   )
+   return self.output
+end
+
+function VolumetricDilatedMaxPooling:updateGradInput(input, gradOutput)
+   input.THNN.VolumetricDilatedMaxPooling_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.indices:cdata(),
+      self.dT, self.dW, self.dH,
+      self.padT, self.padW, self.padH,
+      self.dilationT, self.dilationW, self.dilationH
+   )
+   return self.gradInput
+end
+
+function VolumetricDilatedMaxPooling:clearState()
+   if self.indices then 
+      self.indices:set() 
+   end
+   return parent.clearState(self)
+end
+
+function VolumetricDilatedMaxPooling:__tostring__()
+   local s =  string.format('%s(%dx%dx%d, %d,%d,%d', torch.type(self),
+                            self.kT, self.kW, self.kH, self.dT, self.dW, self.dH)
+   if (self.padT or self.padW or self.padH) and
+      (self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0) then
+      s = s .. ', ' .. self.padT.. ',' .. self.padW .. ','.. self.padH
+   end
+   s = s .. ', ' .. self.dilationT .. ',' .. self.dilationW .. ',' .. self.dilationH
+   s = s .. ')'
+
+   return s
+end
diff --git a/doc/convolution.md b/doc/convolution.md
index 96d92d9b3..b1a0d4c7c 100644
--- a/doc/convolution.md
+++ b/doc/convolution.md
@@ -37,6 +37,7 @@ a kernel for computing the weighted average in a neighborhood ;
     * [VolumetricFullConvolution](#nn.VolumetricFullConvolution) : a 3D full convolution over an input video (a sequence of images) ;
     * [VolumetricDilatedConvolution](#nn.VolumetricDilatedConvolution) : a 3D dilated convolution over an input image ;
     * [VolumetricMaxPooling](#nn.VolumetricMaxPooling) : a 3D max-pooling operation over an input video.
+    * [VolumetricDilatedMaxPooling](#nn.VolumetricDilatedMaxPooling) : a 3D dilated max-pooling operation over an input video ;
     * [VolumetricAveragePooling](#nn.VolumetricAveragePooling) : a 3D average-pooling operation over an input video.
     * [VolumetricMaxUnpooling](#nn.VolumetricMaxUnpooling) : a 3D max-unpooling operation.
     * [VolumetricReplicationPadding](#nn.VolumetricReplicationPadding) : Pads a volumetric feature map with the value at the edge of the input borders. ;
@@ -1022,6 +1023,30 @@ Applies 3D max-pooling operation in `kTxkWxkH` regions by step size
 `dTxdWxdH` steps. The number of output features is equal to the number of
 input planes / dT. The input can optionally be padded with zeros. Padding should be smaller than half of kernel size.  That is, `padT < kT/2`, `padW < kW/2` and `padH < kH/2`.
 
+<a name="nn.VolumetricDilatedMaxPooling"></a>
+### VolumetricDilatedMaxPooling ###
+
+```lua
+module = nn.VolumetricDilatedMaxPooling(kT, kW, kH [, dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH])
+```
+
+Also sometimes referred to as **atrous pooling**.
+Applies 3D dilated max-pooling operation in `kTxkWxkH` regions by step size
+`dTxdWxdH` steps. The number of output features is equal to the number of
+input planes. If `dilationT`, `dilationW` and `dilationH` are not provided, this is equivalent to performing normal `nn.VolumetricMaxPooling`.
+
+If the input image is a 4D tensor `nInputPlane x depth x height x width`, the output
+image size will be `nOutputPlane x otime x oheight x owidth` where
+
+```lua
+otime  = op((depth - (dilationT * (kT - 1) + 1) + 2*padT) / dT + 1)
+owidth  = op((width - (dilationW * (kW - 1) + 1) + 2*padW) / dW + 1)
+oheight = op((height - (dilationH * (kH - 1) + 1) + 2*padH) / dH + 1)
+```
+
+`op` is a rounding operator. By default, it is `floor`. It can be changed
+by calling `:ceil()` or `:floor()` methods.
+
 <a name="nn.VolumetricAveragePooling"></a>
 ### VolumetricAveragePooling ###
 
diff --git a/init.lua b/init.lua
index 98edfc532..70027a18c 100644
--- a/init.lua
+++ b/init.lua
@@ -128,6 +128,7 @@ require('nn.VolumetricConvolution')
 require('nn.VolumetricFullConvolution')
 require('nn.VolumetricDilatedConvolution')
 require('nn.VolumetricMaxPooling')
+require('nn.VolumetricDilatedMaxPooling')
 require('nn.VolumetricMaxUnpooling')
 require('nn.VolumetricAveragePooling')
 require('nn.VolumetricBatchNormalization')
diff --git a/lib/THNN/generic/THNN.h b/lib/THNN/generic/THNN.h
index 319ffc798..d7ecb13e8 100644
--- a/lib/THNN/generic/THNN.h
+++ b/lib/THNN/generic/THNN.h
@@ -1122,6 +1122,26 @@ TH_API void THNN_(VolumetricMaxPooling_updateGradInput)(
           int dT, int dW, int dH,
           int pT, int pW, int pH);
 
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
+          int dilationT, int dilationW, int dilationH,
+          bool ceilMode);
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
+          int dilationT, int dilationW, int dilationH);
+
 TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)(
           THNNState *state,
           THTensor *input,
diff --git a/lib/THNN/generic/VolumetricDilatedMaxPooling.c b/lib/THNN/generic/VolumetricDilatedMaxPooling.c
new file mode 100644
index 000000000..0db41aef3
--- /dev/null
+++ b/lib/THNN/generic/VolumetricDilatedMaxPooling.c
@@ -0,0 +1,415 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.c"
+#else
+
+static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
+          real *input_p,
+          real *output_p,
+          real *indz_p,
+          long nslices,
+          long itime,
+          long iwidth,
+          long iheight,
+          long otime,
+          long owidth,
+          long oheight,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH,
+          int dilationT,
+          int dilationW,
+          int dilationH)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    /* loop over output */
+    long i, j, ti;
+    for (ti = 0; ti < otime; ti++)
+    {
+      for (i = 0; i < oheight; i++)
+      {
+        for (j = 0; j < owidth; j++)
+        {
+          /* local pointers */
+
+          long start_t = ti * dT - pT;
+          long start_h = i * dH - pH;
+          long start_w = j * dW - pW;
+          
+          long kernel_t = fminf(kT, kT + start_t);
+          long kernel_h = fminf(kH, kH + start_h);
+          long kernel_w = fminf(kW, kW + start_w);
+
+          while(start_t < 0)
+            start_t += dilationT;
+          while(start_h < 0)
+            start_h += dilationH;
+          while(start_w < 0)
+            start_w += dilationW;
+          
+          real *ip = input_p + k * itime * iwidth * iheight
+            + start_t * iwidth * iheight + start_h * iwidth + start_w;
+          real *op = output_p + k * otime * owidth * oheight
+            + ti * owidth * oheight + i * owidth + j;
+          real *indzp = indz_p + k * otime * owidth * oheight
+            + ti * owidth * oheight + i * owidth + j;
+
+          /* compute local max: */
+          real maxval = -THInf;
+          int x,y,z;
+          int mx, my, mz;
+
+          for (z = 0; z < kernel_t; z++)
+          {
+            for (y = 0; y < kernel_h; y++)
+            {
+              for (x = 0; x < kernel_w; x++)
+              {
+                if ((start_t + z * dilationT < itime) && (start_h + y * dilationH < iheight) && (start_w + x * dilationW < iwidth))
+                {
+                  real val = *(ip + z * dilationT * iwidth * iheight + y * dilationH * iwidth + x * dilationW);
+                  if (val > maxval)
+                  {
+                    maxval = val;
+                    // Store indices w.r.t the kernel dimension
+                    mz = z + (kT - kernel_t);
+                    my = y + (kH - kernel_h);
+                    mx = x + (kW - kernel_w);
+                  }
+                }
+              }
+            }
+          }
+
+          // set max values
+          ((unsigned char*)(indzp))[0] = mz;
+          ((unsigned char*)(indzp))[1] = my;
+          ((unsigned char*)(indzp))[2] = mx;
+          ((unsigned char*)(indzp))[3] = 0;
+
+          /* set output to local max */
+          *op = maxval;
+        }
+      }
+    }
+  }
+}
+
+void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH,
+          int dilationT,
+          int dilationW,
+          int dilationH,
+          bool ceilMode)
+{
+  long nslices;
+  long itime;
+  long iheight;
+  long iwidth;
+  long otime;
+  long oheight;
+  long owidth;
+  real *input_data;
+  real *output_data;
+  real *indices_data;
+
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+    "4D or 5D (batch-mode) tensor expected"
+  );
+
+  int dimN = 0;
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  if (input->nDimension == 5)
+  {
+    dimN++;
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2,
+    "input image smaller than kernel size"
+  );
+
+  THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2,
+    "pad should be smaller than half of kernel size"
+  );
+
+  /* sizes */
+  nslices = input->size[dimN];
+  itime   = input->size[dimt];
+  iheight = input->size[dimh];
+  iwidth  = input->size[dimw];
+  if (ceilMode)
+  {
+    otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+    oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+    owidth  = (int)(ceil((float)(iwidth  - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+  }
+  else
+  {
+    otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+    oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+    owidth  = (int)(floor((float)(iwidth  - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+  }
+
+  if (otime < 1 || owidth < 1 || oheight < 1)
+    THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+            nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth);
+
+  if (pT || pW || pH)
+  {
+    // ensure that the last pooling starts inside the image
+    if ((otime - 1)*dT >= itime + pT)
+      --otime;
+    if ((oheight - 1)*dH >= iheight + pH)
+      --oheight;
+    if ((owidth  - 1)*dW >= iwidth  + pW)
+      --owidth;
+  }
+
+  /* get contiguous input */
+  input = THTensor_(newContiguous)(input);
+
+  if (input->nDimension == 4) /* non-batch mode */
+  {
+    /* resize output */
+    THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
+    /* indices will contain ti,i,j uchar locations packed into float/double */
+    THTensor_(resize4d)(indices, nslices, otime, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+    THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
+      input_data, output_data,
+      indices_data,
+      nslices,
+      itime, iwidth, iheight,
+      otime, owidth, oheight,
+      kT, kW, kH,
+      dT, dW, dH,
+      pT, pW, pH,
+      dilationT, dilationW, dilationH
+    );
+  }
+  else /* batch mode */
+  {
+    long p;
+    long nBatch = input->size[0];
+
+    long istride = nslices * itime * iwidth * iheight;
+    long ostride = nslices * otime * owidth * oheight;
+
+    /* resize output */
+    THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
+    /* indices will contain ti,i,j locations for each output point */
+    THTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+#pragma omp parallel for private(p)
+    for (p=0; p < nBatch; p++)
+    {
+      THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
+        input_data   + p * istride,
+        output_data  + p * ostride,
+        indices_data + p * ostride,
+        nslices,
+        itime, iwidth, iheight,
+        otime, owidth, oheight,
+        kT, kW, kH,
+        dT, dW, dH,
+        pT, pW, pH,
+        dilationT, dilationW, dilationH
+      );
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
+          real *gradInput_p,
+          real *gradOutput_p,
+          real *indz_p,
+          long nslices,
+          long itime,
+          long iwidth,
+          long iheight,
+          long otime,
+          long owidth,
+          long oheight,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH,
+          int dilationT,
+          int dilationW,
+          int dilationH)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    real *gradInput_p_k  = gradInput_p  + k * itime * iwidth * iheight;
+    real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
+    real *indz_p_k = indz_p + k * otime * owidth * oheight;
+
+    /* calculate max points */
+    long ti, i, j;
+    for (ti = 0; ti < otime; ti++)
+    {
+      for (i = 0; i < oheight; i++)
+      {
+        for (j = 0; j < owidth; j++)
+        {
+          /* retrieve position of max */
+          real * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j];
+          long maxti = ((unsigned char*)(indzp))[0] * dilationT + ti * dT - pT;
+          long maxi  = ((unsigned char*)(indzp))[1] * dilationH + i * dH - pH;
+          long maxj  = ((unsigned char*)(indzp))[2] * dilationW + j * dW - pW;
+
+          /* update gradient */
+          gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] +=
+            gradOutput_p_k[ti * oheight * owidth + i * owidth + j];
+        }
+      }
+    }
+  }
+}
+
+void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH,
+          int dilationT,
+          int dilationW,
+          int dilationH)
+{
+  int nslices;
+  int itime;
+  int iheight;
+  int iwidth;
+  int otime;
+  int oheight;
+  int owidth;
+  real *gradInput_data;
+  real *gradOutput_data;
+  real *indices_data;
+
+  int dimN = 0;
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  if (input->nDimension == 5)
+  {
+    dimN++;
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimN];
+  itime = input->size[dimt];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  otime = gradOutput->size[dimt];
+  oheight = gradOutput->size[dimh];
+  owidth = gradOutput->size[dimw];
+
+  /* get raw pointers */
+  gradInput_data = THTensor_(data)(gradInput);
+  gradOutput_data = THTensor_(data)(gradOutput);
+  indices_data = THTensor_(data)(indices);
+
+  /* backprop */
+  if (input->nDimension == 4) /* non-batch mode*/
+  {
+    THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
+      gradInput_data, gradOutput_data,
+      indices_data,
+      nslices,
+      itime, iwidth, iheight,
+      otime, owidth, oheight,
+      dT, dW, dH,
+      pT, pW, pH,
+      dilationT, dilationW, dilationH
+    );
+  }
+  else /* batch mode */
+  {
+    long p;
+    long nBatch = input->size[0];
+
+    long istride = nslices * itime * iwidth * iheight;
+    long ostride = nslices * otime * owidth * oheight;
+
+#pragma omp parallel for private(p)
+    for (p = 0; p < nBatch; p++)
+    {
+      THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
+        gradInput_data + p * istride,
+        gradOutput_data + p * ostride,
+        indices_data + p * ostride,
+        nslices,
+        itime, iwidth, iheight,
+        otime, owidth, oheight,
+        dT, dW, dH,
+        pT, pW, pH,
+        dilationT, dilationW, dilationH
+      );
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/generic/VolumetricMaxPooling.c b/lib/THNN/generic/VolumetricMaxPooling.c
index 053c02c02..dc376e6a7 100644
--- a/lib/THNN/generic/VolumetricMaxPooling.c
+++ b/lib/THNN/generic/VolumetricMaxPooling.c
@@ -2,101 +2,6 @@
 #define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c"
 #else
 
-static void THNN_(VolumetricMaxPooling_updateOutput_frame)(
-          real *input_p,
-          real *output_p,
-          real *indz_p,
-          long nslices,
-          long itime,
-          long iwidth,
-          long iheight,
-          long otime,
-          long owidth,
-          long oheight,
-          int kT,
-          int kW,
-          int kH,
-          int dT,
-          int dW,
-          int dH,
-          int pT,
-          int pW,
-          int pH)
-{
-  long k;
-#pragma omp parallel for private(k)
-  for (k = 0; k < nslices; k++)
-  {
-    /* loop over output */
-    long i, j, ti;
-    for (ti = 0; ti < otime; ti++)
-    {
-      for (i = 0; i < oheight; i++)
-      {
-        for (j = 0; j < owidth; j++)
-        {
-          /* local pointers */
-
-          long start_t = ti * dT - pT;
-          long start_h = i * dH - pH;
-          long start_w = j * dW - pW;
-
-          long kernel_t = fminf(kT, kT + start_t);
-          long kernel_h = fminf(kH, kH + start_h);
-          long kernel_w = fminf(kW, kW + start_w);
-
-          start_t = fmaxf(start_t, 0);
-          start_h = fmaxf(start_h, 0);
-          start_w = fmaxf(start_w, 0);
-
-          real *ip = input_p + k * itime * iwidth * iheight
-            + start_t * iwidth * iheight + start_h * iwidth + start_w;
-          real *op = output_p + k * otime * owidth * oheight
-            + ti * owidth * oheight + i * owidth + j;
-          real *indzp = indz_p + k * otime * owidth * oheight
-            + ti * owidth * oheight + i * owidth + j;
-
-          /* compute local max: */
-          real maxval = -THInf;
-          int x,y,z;
-          int mx, my, mz;
-
-          for (z = 0; z < kernel_t; z++)
-          {
-            for (y = 0; y < kernel_h; y++)
-            {
-              for (x = 0; x < kernel_w; x++)
-              {
-                if ((start_t + z < itime) && (start_h + y < iheight) && (start_w + x < iwidth))
-                {
-                  real val = *(ip + z * iwidth * iheight + y * iwidth + x);
-                  if (val > maxval)
-                  {
-                    maxval = val;
-                    // Store indices w.r.t the kernel dimension
-                    mz = z + (kT - kernel_t);
-                    my = y + (kH - kernel_h);
-                    mx = x + (kW - kernel_w);
-                  }
-                }
-              }
-            }
-          }
-
-          // set max values
-          ((unsigned char*)(indzp))[0] = mz;
-          ((unsigned char*)(indzp))[1] = my;
-          ((unsigned char*)(indzp))[2] = mx;
-          ((unsigned char*)(indzp))[3] = 0;
-
-          /* set output to local max */
-          *op = maxval;
-        }
-      }
-    }
-  }
-}
-
 void THNN_(VolumetricMaxPooling_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -113,181 +18,10 @@ void THNN_(VolumetricMaxPooling_updateOutput)(
           int pH,
           bool ceilMode)
 {
-  long nslices;
-  long itime;
-  long iheight;
-  long iwidth;
-  long otime;
-  long oheight;
-  long owidth;
-  real *input_data;
-  real *output_data;
-  real *indices_data;
-
-  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
-    "4D or 5D (batch-mode) tensor expected"
-  );
-
-  int dimN = 0;
-  int dimt = 1;
-  int dimh = 2;
-  int dimw = 3;
-
-  if (input->nDimension == 5)
-  {
-    dimN++;
-    dimt++;
-    dimh++;
-    dimw++;
-  }
-
-  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2,
-    "input image smaller than kernel size"
-  );
-
-  THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2,
-    "pad should be smaller than half of kernel size"
-  );
-
-  /* sizes */
-  nslices = input->size[dimN];
-  itime   = input->size[dimt];
-  iheight = input->size[dimh];
-  iwidth  = input->size[dimw];
-  if (ceilMode)
-  {
-    otime   = (int)(ceil((float)(itime   - kT + 2 * pT) / dT) + 1);
-    oheight = (int)(ceil((float)(iheight - kH + 2 * pH) / dH) + 1);
-    owidth  = (int)(ceil((float)(iwidth  - kW + 2 * pW) / dW) + 1);
-  }
-  else
-  {
-    otime   = (int)(floor((float)(itime   - kT + 2 * pT) / dT) + 1);
-    oheight = (int)(floor((float)(iheight - kH + 2 * pH) / dH) + 1);
-    owidth  = (int)(floor((float)(iwidth  - kW + 2 * pW) / dW) + 1);
-  }
-
-  if (pT || pW || pH)
-  {
-    // ensure that the last pooling starts inside the image
-    if ((otime - 1)*dT >= itime + pT)
-      --otime;
-    if ((oheight - 1)*dH >= iheight + pH)
-      --oheight;
-    if ((owidth  - 1)*dW >= iwidth  + pW)
-      --owidth;
-  }
-
-  /* get contiguous input */
-  input = THTensor_(newContiguous)(input);
-
-  if (input->nDimension == 4) /* non-batch mode */
-  {
-    /* resize output */
-    THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
-    /* indices will contain ti,i,j uchar locations packed into float/double */
-    THTensor_(resize4d)(indices, nslices, otime, oheight, owidth);
-
-    input_data = THTensor_(data)(input);
-    output_data = THTensor_(data)(output);
-    indices_data = THTensor_(data)(indices);
-
-    THNN_(VolumetricMaxPooling_updateOutput_frame)(
-      input_data, output_data,
-      indices_data,
-      nslices,
-      itime, iwidth, iheight,
-      otime, owidth, oheight,
-      kT, kW, kH,
-      dT, dW, dH,
-      pT, pW, pH
-    );
-  }
-  else /* batch mode */
-  {
-    long p;
-    long nBatch = input->size[0];
-
-    long istride = nslices * itime * iwidth * iheight;
-    long ostride = nslices * otime * owidth * oheight;
-
-    /* resize output */
-    THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
-    /* indices will contain ti,i,j locations for each output point */
-    THTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth);
-
-    input_data = THTensor_(data)(input);
-    output_data = THTensor_(data)(output);
-    indices_data = THTensor_(data)(indices);
-
-#pragma omp parallel for private(p)
-    for (p=0; p < nBatch; p++)
-    {
-      THNN_(VolumetricMaxPooling_updateOutput_frame)(
-        input_data   + p * istride,
-        output_data  + p * ostride,
-        indices_data + p * ostride,
-        nslices,
-        itime, iwidth, iheight,
-        otime, owidth, oheight,
-        kT, kW, kH,
-        dT, dW, dH,
-        pT, pW, pH
-      );
-    }
-  }
-
-  /* cleanup */
-  THTensor_(free)(input);
-}
-
-static void THNN_(VolumetricMaxPooling_updateGradInput_frame)(
-          real *gradInput_p,
-          real *gradOutput_p,
-          real *indz_p,
-          long nslices,
-          long itime,
-          long iwidth,
-          long iheight,
-          long otime,
-          long owidth,
-          long oheight,
-          int dT,
-          int dW,
-          int dH,
-          int pT,
-          int pW,
-          int pH)
-{
-  long k;
-#pragma omp parallel for private(k)
-  for (k = 0; k < nslices; k++)
-  {
-    real *gradInput_p_k  = gradInput_p  + k * itime * iwidth * iheight;
-    real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
-    real *indz_p_k = indz_p + k * otime * owidth * oheight;
-
-    /* calculate max points */
-    long ti, i, j;
-    for (ti = 0; ti < otime; ti++)
-    {
-      for (i = 0; i < oheight; i++)
-      {
-        for (j = 0; j < owidth; j++)
-        {
-          /* retrieve position of max */
-          real * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j];
-          long maxti = ((unsigned char*)(indzp))[0] + ti * dT - pT;
-          long maxi  = ((unsigned char*)(indzp))[1] + i * dH - pH;
-          long maxj  = ((unsigned char*)(indzp))[2] + j * dW - pW;
-
-          /* update gradient */
-          gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] +=
-            gradOutput_p_k[ti * oheight * owidth + i * owidth + j];
-        }
-      }
-    }
-  }
+  THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+          state, input, output, indices,
+          kT, kW, kH, dT, dW, dH,
+          pT, pW, pH, 1, 1, 1, ceilMode);
 }
 
 void THNN_(VolumetricMaxPooling_updateGradInput)(
@@ -303,90 +37,9 @@ void THNN_(VolumetricMaxPooling_updateGradInput)(
           int pW,
           int pH)
 {
-  int nslices;
-  int itime;
-  int iheight;
-  int iwidth;
-  int otime;
-  int oheight;
-  int owidth;
-  real *gradInput_data;
-  real *gradOutput_data;
-  real *indices_data;
-
-  int dimN = 0;
-  int dimt = 1;
-  int dimh = 2;
-  int dimw = 3;
-
-  /* get contiguous gradOutput */
-  gradOutput = THTensor_(newContiguous)(gradOutput);
-
-  /* resize */
-  THTensor_(resizeAs)(gradInput, input);
-  THTensor_(zero)(gradInput);
-
-  if (input->nDimension == 5)
-  {
-    dimN++;
-    dimt++;
-    dimh++;
-    dimw++;
-  }
-
-  /* sizes */
-  nslices = input->size[dimN];
-  itime = input->size[dimt];
-  iheight = input->size[dimh];
-  iwidth = input->size[dimw];
-  otime = gradOutput->size[dimt];
-  oheight = gradOutput->size[dimh];
-  owidth = gradOutput->size[dimw];
-
-  /* get raw pointers */
-  gradInput_data = THTensor_(data)(gradInput);
-  gradOutput_data = THTensor_(data)(gradOutput);
-  indices_data = THTensor_(data)(indices);
-
-  /* backprop */
-  if (input->nDimension == 4) /* non-batch mode*/
-  {
-    THNN_(VolumetricMaxPooling_updateGradInput_frame)(
-      gradInput_data, gradOutput_data,
-      indices_data,
-      nslices,
-      itime, iwidth, iheight,
-      otime, owidth, oheight,
-      dT, dW, dH,
-      pT, pW, pH
-    );
-  }
-  else /* batch mode */
-  {
-    long p;
-    long nBatch = input->size[0];
-
-    long istride = nslices * itime * iwidth * iheight;
-    long ostride = nslices * otime * owidth * oheight;
-
-#pragma omp parallel for private(p)
-    for (p = 0; p < nBatch; p++)
-    {
-      THNN_(VolumetricMaxPooling_updateGradInput_frame)(
-        gradInput_data + p * istride,
-        gradOutput_data + p * ostride,
-        indices_data + p * ostride,
-        nslices,
-        itime, iwidth, iheight,
-        otime, owidth, oheight,
-        dT, dW, dH,
-        pT, pW, pH
-      );
-    }
-  }
-
-  /* cleanup */
-  THTensor_(free)(gradOutput);
+  THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+          state, input, gradOutput, gradInput, indices,
+          dT, dW, dH, pT, pW, pH, 1, 1, 1);
 }
 
 #endif
diff --git a/lib/THNN/init.c b/lib/THNN/init.c
index d26c509d6..c75fa8871 100644
--- a/lib/THNN/init.c
+++ b/lib/THNN/init.c
@@ -172,6 +172,9 @@
 #include "generic/VolumetricMaxPooling.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/VolumetricDilatedMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/VolumetricMaxUnpooling.c"
 #include "THGenerateFloatTypes.h"
 
diff --git a/test.lua b/test.lua
index 6673fed1d..0b57626a8 100644
--- a/test.lua
+++ b/test.lua
@@ -4223,6 +4223,55 @@ function nntest.VolumetricMaxPooling()
    mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
 end
 
+function nntest.VolumetricDilatedMaxPooling()
+   for _,ceil_mode in pairs({true,false}) do
+      local from = math.random(2,3)
+      local kt = math.random(3,4)
+      local ki = math.random(3,4)
+      local kj = math.random(3,4)
+      local st = math.random(2,3)
+      local si = math.random(2,3)
+      local sj = math.random(2,3)
+      local outt = math.random(3,4)
+      local outi = math.random(3,4)
+      local outj = math.random(3,4)
+      local padT = math.min(math.random(0,1),math.floor(kt/2))
+      local padW = math.min(math.random(0,1),math.floor(ki/2))
+      local padH =  math.min(math.random(0,1),math.floor(kj/2))
+      local dilationT = math.random(1,3)
+      local dilationW = math.random(1,3)
+      local dilationH = math.random(1,3)
+      local int = (outt-1)*st+(dilationT*(kt-1)+1)-2*padT
+      local ini = (outi-1)*si+(dilationW*(ki-1)+1)-2*padW
+      local inj = (outj-1)*sj+(dilationH*(kj-1)+1)-2*padH
+
+      local ceil_string = ceil_mode and 'ceil' or 'floor'
+      local module = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padT,padW,padH,dilationT,dilationW,dilationH)
+      if ceil_mode then module:ceil() else module:floor() end
+      local input = torch.rand(from,int,inj,ini)
+
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state ')
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+      mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+      -- batch
+      local nbatch = math.random(2,5)
+      input = torch.rand(nbatch,from,int,inj,ini)
+      module = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padT,padW,padH,dilationT,dilationW,dilationH)
+      if ceil_mode then module:ceil() else module:floor() end
+
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state (Batch)')
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
+      mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+  end
+end
+
 function nntest.VolumetricMaxUnpooling()
    local from = math.random(2,3)
    local kt = math.random(3,4)