提交 6117f98b authored 作者: Nicolas Ballas's avatar Nicolas Ballas 提交者: --global

add cudnnv3 conv3d

上级 34223240
...@@ -127,6 +127,30 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) { ...@@ -127,6 +127,30 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
""" % dict(var=var, err=err, desc=desc, fail=fail) """ % dict(var=var, err=err, desc=desc, fail=fail)
def c_set_tensorNd(var, nb_dim, desc, err, fail):
    """Return C code filling the cudnnTensorNdDescriptor `desc` from `var`.

    :param var: name of the CudaNdarray C variable
    :param nb_dim: number of dimensions of the tensor (Python int)
    :param desc: name of the cudnnTensorDescriptor_t C variable
    :param err: name of the cudnnStatus_t C variable receiving the status
    :param fail: code to run on error (usually sub['fail'])
    """
    # Note: %% escapes survive the Python %-formatting below so that the
    # generated C still contains a literal %s for PyErr_Format.
    return """
{
    %(err)s = cudnnSetTensorNdDescriptor(
        %(desc)s, CUDNN_DATA_FLOAT, %(nb_dim)d,
        CudaNdarray_HOST_DIMS(%(var)s),
        CudaNdarray_HOST_STRIDES(%(var)s));
    if (%(err)s != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "could not set tensorNd descriptor: %%s, nb_dim=%(nb_dim)d",
                     cudnnGetErrorString(%(err)s));
        %(fail)s
    }
}
""" % dict(var=var, nb_dim=nb_dim, err=err, desc=desc, fail=fail)
class DnnBase(GpuOp, COp): class DnnBase(GpuOp, COp):
""" """
...@@ -359,10 +383,146 @@ AddConfigVar('dnn.conv.workmem_bwd', ...@@ -359,10 +383,146 @@ AddConfigVar('dnn.conv.workmem_bwd',
EnumStr('deterministic', 'none', 'fft', 'guess'), EnumStr('deterministic', 'none', 'fft', 'guess'),
in_c_key=False) in_c_key=False)
class GpuDnnConv3dDesc(GpuOp):
    """This Op builds a 3d convolution descriptor for use in the other
    3d convolution operations.

    See the doc of :func:`dnn_conv` for a description of the parameters.
    """

    __props__ = ('border_mode', 'subsample', 'conv_mode')

    def c_headers(self):
        return ['cudnn.h', 'cudnn_helper.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__)]

    def c_libraries(self):
        return ['cudnn']

    def c_compiler(self):
        return NVCC_compiler

    def __init__(self, border_mode, subsample=(1, 1, 1), conv_mode='conv'):
        # Fixed: the default subsample was (1, 1), which always tripped
        # the len(subsample) == 3 assertion below.
        # A single int means the same padding on all three dimensions.
        if isinstance(border_mode, int):
            border_mode = (border_mode, border_mode, border_mode)
        if isinstance(border_mode, tuple):
            pad_d, pad_h, pad_w = map(int, border_mode)
            border_mode = (pad_d, pad_h, pad_w)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full')):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", an integer or a pair of'
                ' integers'.format(border_mode))
        self.border_mode = border_mode
        assert len(subsample) == 3
        self.subsample = subsample
        assert conv_mode in ('conv', 'cross')
        self.conv_mode = conv_mode

    def make_node(self, img_shape, kern_shape):
        if img_shape.type.ndim != 1 or img_shape.type.dtype != 'int64':
            raise TypeError('img must be 1D shape tensor')
        if kern_shape.type.ndim != 1 or kern_shape.type.dtype != 'int64':
            raise TypeError('kern must be 1D shape tensor')
        return Apply(self, [img_shape, kern_shape],
                     [CDataType("cudnnConvolutionDescriptor_t")()])

    def c_code(self, node, name, inputs, outputs, sub):
        img_shape, kern_shape = inputs
        desc, = outputs

        if isinstance(self.border_mode, tuple):
            pad_d_spec, pad_h_spec, pad_w_spec = map(int, self.border_mode)
            # Fixed: `assert pad_d_spec, pad_h_spec >= 0 and ...` used the
            # comma as an assert *message*, so no padding value was checked.
            assert pad_d_spec >= 0 and pad_h_spec >= 0 and pad_w_spec >= 0
            bmode = 2
        else:
            pad_d_spec = pad_h_spec = pad_w_spec = 0
            if self.border_mode == "valid":
                bmode = 1
            else:
                assert self.border_mode == "full"
                bmode = 0

        if self.conv_mode == 'conv':
            conv_flag = 'CUDNN_CONVOLUTION'
        else:
            conv_flag = 'CUDNN_CROSS_CORRELATION'

        return """
{
  cudnnStatus_t err;
  int pad[3];

  if ((err = cudnnCreateConvolutionDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
                 "descriptor: %%s", cudnnGetErrorString(err));
    %(fail)s
  }

  if (%(bmode)d == 2) {
    pad[0] = %(pad_d_spec)d;
    pad[1] = %(pad_h_spec)d;
    pad[2] = %(pad_w_spec)d;
  } else if (%(bmode)d == 1) {
    pad[0] = 0;
    pad[1] = 0;
    pad[2] = 0;
  } else if (%(bmode)d == 0) {
    /* 'full' mode: pad each spatial dim by (kernel size - 1). */
    pad[0] = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2) - 1;
    pad[1] = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3) - 1;
    pad[2] = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 4) - 1;
  } else {
    PyErr_SetString(PyExc_ValueError, "bad border mode");
    %(fail)s
  }

#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 30
  int subsample[3];
  subsample[0] = %(subsd)d;
  subsample[1] = %(subsx)d;
  subsample[2] = %(subsy)d;
  int upscale[3] = {1, 1, 1};

  err = cudnnSetConvolutionNdDescriptor(
  %(desc)s,
  3,
  pad, subsample, upscale,
  %(conv_flag)s
  );
#else
  PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: CUDNN_VERSION must be >= 30");
  /* Fixed: fail here; previously execution fell through with err still
     CUDNN_STATUS_SUCCESS from the allocation above, hiding the error. */
  %(fail)s
#endif

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
                 cudnnGetErrorString(err));
    %(fail)s
  }
}
""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
           bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
           subsd=self.subsample[0], subsx=self.subsample[1],
           subsy=self.subsample[2],
           pad_d_spec=pad_d_spec, pad_h_spec=pad_h_spec, pad_w_spec=pad_w_spec)

    def c_code_cache_version(self):
        # Caching deliberately disabled while this Op is in development.
        return None
        # return (2, version())
# scalar constants # scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float32')) _zero = constant(numpy.asarray(0.0, dtype='float32'))
_one = constant(numpy.asarray(1.0, dtype='float32')) _one = constant(numpy.asarray(1.0, dtype='float32'))
# Scalar int32 constants: default `nb_dim` values for the conv Ops
# (4 -> 2d convolution on 4D tensors, 5 -> 3d convolution on 5D tensors).
_ifour = constant(numpy.asarray(4, dtype='int32'))
_ifive = constant(numpy.asarray(5, dtype='int32'))
def ensure_float(val, default, name): def ensure_float(val, default, name):
if val is None: if val is None:
...@@ -377,6 +537,19 @@ def ensure_float(val, default, name): ...@@ -377,6 +537,19 @@ def ensure_float(val, default, name):
raise TypeError("%s: type is not float32" % (name,)) raise TypeError("%s: type is not float32" % (name,))
return val return val
def ensure_int(val, default, name):
    """Return `val` coerced to an int32 scalar Variable.

    When `val` is None, a clone of `default` is returned instead.
    Raises TypeError when the value is not a scalar or not int32.
    Mirrors :func:`ensure_float`.
    """
    if val is None:
        return default.clone()
    if not isinstance(val, Variable):
        val = constant(val)
    if getattr(val, 'ndim', None) == 0:
        val = as_scalar(val)
    if not isinstance(val.type, theano.scalar.Scalar):
        raise TypeError("%s: expected a scalar value" % (name,))
    if not val.type.dtype == 'int32':
        raise TypeError("%s: type is not int32" % (name,))
    return val
class GpuDnnConv(DnnBase, COp): class GpuDnnConv(DnnBase, COp):
""" """
...@@ -449,7 +622,7 @@ class GpuDnnConv(DnnBase, COp): ...@@ -449,7 +622,7 @@ class GpuDnnConv(DnnBase, COp):
return [alg_def, alg_choose_def, alg_choose_time_def] + inpl_def return [alg_def, alg_choose_def, alg_choose_time_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None, beta=None): def make_node(self, img, kern, output, desc, alpha=None, beta=None, nb_dim=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
output = as_cuda_ndarray_variable(output) output = as_cuda_ndarray_variable(output)
...@@ -466,12 +639,13 @@ class GpuDnnConv(DnnBase, COp): ...@@ -466,12 +639,13 @@ class GpuDnnConv(DnnBase, COp):
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta') beta = ensure_float(beta, _zero, 'beta')
nb_dim = ensure_int(nb_dim, _ifour, 'nb_dim')
return Apply(self, [img, kern, output, desc, alpha, beta], return Apply(self, [img, kern, output, desc, alpha, beta, nb_dim],
[output.type()]) [output.type()])
def grad(self, inp, grads): def grad(self, inp, grads):
img, kerns, output, desc, alpha, beta = inp img, kerns, output, desc, alpha, beta, nb_dim = inp
top, = grads top, = grads
top = gpu_contiguous(top) top = gpu_contiguous(top)
...@@ -480,13 +654,14 @@ class GpuDnnConv(DnnBase, COp): ...@@ -480,13 +654,14 @@ class GpuDnnConv(DnnBase, COp):
d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape), desc) d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
d_nb_dim = grad_not_implemented(self, 6, nb_dim)
return [d_img * alpha, d_kerns * alpha, top * beta, return [d_img * alpha, d_kerns * alpha, top * beta,
DisconnectedType()(), d_alpha, d_beta] DisconnectedType()(), d_alpha, d_beta, d_nb_dim]
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1], [1]]
@staticmethod @staticmethod
def get_out_shape(ishape, kshape, border_mode, subsample): def get_out_shape(ishape, kshape, border_mode, subsample):
...@@ -523,6 +698,101 @@ class GpuDnnConv(DnnBase, COp): ...@@ -523,6 +698,101 @@ class GpuDnnConv(DnnBase, COp):
return [shape[2]] return [shape[2]]
class GpuDnnConv3d(GpuDnnConv):
    """
    The forward 3d convolution.

    :param image:
    :param kernel:
    :param descr: the convolution descriptor
    """
    __props__ = ('workmem', 'inplace')
    __input_name__ = ('image', 'kernel', 'output',
                      'descriptor', 'alpha', 'beta')

    def __init__(self, workmem=None, inplace=False):
        """
        :param workmem: accepted for interface compatibility but ignored:
            only 'none' works with cudnn conv 3d, so it is always forced.
        """
        # Only workmem = 'none' works with cudnn conv 3d.
        super(GpuDnnConv3d, self).__init__(workmem='none', inplace=inplace)

    def make_node(self, img, kern, output, desc, alpha=None, beta=None,
                  nb_dim=None):
        img = as_cuda_ndarray_variable(img)
        kern = as_cuda_ndarray_variable(kern)
        output = as_cuda_ndarray_variable(output)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        if output.type.ndim != 5:
            raise TypeError('output must be a 5D tensor')

        if not isinstance(desc.type, CDataType) \
                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')

        alpha = ensure_float(alpha, _one, 'alpha')
        beta = ensure_float(beta, _zero, 'beta')
        # 3d convolution works on 5D tensors.
        nb_dim = ensure_int(nb_dim, _ifive, 'nb_dim')

        return Apply(self, [img, kern, output, desc, alpha, beta, nb_dim],
                     [output.type()])

    def grad(self, inp, grads):
        img, kerns, output, desc, alpha, beta, nb_dim = inp
        top, = grads

        top = gpu_contiguous(top)

        d_img = GpuDnnConvGrad3dI()(kerns, top, gpu_alloc_empty(*img.shape),
                                    desc)
        # Fixed: the gradient-on-weights op is named GpuDnnConv3dGradW;
        # the previous name GpuDnnConvGrad3dW does not exist in this file.
        d_kerns = GpuDnnConv3dGradW()(img, top, gpu_alloc_empty(*kerns.shape),
                                      desc)
        d_alpha = grad_not_implemented(self, 4, alpha)
        d_beta = grad_not_implemented(self, 5, beta)
        d_nb_dim = grad_not_implemented(self, 6, nb_dim)

        return [d_img * alpha, d_kerns * alpha, top * beta,
                DisconnectedType()(), d_alpha, d_beta, d_nb_dim]

    @staticmethod
    def get_out_shape(ishape, kshape, border_mode, subsample):
        """
        This function computes the output shape for a 3d convolution with
        the specified parameters. `ishape` and `kshape` can be symbolic
        or scalar.
        """
        b = ishape[0]   # Number of inputs
        d = ishape[2]   # Depth of input feature maps
        h = ishape[3]   # Height of input feature maps
        w = ishape[4]   # Width of input feature maps
        nb = kshape[0]  # Number of output feature maps
        kd = kshape[2]  # Depth of each filter
        kh = kshape[3]  # Height of each filter
        kw = kshape[4]  # Width of each filter

        sd, sh, sw = subsample

        if border_mode == 'full':
            padd = kd - 1
            padh = kh - 1
            padw = kw - 1
        elif isinstance(border_mode, tuple):
            padd, padh, padw = border_mode
        else:
            assert border_mode == 'valid'
            padd = 0
            padh = 0
            padw = 0

        return (
            b, nb,
            (d + 2 * padd - kd) // sd + 1,
            (h + 2 * padh - kh) // sh + 1,
            (w + 2 * padw - kw) // sw + 1,
        )
class GpuDnnConvGradW(DnnBase, COp): class GpuDnnConvGradW(DnnBase, COp):
""" """
The convolution gradient with respect to the weights. The convolution gradient with respect to the weights.
...@@ -554,7 +824,7 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -554,7 +824,7 @@ class GpuDnnConvGradW(DnnBase, COp):
self.inplace = False self.inplace = False
def grad(self, inp, grads): def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp img, top, output, desc, alpha, beta, nb_dim = inp
kerns, = grads kerns, = grads
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
...@@ -563,13 +833,14 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -563,13 +833,14 @@ class GpuDnnConvGradW(DnnBase, COp):
d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc) d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
d_nb_dim = grad_not_implemented(self, 6, nb_dim)
return (d_img * alpha, d_top * alpha, kerns * beta, return (d_img * alpha, d_top * alpha, kerns * beta,
DisconnectedType()(), d_alpha, d_beta) DisconnectedType()(), d_alpha, d_beta, d_nb_dim)
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1], [1]]
def get_op_params(self): def get_op_params(self):
if self.inplace: if self.inplace:
...@@ -597,7 +868,7 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -597,7 +868,7 @@ class GpuDnnConvGradW(DnnBase, COp):
return inplace_def + [alg_def, alg_choose_def] return inplace_def + [alg_def, alg_choose_def]
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None): def make_node(self, img, topgrad, output, desc, alpha=None, beta=None, nb_dim=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
topgrad = as_cuda_ndarray_variable(topgrad) topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output) output = as_cuda_ndarray_variable(output)
...@@ -614,13 +885,70 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -614,13 +885,70 @@ class GpuDnnConvGradW(DnnBase, COp):
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta') beta = ensure_float(beta, _zero, 'beta')
nb_dim = ensure_int(nb_dim, _ifour, 'nb_dim')
return Apply(self, [img, topgrad, output, desc, alpha, beta], return Apply(self, [img, topgrad, output, desc, alpha, beta, nb_dim],
[output.type()]) [output.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[2]] return [shape[2]]
class GpuDnnConv3dGradW(GpuDnnConvGradW):
    """
    The 3d convolution gradient with respect to the weights.

    :param image:
    :param kernel:
    :param descr: the convolution descriptor
    """
    __props__ = ('workmem', 'inplace',)
    __input_name__ = ('image', 'grad', 'output', 'descriptor', 'alpha', 'beta')

    def __init__(self, inplace=False, workmem=None):
        # Only workmem = 'none' works with cudnn conv 3d.
        # Fixed: the previous call was `__init` (missing the trailing
        # underscores), which raised AttributeError at construction time.
        super(GpuDnnConv3dGradW, self).__init__(inplace=inplace,
                                                workmem='none')

    def grad(self, inp, grads):
        img, top, output, desc, alpha, beta, nb_dim = inp
        kerns, = grads
        kerns = gpu_contiguous(kerns)
        d_img = GpuDnnConvGrad3dI()(kerns, top, gpu_alloc_empty(*img.shape),
                                    desc)
        d_top = GpuDnnConv3d()(img, kerns, gpu_alloc_empty(*top.shape), desc)
        d_alpha = grad_not_implemented(self, 4, alpha)
        d_beta = grad_not_implemented(self, 5, beta)
        d_nb_dim = grad_not_implemented(self, 6, nb_dim)

        return (d_img * alpha, d_top * alpha, kerns * beta,
                DisconnectedType()(), d_alpha, d_beta, d_nb_dim)

    def make_node(self, img, topgrad, output, desc, alpha=None, beta=None,
                  nb_dim=None):
        img = as_cuda_ndarray_variable(img)
        topgrad = as_cuda_ndarray_variable(topgrad)
        output = as_cuda_ndarray_variable(output)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if output.type.ndim != 5:
            raise TypeError('output must be 5D tensor')

        if not isinstance(desc.type, CDataType) \
                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')

        alpha = ensure_float(alpha, _one, 'alpha')
        beta = ensure_float(beta, _zero, 'beta')
        # 3d convolution works on 5D tensors.
        nb_dim = ensure_int(nb_dim, _ifive, 'nb_dim')

        return Apply(self, [img, topgrad, output, desc, alpha, beta, nb_dim],
                     [output.type()])
class GpuDnnConvGradI(DnnBase, COp): class GpuDnnConvGradI(DnnBase, COp):
""" """
...@@ -652,7 +980,7 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -652,7 +980,7 @@ class GpuDnnConvGradI(DnnBase, COp):
self.workmem = 'none' self.workmem = 'none'
def grad(self, inp, grads): def grad(self, inp, grads):
kerns, top, output, desc, alpha, beta = inp kerns, top, output, desc, alpha, beta, nb_dim = inp
img, = grads img, = grads
img = gpu_contiguous(img) img = gpu_contiguous(img)
...@@ -661,13 +989,14 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -661,13 +989,14 @@ class GpuDnnConvGradI(DnnBase, COp):
d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc) d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
d_nb_dim = grad_not_implemented(self, 6, nb_dim)
return (d_kerns * alpha, d_top * alpha, img * beta, return (d_kerns * alpha, d_top * alpha, img * beta,
DisconnectedType()(), d_alpha, d_beta) DisconnectedType()(), d_alpha, d_beta, d_nb_dim)
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1], [1]]
def get_op_params(self): def get_op_params(self):
if self.inplace: if self.inplace:
...@@ -695,7 +1024,7 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -695,7 +1024,7 @@ class GpuDnnConvGradI(DnnBase, COp):
return inplace_def + [alg_def, alg_choose_def] return inplace_def + [alg_def, alg_choose_def]
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None): def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None, nb_dim=None):
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
topgrad = as_cuda_ndarray_variable(topgrad) topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output) output = as_cuda_ndarray_variable(output)
...@@ -712,14 +1041,72 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -712,14 +1041,72 @@ class GpuDnnConvGradI(DnnBase, COp):
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta') beta = ensure_float(beta, _zero, 'beta')
nb_dim = ensure_int(nb_dim, _ifour, 'nb_dim')
return Apply(self, [kern, topgrad, output, desc, alpha, beta], return Apply(self, [kern, topgrad, output, desc, alpha, beta, nb_dim],
[output.type()]) [output.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[2]] return [shape[2]]
class GpuDnnConvGrad3dI(GpuDnnConvGradI):
    """
    The 3d convolution gradient with respect to the inputs.

    :param image:
    :param kernel:
    :param descr: the convolution descriptor
    """
    __props__ = ('inplace',)
    __input_name__ = ('kernel', 'grad', 'output',
                      'descriptor', 'alpha', 'beta')

    def __init__(self, inplace=False):
        # Fixed: super() must be given this class, not its parent;
        # super(GpuDnnConvGradI, self) skipped GpuDnnConvGradI.__init__.
        super(GpuDnnConvGrad3dI, self).__init__(inplace)

    def grad(self, inp, grads):
        kerns, top, output, desc, alpha, beta, nb_dim = inp
        img, = grads
        img = gpu_contiguous(img)

        # Fixed: the gradient-on-weights op is named GpuDnnConv3dGradW;
        # the previous name GpuDnnConvGrad3dW does not exist in this file.
        d_kerns = GpuDnnConv3dGradW()(img, top, gpu_alloc_empty(*kerns.shape),
                                      desc)
        d_top = GpuDnnConv3d()(img, kerns, gpu_alloc_empty(*top.shape), desc)
        d_alpha = grad_not_implemented(self, 4, alpha)
        d_beta = grad_not_implemented(self, 5, beta)
        d_nb_dim = grad_not_implemented(self, 6, nb_dim)

        return (d_kerns * alpha, d_top * alpha, img * beta,
                DisconnectedType()(), d_alpha, d_beta, d_nb_dim)

    def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None,
                  nb_dim=None):
        kern = as_cuda_ndarray_variable(kern)
        topgrad = as_cuda_ndarray_variable(topgrad)
        output = as_cuda_ndarray_variable(output)
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        # Fixed: this op works on 5D tensors; the previous check required
        # ndim != 4 while its own message demanded a 5D tensor.
        if output.type.ndim != 5:
            raise TypeError('output must be 5D tensor')

        if not isinstance(desc.type, CDataType) \
                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')

        alpha = ensure_float(alpha, _one, 'alpha')
        beta = ensure_float(beta, _zero, 'beta')
        # 3d convolution works on 5D tensors.
        nb_dim = ensure_int(nb_dim, _ifive, 'nb_dim')

        return Apply(self, [kern, topgrad, output, desc, alpha, beta, nb_dim],
                     [output.type()])
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None): conv_mode='conv', direction_hint=None, workmem=None):
""" """
...@@ -804,6 +1191,179 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -804,6 +1191,179 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return GpuDnnConv(workmem=workmem)(img, kerns, out, desc) return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
             conv_mode='conv', direction_hint=None, workmem=None):
    """
    GPU convolution using cuDNN from NVIDIA.
    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.
    :param img: images to do the convolution over
    :param kerns: convolution filters
    :param border_mode: one of 'valid', 'full'; additionally, the padding size
        could be directly specified by an integer or a pair of integers
    :param subsample: perform subsampling of the output (default: (1, 1))
    :param conv_mode: perform convolution (kernels flipped) or cross-correlation.
        One of 'conv', 'cross'. (default: 'conv')
    :param direction_hint: Used by graph optimizers to change algorithm choice.
        By default, GpuDnnConv will be used to carry out the convolution.
        If border_mode is 'valid', subsample is (1,1) and direction_hint is
        'bprop weights', it will use GpuDnnConvGradW.
        If border_mode is 'full', subsample is (1,1) and direction_hint is
        *not* 'forward!', it will use GpuDnnConvGradI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been warned.
    :param workmem: Specify the amount of working memory allowed.
        More memory is usually faster. One of 'none', 'small' or
        'large'. (default is None which takes its value from
        :attr:`config.dnn.conv.workmem`)
    :warning: The cuDNN library only works with GPU that have a compute
      capability of 3.0 or higer.  This means that older GPU will not
      work with this Op.
    """
    # NOTE(review): this definition repeats the dnn_conv shown earlier in the
    # diff and would shadow it at import time — confirm the duplication is
    # intended before merging.
    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    if (border_mode == 'valid' and subsample == (1, 1) and
        direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConvGradW. We need to set
        # up a suitable 'fake' convolution to compute the gradient for.
        # Swap batch and channel axes so the "weights" of the fake conv are
        # the quantity whose gradient we want.
        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
        if conv_mode == 'conv':
            # We need to flip manually. These 'kerns' are not the kernels
            # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
            kerns = kerns[:, :, ::-1, ::-1]
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        # Output spatial size of a 'valid' convolution: in - kern + 1.
        shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
        shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
        out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
                              shape_i(img, 1, fgraph), shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode='cross')(img.shape, out.shape)
        conv = GpuDnnConvGradW()(img, kerns, out, desc)
        return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3))
    elif (border_mode == 'full' and subsample == (1, 1) and
          direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGradI to compute
        # the full convolution as the backward pass of a valid convolution.
        # We just need to set up a suitable 'fake' valid convolution.
        img = gpu_contiguous(img)  # cudnn v1 and v2 rc3 need contiguous data
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
        # Output spatial size of a 'full' convolution: in + kern - 1.
        shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
        shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
        out = gpu_alloc_empty(shape_i(img, 0, fgraph),
                              shape_i(kerns, 1, fgraph), shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode=conv_mode)(out.shape, kerns.shape)
        return GpuDnnConvGradI()(kerns, img, out, desc)
    # Standard case: We use GpuDnnConv with suitable padding.
    # contig_version will return a gpu_contiguous copy
    # if the img contains negative strides
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img.shape, kerns.shape)
    desc_op = desc.owner.op
    out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
                                       desc_op.border_mode,
                                       desc_op.subsample)
    out = gpu_alloc_empty(*out_shp)
    return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
               conv_mode='conv', direction_hint=None, workmem=None):
    """
    GPU 3d convolution using cuDNN from NVIDIA.

    The memory layout to use is 'bct01', that is 'batch', 'channel',
    'first dim', 'second dim', 'third dim' in that order.

    :param img: images to do the convolution over
    :param kerns: convolution filters
    :param border_mode: one of 'valid', 'full'; additionally, the padding
        size could be directly specified by an integer or a triple of
        integers
    :param subsample: perform subsampling of the output (default: (1, 1, 1))
    :param conv_mode: perform convolution (kernels flipped) or
        cross-correlation. One of 'conv', 'cross'. (default: 'conv')
    :param direction_hint: Used by graph optimizers to change algorithm choice.
        By default, GpuDnnConv3d will be used to carry out the convolution.
        If border_mode is 'valid', subsample is (1,1,1) and direction_hint is
        'bprop weights', it will use GpuDnnConv3dGradW.
        If border_mode is 'full', subsample is (1,1,1) and direction_hint is
        *not* 'forward!', it will use GpuDnnConvGrad3dI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been warned.
    :param workmem: Specify the amount of working memory allowed.
        More memory is usually faster. One of 'none', 'small' or
        'large'. (default is None which takes its value from
        :attr:`config.dnn.conv.workmem`). Note that GpuDnnConv3d currently
        forces 'none' whatever is passed here.

    :warning: The cuDNN library only works with GPU that have a compute
      capability of 3.0 or higher. This means that older GPU will not
      work with this Op.
    :warning: dnn_conv3d only works with cuDNN library 3.0
    """
    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    if (border_mode == 'valid' and subsample == (1, 1, 1) and
            direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConv3dGradW. We need to set
        # up a suitable 'fake' convolution to compute the gradient for.
        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
        if conv_mode == 'conv':
            # We need to flip manually. These 'kerns' are not the kernels
            # that would be flipped by conv_mode='conv' in GpuDnnConv3dGradW.
            kerns = kerns[:, :, ::-1, ::-1, ::-1]
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
        shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
        shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
        shape4 = shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1
        # Fixed: the last dimension previously reused shape3 instead of
        # shape4, allocating a wrongly-shaped output buffer.
        out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
                              shape_i(img, 1, fgraph), shape2, shape3, shape4)
        desc = GpuDnnConv3dDesc(border_mode='valid', subsample=(1, 1, 1),
                                conv_mode='cross')(img.shape, out.shape)
        conv = GpuDnnConv3dGradW()(img, kerns, out, desc)
        return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3, 4))
    elif (border_mode == 'full' and subsample == (1, 1, 1) and
          direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGrad3dI to compute
        # the full convolution as the backward pass of a valid convolution.
        # We just need to set up a suitable 'fake' valid convolution.
        img = gpu_contiguous(img)  # cudnn v1 and v2 rc3 need contiguous data
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
        shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
        shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
        shape4 = shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1
        out = gpu_alloc_empty(shape_i(img, 0, fgraph),
                              shape_i(kerns, 1, fgraph), shape2, shape3, shape4)
        desc = GpuDnnConv3dDesc(border_mode='valid', subsample=(1, 1, 1),
                                conv_mode=conv_mode)(out.shape, kerns.shape)
        # Fixed: the gradient-on-inputs op is named GpuDnnConvGrad3dI;
        # GpuDnnConv3dGradI does not exist in this file.
        return GpuDnnConvGrad3dI()(kerns, img, out, desc)
    # Standard case: We use GpuDnnConv3d with suitable padding.
    # contig_version will return a gpu_contiguous copy
    # if the img contains negative strides
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = GpuDnnConv3dDesc(border_mode=border_mode, subsample=subsample,
                            conv_mode=conv_mode)(img.shape, kerns.shape)
    desc_op = desc.owner.op
    out_shp = GpuDnnConv3d.get_out_shape(img.shape, kerns.shape,
                                         desc_op.border_mode,
                                         desc_op.subsample)
    out = gpu_alloc_empty(*out_shp)
    return GpuDnnConv3d(workmem=workmem)(img, kerns, out, desc)
class GpuDnnPoolDesc(GpuOp): class GpuDnnPoolDesc(GpuOp):
""" """
This Op builds a pooling descriptor for use in the other This Op builds a pooling descriptor for use in the other
......
...@@ -33,6 +33,60 @@ c_set_tensor4d(CudaNdarray *var, cudnnTensorDescriptor_t desc) { ...@@ -33,6 +33,60 @@ c_set_tensor4d(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
return 0; return 0;
} }
static int
c_set_tensorNd(CudaNdarray *var, int dim, cudnnTensorDescriptor_t desc) {
  /* Fill the cudnn tensor descriptor `desc` from `var`'s dims/strides.
     Returns 0 on success, -1 (with a Python error set) on failure. */
  int strides[dim];
  for (int i = 0; i < dim; ++i)
  {
    if (CudaNdarray_HOST_STRIDES(var)[i])
      strides[i] = CudaNdarray_HOST_STRIDES(var)[i];
    else
    {
      /* Stride is 0 (presumably a broadcastable dim -- confirm with
         CudaNdarray semantics): substitute the C-contiguous stride,
         since cudnn requires a fully-specified positive stride array. */
      strides[i] = 1;
      for (int j = i + 1; j < dim; ++j)
        strides[i] *= CudaNdarray_HOST_DIMS(var)[j];
    }
  }
  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
                                                 CudaNdarray_HOST_DIMS(var),
                                                 strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* Fixed: the two string literals previously concatenated to
       "...: %sdim=%d" with no separator, producing a garbled message. */
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s, dim=%d",
                 cudnnGetErrorString(err), dim);
    return -1;
  }
  return 0;
}
static int
c_set_filterNd(CudaNdarray *var, int dim, cudnnFilterDescriptor_t desc) {
  /* Fill the cudnn filter descriptor `desc` from `var`'s dims.
     Only dims are passed (no strides), so the filter must be C-contiguous.
     Returns 0 on success, -1 (with a Python error set) on failure. */
  if (!CudaNdarray_is_c_contiguous(var)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
                                                 CudaNdarray_HOST_DIMS(var));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set filter descriptor: %s."
                 " dims= %d",
                 cudnnGetErrorString(err), dim);
    return -1;
  }
  return 0;
}
static int static int
c_set_filter(CudaNdarray *var, cudnnFilterDescriptor_t desc) { c_set_filter(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
if (!CudaNdarray_is_c_contiguous(var)) { if (!CudaNdarray_is_c_contiguous(var)) {
......
...@@ -7,9 +7,9 @@ cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns); ...@@ -7,9 +7,9 @@ cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
and the algorithms, if any, that were selected according to these dimensions and the algorithms, if any, that were selected according to these dimensions
and according to the amount of memory available at that time. and according to the amount of memory available at that time.
*/ */
int APPLY_SPECIFIC(previous_input_shape)[4]; int APPLY_SPECIFIC(previous_input_shape)[5];
int APPLY_SPECIFIC(previous_kerns_shape)[4]; int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[4]; int APPLY_SPECIFIC(previous_output_shape)[5];
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo); cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo); cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo); cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
...@@ -21,12 +21,12 @@ APPLY_SPECIFIC(input) = NULL; ...@@ -21,12 +21,12 @@ APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL; APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL; APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
...@@ -36,7 +36,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) ...@@ -36,7 +36,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns)))
FAIL; FAIL;
} }
for (int i = 0; i < 4; i++) for (int i = 0; i < 5; i++)
{ {
APPLY_SPECIFIC(previous_input_shape)[i] = 0; APPLY_SPECIFIC(previous_input_shape)[i] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[i] = 0; APPLY_SPECIFIC(previous_kerns_shape)[i] = 0;
......
...@@ -3,7 +3,8 @@ ...@@ -3,7 +3,8 @@
int int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CudaNdarray *om, cudnnConvolutionDescriptor_t desc, CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **output) { float alpha, float beta, int nb_dim, CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) { if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -11,37 +12,49 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -11,37 +12,49 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filterNd(kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
/* if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
/* if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
Py_XDECREF(*output); Py_XDECREF(*output);
*output = om; *output = om;
Py_INCREF(*output); Py_INCREF(*output);
#else #else
if (CudaNdarray_prep_output(output, 4, CudaNdarray_HOST_DIMS(om)) != 0) if (CudaNdarray_prep_output(output, nb_dim, CudaNdarray_HOST_DIMS(om)) != 0)
return 1; return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(*output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
/* if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
{ {
size_t worksize; size_t worksize;
void *workspace; void *workspace;
cudnnConvolutionFwdAlgo_t chosen_algo; cudnnConvolutionFwdAlgo_t chosen_algo;
for (int i = 0; (i < nb_dim); i++)
std::cout << i << "/" << nb_dim << ", "
<< CudaNdarray_HOST_DIMS(input)[i] << ", "
<< CudaNdarray_HOST_DIMS(kerns)[i] << std::endl;
if (CHOOSE_ALGO) if (CHOOSE_ALGO)
{ {
// Check if the input and the kernels have the same shape as they have // Check if the input and the kernels have the same shape as they have
// last time the apply node was executed // last time the apply node was executed
bool same_shapes = true; bool same_shapes = true;
for (int i = 0; (i < 4) && same_shapes; i++) for (int i = 0; (i < nb_dim) && same_shapes; i++)
{ {
same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] != same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] !=
APPLY_SPECIFIC(previous_input_shape)[i]); APPLY_SPECIFIC(previous_input_shape)[i]);
...@@ -115,7 +128,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -115,7 +128,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// Store the shapes of the inputs and kernels as well as the chosen // Store the shapes of the inputs and kernels as well as the chosen
// algorithm for future use. // algorithm for future use.
APPLY_SPECIFIC(previous_algo) = chosen_algo; APPLY_SPECIFIC(previous_algo) = chosen_algo;
for (int i = 0; i < 4; i++) for (int i = 0; i < nb_dim; i++)
{ {
APPLY_SPECIFIC(previous_input_shape)[i] = APPLY_SPECIFIC(previous_input_shape)[i] =
CudaNdarray_HOST_DIMS(input)[i]; CudaNdarray_HOST_DIMS(input)[i];
...@@ -142,7 +155,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -142,7 +155,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// If the chosen implementation is FFT, validate that it can be used // If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it // on the current data and default on a safe implementation if it
// can't. // can't.
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) // Following code is 2d-specific, but it is fine as ftt is define only for 2d-filters
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT && nb_dim == 4)
{ {
// Extract the properties of the convolution descriptor // Extract the properties of the convolution descriptor
...@@ -186,12 +200,12 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -186,12 +200,12 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
chosen_algo, chosen_algo,
&worksize); &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
std::cout << "here" << std::endl;
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s", "GpuDnnConv: error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
workspace = get_work_mem(worksize); workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0) if (workspace == NULL && worksize != 0)
return 1; return 1;
...@@ -208,6 +222,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -208,6 +222,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output)); APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
} }
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
std::cout << "here2" << std::endl;
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
CudaNdarray *im, cudnnConvolutionDescriptor_t desc, CudaNdarray *im, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **input) { float alpha, float beta, int nb_dim, CudaNdarray **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(im)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) { if (CudaNdarray_HOST_DIMS(im)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
...@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
return 1; return 1;
} }
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) /* if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
/* if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
if (c_set_tensorNd(output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filterNd(kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
...@@ -22,13 +27,16 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -22,13 +27,16 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
*input = im; *input = im;
Py_INCREF(*input); Py_INCREF(*input);
#else #else
if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0) if (CudaNdarray_prep_output(input, nb_dim, CudaNdarray_HOST_DIMS(im)) != 0)
return 1; return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) /* if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
if (c_set_tensorNd(*input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
{ {
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
CudaNdarray *km, cudnnConvolutionDescriptor_t desc, CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **kerns) { float alpha, float beta, int nb_dim, CudaNdarray **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) { if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) {
...@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) /* if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
/* if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
if (c_set_tensorNd(input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
...@@ -22,13 +27,15 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -22,13 +27,15 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
*kerns = km; *kerns = km;
Py_INCREF(*kerns); Py_INCREF(*kerns);
#else #else
if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0) if (CudaNdarray_prep_output(kerns, nb_dim, CudaNdarray_HOST_DIMS(km)) != 0)
return 1; return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
return 1; return 1;
#endif #endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) /* if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
if (c_set_filterNd(*kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
{ {
......
...@@ -13,6 +13,7 @@ from theano.tensor.signal.downsample import max_pool_2d ...@@ -13,6 +13,7 @@ from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
import theano.sandbox.cuda.dnn as dnn import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
...@@ -763,6 +764,58 @@ def test_dnn_conv_grad(): ...@@ -763,6 +764,58 @@ def test_dnn_conv_grad():
utt.verify_grad(dconvw, [img_val, kern_val, out_val]) utt.verify_grad(dconvw, [img_val, kern_val, out_val])
def test_conv3d_valid():
    """Compare dnn_conv3d in 'valid' mode against the reference conv3D.

    Skipped when cudnn (with 3d convolution support) is unavailable.
    """
    # Fixes vs. original: removed leftover Python-2 debug prints, used the
    # standard dnn_available.msg skip message (the old one had a stray quote),
    # and dropped an exact duplicate of one parameter set.
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)

    def run_conv3d_valid(inputs_shape, filters_shape,
                         subsample=(1, 1, 1)):
        # Run one case: build random inputs/filters, evaluate both the
        # reference CPU conv3D and the cudnn conv3d, and compare results.
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = shared(inputs_val)
        filters = shared(filters_val)
        # conv3D requires an explicit bias; zeros make it a no-op.
        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))

        # Reference conv3D uses channel-last layout (b, d, h, w, c),
        # hence the dimshuffles on input/filters and on its output.
        conv_ref = theano.tensor.nnet.conv3D(V=inputs.dimshuffle(0, 2, 3, 4, 1),
                                             W=filters.dimshuffle(0, 2, 3, 4, 1),
                                             b=bias, d=subsample)
        # conv_mode='cross' matches conv3D, which does not flip the filters.
        conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
                              border_mode="valid", subsample=subsample,
                              conv_mode='cross')

        f_ref = theano.function([], conv_ref.dimshuffle(0, 4, 1, 2, 3))
        f = theano.function([], conv, mode=mode_with_gpu)

        utt.assert_allclose(f_ref(), f())

    run_conv3d_valid(inputs_shape=(128, 3, 5, 5, 5),
                     filters_shape=(64, 3, 1, 2, 4))
    run_conv3d_valid(inputs_shape=(16, 4, 20, 12, 15),
                     filters_shape=(10, 4, 6, 12, 4),
                     subsample=(2, 2, 2))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(3, 3, 3))
    run_conv3d_valid(inputs_shape=(16, 2, 20, 12, 15),
                     filters_shape=(10, 2, 6, 12, 4),
                     subsample=(3, 3, 3))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(3, 2, 1))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(1, 2, 3))
def test_version(): def test_version():
if not cuda.dnn.dnn_available(): if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论