Commit 709c14cc authored by Frédéric Bastien

Merge pull request #3091 from nouiz/abergeron-gpuarray_dnn

gpuarray in new back-end
@@ -56,7 +56,7 @@ if pygpu:
         init_dev(config.device)
         import theano.compile
         theano.compile.shared_constructor(gpuarray_shared_constructor)
-        optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
+        optdb.add_tags('gpuarray', 'fast_run', 'fast_compile')
 elif config.gpuarray.init_device != '':
     init_dev(config.gpuarray.init_device)
......
@@ -783,6 +783,10 @@ if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
         raise NotImplementedError("grad disabled")
+def empty_like(var):
+    return GpuAllocEmpty(var.type.dtype)(*var.shape)
 class GpuContiguous(Op):
     """
     Always return a c contiguous output. Copy the input only if it is
......
@@ -29,12 +29,12 @@ class NVCC_compiler(NVCC_base):
 # exist in the past
 numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
 if bool(numpy_ver < [1, 7]):
-    flags.append("-D NPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
-    flags.append("-D NPY_ARRAY_ALIGNED=NPY_ALIGNED")
-    flags.append("-D NPY_ARRAY_WRITEABLE=NPY_WRITEABLE")
-    flags.append("-D NPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL")
-    flags.append("-D NPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS")
-    flags.append("-D NPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS")
+    flags.append("-DNPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
+    flags.append("-DNPY_ARRAY_ALIGNED=NPY_ALIGNED")
+    flags.append("-DNPY_ARRAY_WRITEABLE=NPY_WRITEABLE")
+    flags.append("-DNPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL")
+    flags.append("-DNPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS")
+    flags.append("-DNPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS")
 # If the user didn't specify architecture flags add them
 if not any(['-arch=sm_' in f for f in flags]):
......
#ifndef CUDNN_HELPER_H
#define CUDNN_HELPER_H
#include <cudnn.h>
#ifndef CUDNN_VERSION
#include <assert.h>
// Here we define the R2 API in terms of functions in the R1 interface
// This is only for what we use
static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
switch (err) {
case CUDNN_STATUS_SUCCESS:
return "The operation completed successfully.";
case CUDNN_STATUS_NOT_INITIALIZED:
return "The handle was not initialized(Is your driver recent enought?).";
case CUDNN_STATUS_ALLOC_FAILED:
return "Ressource allocation failed inside the library.";
case CUDNN_STATUS_BAD_PARAM:
return "An incorrect value was passed in.";
case CUDNN_STATUS_ARCH_MISMATCH:
return "The current GPU does not support the required features (only cc 3.0+ are supported).";
case CUDNN_STATUS_MAPPING_ERROR:
return "An access to GPU memory space failed (probably due to a failure to bind texture).";
case CUDNN_STATUS_EXECUTION_FAILED:
return "A kernel failed to execute.";
case CUDNN_STATUS_INTERNAL_ERROR:
return "An internal cuDNN operation failed.";
case CUDNN_STATUS_NOT_SUPPORTED:
return "The combination of parameters is not currently supported.";
default:
return "Unknown error code.";
}
}
// some macros to help support cudnn R1 while using R2 code.
#define cudnnCreateTensorDescriptor cudnnCreateTensor4dDescriptor
#define cudnnDestroyTensorDescriptor cudnnDestroyTensor4dDescriptor
#define cudnnSetFilter4dDescriptor cudnnSetFilterDescriptor
typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
static inline cudnnStatus_t
cudnnGetConvolution2dForwardOutputDim(
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t inputTensorDesc,
const cudnnFilterDescriptor_t filterDesc,
int *n,
int *c,
int *h,
int *w) {
return cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD,
n, c, h, w);
}
typedef int cudnnConvolutionFwdAlgo_t;
typedef int cudnnConvolutionFwdPreference_t;
#define CUDNN_CONVOLUTION_FWD_NO_WORKSPACE 0
static inline cudnnStatus_t
cudnnGetConvolutionForwardAlgorithm(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t destDesc,
cudnnConvolutionFwdPreference_t preference,
size_t memoryLimitInbytes,
cudnnConvolutionFwdAlgo_t *algo) {
*algo = 0;
return CUDNN_STATUS_SUCCESS;
}
static inline cudnnStatus_t
cudnnGetConvolutionForwardWorkspaceSize(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensor4dDescriptor_t destDesc,
cudnnConvolutionFwdAlgo_t algo,
size_t *sizeInBytes) {
*sizeInBytes = 0;
return CUDNN_STATUS_SUCCESS;
}
static inline cudnnStatus_t
cudnnConvolutionForward_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnFilterDescriptor_t filterDesc,
const void *filterData,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionFwdAlgo_t algo,
void *workSpace,
size_t workSpaceSizeInBytes,
const void *beta,
const cudnnTensorDescriptor_t destDesc,
void *destData) {
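  /* Note: the R1 interface has no alpha/beta scaling; it can only
     overwrite the destination or accumulate into it. Hence alpha must
     be 1.0 and beta is restricted to 0.0 (overwrite) or 1.0 (accumulate)
     in these wrappers. */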
assert(*(float *)alpha == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionForward(handle, srcDesc, srcData,
filterDesc, filterData,
convDesc, destDesc, destData,
r);
}
#define cudnnConvolutionForward cudnnConvolutionForward_v2
static inline cudnnStatus_t
cudnnConvolutionBackwardFilter_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
const void *beta,
const cudnnFilterDescriptor_t gradDesc,
void *gradData) {
assert(*(float *)alpha == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
diffDesc, diffData,
convDesc, gradDesc, gradData,
r);
}
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
static inline cudnnStatus_t
cudnnConvolutionBackwardData_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnFilterDescriptor_t filterDesc,
const void *filterData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
const void *beta,
const cudnnTensorDescriptor_t gradDesc,
void *gradData) {
assert(*(float *)alpha == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
/* This function needs the casting because its params are not
declared as const */
return cudnnConvolutionBackwardData(handle,
(cudnnFilterDescriptor_t)filterDesc,
filterData,
(cudnnTensorDescriptor_t)diffDesc,
diffData,
(cudnnConvolutionDescriptor_t)convDesc,
(cudnnTensorDescriptor_t)gradDesc,
gradData,
r);
}
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
//Needed for R2 rc2
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
#else
// r2 rc1 and rc2 do not have the same macro defined
// I didn't check if this is the right combination, but as we do not wrap the padding interface, it is fine for now.
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ((cudnnPoolingMode_t)1)
#endif
#endif
import os
import numpy
import theano
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.configparser import AddConfigVar, EnumStr, StrParam
from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.signal.downsample import (
DownsampleFactorMax, DownsampleFactorMaxGrad)
from . import pygpu, init_dev
from .basic_ops import (as_gpuarray_variable,
gpu_contiguous, HostFromGpu,
GpuAllocEmpty, empty_like)
from .conv import GpuConv
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge
# This is to avoid conflict with the one in cuda/dnn.py
if not hasattr(config, 'dnn'):
AddConfigVar('dnn.conv.workmem',
"Default value for the workmem attribute of cudnn "
"convolutions.",
EnumStr('small', 'none', 'large'),
in_c_key=False)
AddConfigVar('dnn.include_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'include')))
AddConfigVar('dnn.library_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
def dnn_available():
if dnn_available.avail is not None:
return dnn_available.avail
if pygpu is None:
dnn_available.msg = "PyGPU not available"
dnn_available.avail = False
return False
if not init_dev.device.startswith('cuda'):
dnn_available.msg = "Not on a CUDA device. Got %s." % init_dev.device
dnn_available.avail = False
return False
    # This is a hack because bin_id is in the form of
# "sm_<major><minor>" for cuda devices.
    if pygpu.get_default_context().bin_id < 'sm_30':
        dnn_available.msg = "Device not supported by cuDNN"
        dnn_available.avail = False
        return False
preambule = """
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn_helper.h>
"""
body = """
cudnnHandle_t _handle = NULL;
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
fprintf(stderr, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
return 1;
}
"""
    # Do not run the test program here. It would run on the
    # default gpu, not the one selected by the user. If mixed
    # GPUs are installed or if the GPUs are configured in
    # exclusive mode, this causes bad detection.
comp, out, err = GCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__),
"-I" + config.dnn.include_path,
"-L" + config.dnn.library_path],
preambule=preambule, body=body,
try_run=False, output=True)
dnn_available.avail = comp
if not dnn_available.avail:
dnn_available.msg = (
"Theano cannot compile with cuDNN. We got this error:\n" +
str(err))
else:
# If we can compile, check that we can import and run.
v = version()
if isinstance(v, tuple) and v[0] != v[1]:
dnn_available.avail = False
dnn_available.msg = ("Mixed dnn version. The header is"
" from one version, but we link with"
" a different version %s" % str(v))
raise RuntimeError(dnn_available.msg)
if version() == (20, 20):
dnn_available.avail = False
dnn_available.msg = (
"You have installed a release candidate of CuDNN v2."
" This isn't supported anymore."
" Update to CuDNN v2 final version.")
raise RuntimeError(dnn_available.msg)
return dnn_available.avail
dnn_available.avail = None
dnn_available.msg = None
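# A minimal usage sketch (illustration only, not part of this commit):
# callers gate cuDNN-specific code on dnn_available() and report the
# cached failure reason from dnn_available.msg, e.g.:
#
#     if not dnn_available():
#         raise RuntimeError("cuDNN unusable: %s" % dnn_available.msg)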
def c_set_tensor4d(var, desc, err, fail):
return """
{
cudnnDataType_t dt;
size_t ds;
switch (%(var)s->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d");
return -1;
}
ds = gpuarray_get_elsize(%(var)s->ga.typecode);
int str0, str1, str2, str3;
// cudnn does not like 0s in strides
str3 = PyGpuArray_STRIDES(%(var)s)[3]?PyGpuArray_STRIDES(%(var)s)[3]/ds:1;
str2 = PyGpuArray_STRIDES(%(var)s)[2]?PyGpuArray_STRIDES(%(var)s)[2]/ds:PyGpuArray_DIMS(%(var)s)[3];
str1 = PyGpuArray_STRIDES(%(var)s)[1]?PyGpuArray_STRIDES(%(var)s)[1]/ds:PyGpuArray_DIMS(%(var)s)[2]*PyGpuArray_DIMS(%(var)s)[3];
str0 = PyGpuArray_STRIDES(%(var)s)[0]?PyGpuArray_STRIDES(%(var)s)[0]/ds:PyGpuArray_DIMS(%(var)s)[2]*PyGpuArray_DIMS(%(var)s)[3]*PyGpuArray_DIMS(%(var)s)[1];
%(err)s = cudnnSetTensor4dDescriptorEx(
%(desc)s, dt,
PyGpuArray_DIMS(%(var)s)[0],
PyGpuArray_DIMS(%(var)s)[1],
PyGpuArray_DIMS(%(var)s)[2],
PyGpuArray_DIMS(%(var)s)[3],
str0, str1, str2, str3);
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"could not set tensor4d descriptor: %%s",
cudnnGetErrorString(%(err)s));
%(fail)s
}
}
""" % dict(var=var, err=err, desc=desc, fail=fail)
class DnnBase(COp):
"""
Creates a handle for cudnn and pulls in the cudnn libraries and headers.
"""
# dnn does not know about broadcasting, so we do not need to assert
# the input broadcasting pattern.
check_broadcast = False
def __init__(self):
COp.__init__(self, "dnn_base.c")
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h', 'gpuarray_helper.h',
'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/util.h',
'gpuarray_api.h', 'numpy_compat.h']
def c_header_dirs(self):
return [os.path.dirname(__file__), pygpu.get_include(),
config.dnn.include_path]
def c_libraries(self):
return ['cudnn', 'gpuarray']
def c_lib_dirs(self):
return [config.dnn.library_path]
class DnnVersion(Op):
def c_headers(self):
return ['cudnn.h']
def c_header_dirs(self):
return [config.dnn.include_path]
def c_libraries(self):
return ['cudnn']
def c_lib_dirs(self):
return [config.dnn.library_path]
def c_support_code(self):
return """
#if PY_MAJOR_VERSION >= 3
#define PyInt_FromLong PyLong_FromLong
#endif
"""
def make_node(self):
return Apply(self, [], [Generic()()])
def c_code(self, node, name, inputs, outputs, sub):
o = outputs[0]
return """
#if defined(CUDNN_VERSION)
%(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
#else
%(o)s = PyInt_FromLong(-1);
#endif
""" % locals()
def do_constant_folding(self, node):
# Needed as we do not want to cache this information.
return False
def c_code_cache_version(self):
# Not needed, but make it clear that we do not want to cache this.
return None
def version():
"""return the current cuDNN version we compile with.
This return a tuple with the header version and the library
version we link with. For older cudnn version without version
information, we return -1.
"""
if not dnn_available():
raise Exception(
"We can't determine the cudnn version as it is not available",
dnn_available.msg)
if version.v is None:
f = theano.function([], DnnVersion()(),
theano.Mode(optimizer=None),
profile=False)
version.v = f()
return version.v
version.v = None
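# Illustration: version() returns the (CUDNN_VERSION, cudnnGetVersion())
# pair on cuDNN R2 and later, and -1 on R1 releases that predate version
# reporting; dnn_available() above uses the pair to detect header/library
# mismatches.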
class GpuDnnConvDesc(Op):
"""This Op builds a convolution descriptor for use in the other
convolution operations.
    See the doc of :func:`dnn_conv` for a description of the parameters.
"""
__props__ = ('border_mode', 'subsample', 'conv_mode')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
def c_header_dirs(self):
return [os.path.dirname(__file__), config.dnn.include_path]
def c_libraries(self):
return ['cudnn']
def c_lib_dirs(self):
return [config.dnn.library_path]
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
if isinstance(border_mode, int):
border_mode = (border_mode, border_mode)
if isinstance(border_mode, tuple):
pad_h, pad_w = map(int, border_mode)
border_mode = (pad_h, pad_w)
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full')):
raise ValueError(
'invalid border_mode {}, which must be either '
'"valid", "full", an integer or a pair of'
' integers'.format(border_mode))
self.border_mode = border_mode
assert len(subsample) == 2
self.subsample = subsample
assert conv_mode in ('conv', 'cross')
self.conv_mode = conv_mode
def make_node(self, img_shape, kern_shape):
if img_shape.type.ndim != 1 or img_shape.type.dtype != 'int64':
raise TypeError('img must be 1D shape tensor')
if kern_shape.type.ndim != 1 or kern_shape.type.dtype != 'int64':
raise TypeError('kern must be 1D shape tensor')
return Apply(self, [img_shape, kern_shape],
[CDataType("cudnnConvolutionDescriptor_t")()])
def c_code(self, node, name, inputs, outputs, sub):
img_shape, kern_shape = inputs
desc, = outputs
if isinstance(self.border_mode, tuple):
pad_h_spec, pad_w_spec = map(int, self.border_mode)
assert pad_h_spec >= 0 and pad_w_spec >= 0
bmode = 2
else:
pad_h_spec = pad_w_spec = 0
if self.border_mode == "valid":
bmode = 1
else:
assert self.border_mode == "full"
bmode = 0
if self.conv_mode == 'conv':
conv_flag = 'CUDNN_CONVOLUTION'
else:
conv_flag = 'CUDNN_CROSS_CORRELATION'
return """
{
cudnnStatus_t err;
int pad_h%(name)s;
int pad_w%(name)s;
if ((err = cudnnCreateConvolutionDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %%s", cudnnGetErrorString(err));
%(fail)s
}
if (%(bmode)d == 2) {
pad_h%(name)s = %(pad_h_spec)d;
pad_w%(name)s = %(pad_w_spec)d;
} else if (%(bmode)d == 1) {
pad_h%(name)s = 0;
pad_w%(name)s = 0;
} else if (%(bmode)d == 0) {
pad_h%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2) - 1;
pad_w%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3) - 1;
} else {
PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)s
}
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 20
err = cudnnSetConvolution2dDescriptor(
%(desc)s,
pad_h%(name)s,
pad_w%(name)s,
%(subsx)d, %(subsy)d, 1, 1,
%(conv_flag)s
);
#else
err = cudnnSetConvolutionDescriptorEx(
%(desc)s,
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 0),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 1),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 2),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 3),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 0),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3),
pad_h%(name)s,
pad_w%(name)s,
%(subsx)d, %(subsy)d, 1, 1,
%(conv_flag)s
);
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err));
%(fail)s
}
}
""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
subsx=self.subsample[0], subsy=self.subsample[1],
pad_h_spec=pad_h_spec, pad_w_spec=pad_w_spec)
def c_code_cache_version(self):
return (1, version())
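# Construction sketch (illustration only): the descriptor is built from
# the symbolic shapes of its operands, e.g.
#     desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
#                           conv_mode='conv')(img.shape, kerns.shape)
# as done in dnn_conv() below.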
# scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float64'))
_one = constant(numpy.asarray(1.0, dtype='float64'))
def ensure_dt(val, default, name, dtype):
if val is None:
val = default.clone()
if not isinstance(val, Variable):
val = constant(val)
if hasattr(val, 'ndim') and val.ndim == 0:
val = as_scalar(val)
if not isinstance(val.type, theano.scalar.Scalar):
raise TypeError("%s: expected a scalar value" % (name,))
if not val.type.dtype == dtype:
val = val.astype(dtype)
return val
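# Example (illustration only): ensure_dt(None, _one, 'alpha', 'float32')
# clones the float64 default and casts it, yielding a float32 scalar
# constant 1.0; a non-scalar value raises TypeError.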
class GpuDnnConv(DnnBase, COp):
"""
The forward convolution.
:param image:
:param kernel:
:param descr: the convolution descriptor
"""
__props__ = ('workmem', 'inplace')
def __init__(self, workmem=None, inplace=False):
"""
:param workmem: either 'none', 'small' or 'large'. Default is
the value of :attr:`config.dnn.conv.workmem`.
"""
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)")
if workmem is None:
workmem = config.dnn.conv.workmem
self.workmem = workmem
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
assert self.workmem in ['none', 'small', 'large']
def get_op_params(self):
if self.inplace:
inpl_def = [('CONV_INPLACE', '1')]
else:
inpl_def = []
if version() == -1:
alg_def = ('CONV_ALGO', "0")
else:
if self.workmem == 'none':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
elif self.workmem == 'small':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
elif self.workmem == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
alg_def = ('CONV_ALGO', alg)
return [alg_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img)
kern = as_gpuarray_variable(kern)
output = as_gpuarray_variable(output)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if output.type.ndim != 4:
raise TypeError('output must be a 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
beta = ensure_dt(beta, _zero, 'beta', img.dtype)
return Apply(self, [img, kern, output, desc, alpha, beta],
[output.type()])
def grad(self, inp, grads):
img, kerns, output, desc, alpha, beta = inp
top, = grads
top = gpu_contiguous(top)
d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc)
d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return [d_img * alpha, d_kerns * alpha, top * beta,
DisconnectedType()(), d_alpha, d_beta]
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1], [1]]
@staticmethod
def get_out_shape(ishape, kshape, border_mode, subsample):
"""
This function computes the output shape for a convolution with
the specified parameters. `ishape` and `kshape` can be symbolic
or scalar.
"""
b = ishape[0] # Number of inputs
h = ishape[2] # Height of input feature maps
w = ishape[3] # Width of input feature maps
nb = kshape[0] # Number of output feature maps
kh = kshape[2] # Height of each filter
kw = kshape[3] # Width of each filter
sh, sw = subsample
if border_mode == 'full':
padh = kh - 1
padw = kw - 1
elif isinstance(border_mode, tuple):
padh, padw = border_mode
else:
assert border_mode == 'valid'
padh = 0
padw = 0
return (
b, nb,
(h + 2 * padh - kh) // sh + 1,
(w + 2 * padw - kw) // sw + 1
)
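        # Worked example (illustration): for ishape=(64, 3, 32, 32),
        # kshape=(16, 3, 5, 5), border_mode='valid' and subsample=(1, 1),
        # padh = padw = 0 and each spatial dim is (32 - 5) // 1 + 1 = 28,
        # giving (64, 16, 28, 28).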
def infer_shape(self, node, shape):
return [shape[2]]
class GpuDnnConvGradW(DnnBase, COp):
"""
The convolution gradient with respect to the weights.
:param image:
:param kernel:
:param descr: the convolution descriptor
"""
__props__ = ('inplace',)
def __init__(self, inplace=False):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)")
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'inplace'):
self.inplace = False
def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp
kerns, = grads
kerns = gpu_contiguous(kerns)
d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc)
d_top = GpuDnnConv()(img, kerns, empty_like(top), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_img * alpha, d_top * alpha, kerns * beta,
DisconnectedType()(), d_alpha, d_beta)
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self):
if self.inplace:
return [('CONV_INPLACE', '1')]
else:
return []
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img)
topgrad = as_gpuarray_variable(topgrad)
output = as_gpuarray_variable(output)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor')
if output.type.ndim != 4:
raise TypeError('output must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
beta = ensure_dt(beta, _zero, 'beta', img.dtype)
return Apply(self, [img, topgrad, output, desc, alpha, beta],
[output.type()])
def infer_shape(self, node, shape):
return [shape[2]]
class GpuDnnConvGradI(DnnBase):
"""
The convolution gradient with respect to the inputs.
:param image:
:param kernel:
:param descr: the convolution descriptor
"""
__props__ = ('inplace',)
def __init__(self, inplace=False):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)")
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
def grad(self, inp, grads):
kerns, top, output, desc, alpha, beta = inp
img, = grads
img = gpu_contiguous(img)
d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc)
d_top = GpuDnnConv()(img, kerns, empty_like(top), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_kerns * alpha, d_top * alpha, img * beta,
DisconnectedType()(), d_alpha, d_beta)
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1], [1]]
def get_op_params(self):
if self.inplace:
return [('CONV_INPLACE', '1')]
else:
return []
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_gpuarray_variable(kern)
topgrad = as_gpuarray_variable(topgrad)
output = as_gpuarray_variable(output)
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor')
if output.type.ndim != 4:
raise TypeError('output must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', kern.dtype)
beta = ensure_dt(beta, _zero, 'beta', kern.dtype)
return Apply(self, [kern, topgrad, output, desc, alpha, beta],
[output.type()])
def infer_shape(self, node, shape):
return [shape[2]]
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None):
"""
GPU convolution using cuDNN from NVIDIA.
The memory layout to use is 'bc01', that is 'batch', 'channel',
'first dim', 'second dim' in that order.
:param img: images to do the convolution over
:param kerns: convolution filters
:param border_mode: one of 'valid', 'full'; additionally, the padding size
could be directly specified by an integer or a pair of integers
:param subsample: perform subsampling of the output (default: (1, 1))
:param conv_mode: perform convolution (kernels flipped) or cross-correlation.
One of 'conv', 'cross'. (default: 'conv')
:param direction_hint: Used by graph optimizers to change algorithm choice.
By default, GpuDnnConv will be used to carry out the convolution.
If border_mode is 'valid', subsample is (1,1) and direction_hint is
'bprop weights', it will use GpuDnnConvGradW.
If border_mode is 'full', subsample is (1,1) and direction_hint is
*not* 'forward!', it will use GpuDnnConvGradI.
This parameter is used internally by graph optimizers and may be
removed at any time without a deprecation period. You have been warned.
:param workmem: Specify the amount of working memory allowed.
More memory is usually faster. One of 'none', 'small' or
'large'. (default is None which takes its value from
:attr:`config.dnn.conv.workmem`)
    :warning: The cuDNN library only works with GPUs that have a compute
        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.
"""
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
if (border_mode == 'valid' and subsample == (1, 1) and
direction_hint == 'bprop weights'):
# Special case: We are asked to use GpuDnnConvGradW. We need to set
# up a suitable 'fake' convolution to compute the gradient for.
img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
if conv_mode == 'conv':
# We need to flip manually. These 'kerns' are not the kernels
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = GpuAllocEmpty(img.dtype)(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3))
elif (border_mode == 'full' and subsample == (1, 1) and
direction_hint != 'forward!'):
# Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution.
# We just need to set up a suitable 'fake' valid convolution.
img = gpu_contiguous(img) # cudnn v1 and v2 rc3 need contiguous data
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = GpuAllocEmpty(img.dtype)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(out.shape, kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding.
# contig_version will return a gpu_contiguous copy
# if the img contains negative strides
img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns.shape)
desc_op = desc.owner.op
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
desc_op.subsample)
out = GpuAllocEmpty(img.dtype)(*out_shp)
return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
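# Hypothetical usage sketch (not part of this commit): for 4D gpuarray
# variables img (batch, channel, rows, cols) and kerns
# (nkerns, channel, krows, kcols), a padded, strided cross-correlation
# would be built as
#     y = dnn_conv(img, kerns, border_mode=(1, 1), subsample=(2, 2),
#                  conv_mode='cross')
# and compiled into a theano.function like any other graph output.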
class GpuDnnPoolDesc(Op):
"""
This Op builds a pooling descriptor for use in the other
pooling operations.
    :param ws: window size
:param stride: (dx, dy)
:param mode: 'max', 'average_inc_pad' or 'average_exc_pad'
        The old deprecated name 'average' corresponds to 'average_inc_pad'.
:param pad: (padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
"""
__props__ = ('ws', 'stride', 'mode', 'pad')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
def c_header_dirs(self):
return [os.path.dirname(__file__), config.dnn.include_path]
def c_libraries(self):
return ['cudnn']
def c_lib_dirs(self):
return [config.dnn.library_path]
def do_constant_folding(self, node):
return False
def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)):
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
assert len(ws) == 2
self.ws = ws
        assert len(stride) == 2
        self.stride = stride
        assert len(pad) == 2
        self.pad = pad
if (pad[0] != 0 or pad[1] != 0) and version() == -1:
raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'pad'):
self.pad = (0, 0)
def make_node(self):
if self.pad != (0, 0) and version() == -1:
raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
return Apply(self, [],
[CDataType("cudnnPoolingDescriptor_t")()])
def c_code(self, node, name, inputs, outputs, sub):
desc, = outputs
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
            if version() == -1:
                raise Exception("cudnn v1 does not support average_exc_pad")
        else:
            raise NotImplementedError("Unsupported pooling mode.")
return """
{
cudnnStatus_t err;
if ((err = cudnnCreatePoolingDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate pooling "
"descriptor: %%s", cudnnGetErrorString(err));
%(fail)s
}
#ifndef CUDNN_VERSION
err = cudnnSetPoolingDescriptor(
%(desc)s,
%(mode_flag)s,
%(wsX)d, %(wsY)d,
%(stridex)d, %(stridey)d
);
#else
err = cudnnSetPooling2dDescriptor(
%(desc)s,
%(mode_flag)s,
%(wsX)d, %(wsY)d,
%(padX)d, %(padY)d,
%(stridex)d, %(stridey)d
);
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err));
%(fail)s
}
}
""" % dict(name=name, desc=desc, mode_flag=mode_flag, fail=sub['fail'],
wsX=self.ws[0], wsY=self.ws[1],
stridex=self.stride[0], stridey=self.stride[1],
padX=self.pad[0], padY=self.pad[1])
def c_code_cache_version(self):
return (2, version())
class GpuDnnPool(DnnBase):
"""
Pooling.
:param img: the image 4d tensor.
:param desc: the pooling descriptor.
"""
__props__ = ()
def make_node(self, img, desc):
img = as_gpuarray_variable(img)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
return Apply(self, [img, desc],
[img.type()])
def infer_shape(self, node, shape):
desc = node.inputs[1].owner.op
kh, kw = desc.ws
sh, sw = desc.stride
padh, padw = desc.pad
return [(
shape[0][0],
shape[0][1],
(shape[0][2] + 2 * padh - kh) // sh + 1,
(shape[0][3] + 2 * padw - kw) // sw + 1
)]
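        # Worked example (illustration): a (64, 32, 28, 28) input with
        # ws=(2, 2), stride=(2, 2) and pad=(0, 0) yields
        # (64, 32, (28 - 2) // 2 + 1, (28 - 2) // 2 + 1) == (64, 32, 14, 14).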
def c_support_code_struct(self, node, name):
return """
cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t output%(name)s;
""" % dict(name=name)
def c_init_code_struct(self, node, name, sub):
return """
cudnnStatus_t err%(name)s;
input%(name)s = NULL;
output%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name):
return """
if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
""" % dict(name=name)
def c_code(self, node, name, inputs, outputs, sub):
desc = inputs[1]
out, = outputs
set_in = c_set_tensor4d(inputs[0], "input" + str(name),
'err' + name, sub['fail'])
set_out = c_set_tensor4d(out, "output" + str(name),
'err' + name, sub['fail'])
return """
cudnnStatus_t err%(name)s;
size_t %(out)s_dims[4];
if (!GpuArray_IS_C_CONTIGUOUS(&%(input)s->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
%(set_in)s
cudnnPoolingMode_t mode;
int wsX, wsY, vpad, hpad, strideX, strideY;
#ifndef CUDNN_VERSION
err%(name)s = cudnnGetPoolingDescriptor(
%(desc)s, &mode,
&wsX, &wsY,
&strideX, &strideY);
#else
err%(name)s = cudnnGetPooling2dDescriptor(
%(desc)s, &mode,
&wsX, &wsY,
&vpad, &hpad,
&strideX, &strideY);
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnGetPoolingDescriptor operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
%(out)s_dims[0] = PyGpuArray_DIMS(%(input)s)[0];
%(out)s_dims[1] = PyGpuArray_DIMS(%(input)s)[1];
%(out)s_dims[2] = (PyGpuArray_DIMS(%(input)s)[2] + (vpad*2) - wsX) / strideX + 1;
%(out)s_dims[3] = (PyGpuArray_DIMS(%(input)s)[3] + (hpad*2) - wsY) / strideY + 1;
if (theano_prep_output(&%(out)s, 4, %(out)s_dims, %(input)s->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0) {
%(fail)s
}
%(set_out)s
#ifndef CUDNN_VERSION
err%(name)s = cudnnPoolingForward(
_handle,
%(desc)s,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
%(output_desc)s, PyGpuArray_DEV_DATA(%(out)s)
);
#else
{
const float alpha = 1;
const float beta = 0;
err%(name)s = cudnnPoolingForward(
_handle,
%(desc)s,
&alpha,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
&beta,
%(output_desc)s, PyGpuArray_DEV_DATA(%(out)s)
);
}
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnPoolingForward operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(out=out, desc=desc, fail=sub['fail'],
name=name, set_in=set_in,
set_out=set_out, input=inputs[0],
input_desc="input" + name,
output_desc="output" + name)
def grad(self, inp, grads):
img, desc = inp
grad, = grads
grad = gpu_contiguous(grad)
out = self(img, desc)
g_out = GpuDnnPoolGrad()(img, out, grad, desc)
return g_out, theano.gradient.DisconnectedType()()
def connection_pattern(self, node):
# not connected to desc
return [[1], [0]]
def c_code_cache_version(self):
return (7, version())
class GpuDnnPoolGrad(DnnBase):
"""
The pooling gradient.
:param inp: the input of the pooling.
:param out: the output of the pooling in the forward.
    :param inp_grad: same size as out, holding the corresponding gradient information.
:param desc: The pooling descriptor.
"""
__props__ = ()
def make_node(self, inp, out, inp_grad, desc):
inp = as_gpuarray_variable(inp)
if inp.type.ndim != 4:
raise TypeError('inp must be 4D tensor')
inp_grad = as_gpuarray_variable(inp_grad)
if inp_grad.type.ndim != 4:
raise TypeError('inp_grad must be 4D tensor')
out = as_gpuarray_variable(out)
if out.type.ndim != 4:
raise TypeError('out must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
return Apply(self, [inp, out, inp_grad, desc],
[inp.type()])
def c_support_code_struct(self, node, name):
return """
cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t input_grad%(name)s;
cudnnTensorDescriptor_t output%(name)s;
cudnnTensorDescriptor_t output_grad%(name)s;
""" % dict(name=name)
def c_init_code_struct(self, node, name, sub):
return """
cudnnStatus_t err%(name)s;
input%(name)s = NULL;
input_grad%(name)s = NULL;
output%(name)s = NULL;
output_grad%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(input): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&input_grad%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(input_grad): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(output): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output_grad%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(output_grad): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name):
return """
if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
if (input_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(input_grad%(name)s); }
if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
if (output_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(output_grad%(name)s); }
""" % dict(name=name)
def c_code(self, node, name, inputs, outputs, sub):
        # Here the names 'out' and 'inp' follow the cudnn definition,
        # not this class's definition: cudnn's "input" is the forward
        # pass's output and vice versa, so the unpacking below is
        # swapped relative to make_node().
out, inp, inp_grad, desc = inputs
out_grad, = outputs
set_in = "\n".join([
c_set_tensor4d(inp, "input" + name,
'err' + name, sub['fail']),
c_set_tensor4d(inp_grad, "input_grad" + name,
'err' + name, sub['fail']),
c_set_tensor4d(out, "output" + name,
'err' + name, sub['fail'])
])
set_out = c_set_tensor4d(out, "output_grad" + name,
'err' + name, sub['fail'])
return """
cudnnStatus_t err%(name)s;
if (!GpuArray_IS_C_CONTIGUOUS(&%(input)s->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnPoolGrad: Only contiguous inputs are supported.");
%(fail)s
}
if (!GpuArray_IS_C_CONTIGUOUS(&%(input_grad)s->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnPoolGrad: Only contiguous input gradients are supported.");
%(fail)s
}
if (!GpuArray_IS_C_CONTIGUOUS(&%(output)s->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnPoolGrad: Only contiguous outputs are supported.");
%(fail)s
}
%(set_in)s
if (theano_prep_output(&%(output_grad)s, PyGpuArray_NDIM(%(output)s),
PyGpuArray_DIMS(%(output)s), %(output)s->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0)
{
%(fail)s
}
%(set_out)s
#ifndef CUDNN_VERSION
err%(name)s = cudnnPoolingBackward(
_handle,
%(desc)s,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
%(input_grad_desc)s, PyGpuArray_DEV_DATA(%(input_grad)s),
%(output_desc)s, PyGpuArray_DEV_DATA(%(output)s),
%(output_grad_desc)s, PyGpuArray_DEV_DATA(%(output_grad)s)
);
#else
{
const float alpha = 1;
const float beta = 0;
err%(name)s = cudnnPoolingBackward(
_handle,
%(desc)s,
&alpha,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
%(input_grad_desc)s, PyGpuArray_DEV_DATA(%(input_grad)s),
%(output_desc)s, PyGpuArray_DEV_DATA(%(output)s),
&beta,
%(output_grad_desc)s, PyGpuArray_DEV_DATA(%(output_grad)s)
);
}
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPoolGrad: error doing operation: %%s.",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(output_grad=out_grad, desc=desc,
fail=sub['fail'],
name=name, set_in=set_in,
set_out=set_out, input=inp, input_grad=inp_grad, output=out,
input_desc="input" + name,
input_grad_desc="input_grad" + name,
output_desc="output" + name,
output_grad_desc="output_grad" + name)
def c_code_cache_version(self):
return (5, version())
def infer_shape(self, node, shape):
return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
"""
GPU pooling using cuDNN from NVIDIA.
The memory layout to use is 'bc01', that is 'batch', 'channel',
'first dim', 'second dim' in that order.
:param img: images to do the pooling over
:param ws: subsampling window size
:param stride: subsampling stride (default: (1, 1))
    :param mode: one of 'max', 'average_inc_pad' or 'average_exc_pad'
        (default: 'max')
:param pad: (padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
    :warning: The cuDNN library only works with GPUs that have a compute
        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.
:note: This Op implements the ignore_border=True of max_pool_2d.
"""
img = gpu_contiguous(img)
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
return GpuDnnPool()(img, desc)
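# Hypothetical usage sketch (not part of this commit): 2x2 max pooling
# with stride 2 and no padding over a 4D gpuarray variable img:
#     pooled = dnn_pool(img, ws=(2, 2), stride=(2, 2), mode='max')
# This matches max_pool_2d(..., ignore_border=True).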
class GpuDnnSoftmaxBase(DnnBase):
"""
Op for the cuDNN Softmax.
:param tensor_format: Whether the data format is 'bc01' or 'b01c'.
:param algo: 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively.
:param mode: 'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per
image across 'c'.
"""
__props__ = ('tensor_format', 'mode', 'algo')
def __init__(self, tensor_format, algo, mode):
assert(tensor_format in ('bc01', 'b01c'))
DnnBase.__init__(self)
self.tensor_format = tensor_format
assert(algo in ('fast', 'accurate'))
self.algo = algo
assert(mode in ('instance', 'channel'))
self.mode = mode
self.tensor_4d_descs = [softmax_input
for softmax_input in self.softmax_inputs]
self.tensor_4d_descs.append('softmax_output')
def infer_shape(self, node, shape):
if self.direction == 'forward':
return [shape[0]]
else:
return [shape[1]]
def _define_tensor4d_desc(self, name, id):
return """
cudnnTensorDescriptor_t %(id)s_%(name)s;
""" % dict(name=name, id=id)
def _init_tensor4d_desc(self, name, id, fail):
return """
%(id)s_%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&%(id)s_%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
": %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, id=id, fail=fail)
def _clean_tensor4d_desc(self, name, id):
return """
if(%(id)s_%(name)s!= NULL)
cudnnDestroyTensorDescriptor(%(id)s_%(name)s);
""" % dict(name=name, id=id)
def c_support_code_struct(self, node, name):
result = ''
for id in self.tensor_4d_descs:
result += self._define_tensor4d_desc(name, id)
return result
def c_init_code_struct(self, node, name, sub):
result = """
cudnnStatus_t err%(name)s;
""" % dict(name=name)
for id in self.tensor_4d_descs:
result += self._init_tensor4d_desc(name, id, sub['fail'])
return result
def c_cleanup_code_struct(self, node, name):
result = ''
for id in self.tensor_4d_descs:
result += self._clean_tensor4d_desc(name, id)
return result
def c_code(self, node, name, inputs, outputs, sub):
ins = inputs
outs, = outputs
if self.tensor_format == 'b01c':
tensor_format = 1
else:
tensor_format = 0
if self.mode == 'instance':
mode = 1
else:
mode = 0
if self.algo == 'fast':
algo = 1
else:
algo = 0
# Setup configuration variables.
result = """
cudnnStatus_t err%(name)s;
cudnnTensorFormat_t format%(name)s = CUDNN_TENSOR_NCHW;
if (%(tensor_format)d == 1)
format%(name)s = CUDNN_TENSOR_NHWC;
cudnnSoftmaxAlgorithm_t algo%(name)s = CUDNN_SOFTMAX_ACCURATE;
if (%(algo)d == 1)
algo%(name)s = CUDNN_SOFTMAX_FAST;
cudnnSoftmaxMode_t mode%(name)s = CUDNN_SOFTMAX_MODE_CHANNEL;
if (%(mode)d == 1)
mode%(name)s = CUDNN_SOFTMAX_MODE_INSTANCE;
""" % dict(name=name, tensor_format=tensor_format, mode=mode, algo=algo)
# Validate the input and build the input variables.
for input_idx, input_name in enumerate(self.softmax_inputs):
result += c_set_tensor4d(ins[input_idx], input_name + "_" + name,
"err" + name, sub['fail'])
subs = dict(ins=ins[-1], outs=outs, fail=sub['fail'],
name=name)
for idx, softmax_input in enumerate(self.softmax_inputs):
subs['name%d' % idx] = softmax_input
subs['ins%d' % idx] = inputs[idx]
# Build and prepare the output variable.
result += """
if (theano_prep_output(&%(outs)s, PyGpuArray_NDIM(%(ins)s),
PyGpuArray_DIMS(%(ins)s), %(ins)s->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0)
{
%(fail)s
}
""" % subs
result += c_set_tensor4d(outs,
"softmax_output_" + name,
"err" + name, sub['fail'])
# Add on a call to the method that does the actual work.
result += self.method() % subs
return result
def c_code_cache_version(self):
return (0, 7, version())
def method(self):
raise NotImplementedError('GpuDnnSoftmaxBase::method')
class GpuDnnSoftmax(GpuDnnSoftmaxBase):
"""
Op for the cuDNN Softmax.
:param tensor_format: Whether the data format is 'bc01' or 'b01c'.
:param algo: 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively.
:param mode: 'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per
image across 'c'.
"""
direction = 'forward'
softmax_inputs = ['softmax_input']
def make_node(self, x):
x = as_gpuarray_variable(x)
assert x.ndim == 4
return Apply(self, [x], [x.type()])
def method(self):
return """
#ifndef CUDNN_VERSION
err%(name)s = cudnnSoftmaxForward(
_handle,
algo%(name)s,
mode%(name)s,
softmax_input_%(name)s,
PyGpuArray_DEV_DATA(%(ins)s),
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
#else
{
const float alpha = 1.;
const float beta = 0.;
err%(name)s = cudnnSoftmaxForward(
_handle,
algo%(name)s,
mode%(name)s,
(void*) &alpha,
softmax_input_%(name)s,
PyGpuArray_DEV_DATA(%(ins)s),
(void*) &beta,
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
}
#endif
"""
def grad(self, inp, grads):
x, = inp
g_sm, = grads
sm = self.make_node(x).outputs[0]
return [GpuDnnSoftmaxGrad(
self.tensor_format,
self.algo,
self.mode
)(g_sm, sm)]
class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
"""
Op for the cuDNN SoftmaxGrad.
:param tensor_format: Whether the data format is 'bc01' or 'b01c'.
:param algo: 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively.
:param mode: 'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per
image across 'c'.
"""
direction = 'backward'
softmax_inputs = ['softmax_gout', 'softmax_input']
def make_node(self, dy, sm):
dy = as_gpuarray_variable(dy)
sm = as_gpuarray_variable(sm)
assert dy.ndim == 4
assert sm.ndim == 4
return Apply(self, [dy, sm], [sm.type.make_variable()])
def method(self):
return """
#ifndef CUDNN_VERSION
err%(name)s = cudnnSoftmaxBackward(
_handle,
algo%(name)s,
mode%(name)s,
%(name1)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins1)s),
%(name0)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins0)s),
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
#else
{
const float alpha = 1.;
const float beta = 0.;
err%(name)s = cudnnSoftmaxBackward(
_handle,
algo%(name)s,
mode%(name)s,
(void*) &alpha,
%(name1)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins1)s),
%(name0)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins0)s),
(void*) &beta,
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
}
#endif
"""
# @register_opt('cudnn') # this optimizer is registered in opt.py instead.
@local_optimizer([GpuConv])
def local_conv_dnn(node):
if not dnn_available():
return
if isinstance(node.op, GpuConv):
if node.op.border_mode not in ['full', 'valid']:
return
img, kern = node.inputs
border_mode = node.op.border_mode
subsample = node.op.subsample
direction_hint = node.op.direction_hint
rval = dnn_conv(img, kern,
border_mode=border_mode, subsample=subsample,
direction_hint=direction_hint)
return [rval]
# This optimizer is registered in opt.py as part of the meta-optimizer.
# It tries exactly the opposite code path of what local_conv_dnn() uses,
# because for some input/kernel shape configurations, this is faster.
@local_optimizer([GpuConv])
def local_conv_dnn_alternative(node):
if not dnn_available():
return
if isinstance(node.op, GpuConv):
border_mode = node.op.border_mode
subsample = node.op.subsample
if border_mode not in ['full', 'valid'] or subsample != (1, 1):
return
img, kern = node.inputs
direction_hint = node.op.direction_hint
if border_mode == 'full':
# for a full convolution, try using the forward pass instead
# of the backward pass wrt. inputs
direction_hint = 'forward!'
elif border_mode == 'valid':
# for a valid convolution, try using the backward pass wrt.
# weights instead of the forward pass and vice versa
if direction_hint == 'bprop weights':
direction_hint = 'forward'
else:
direction_hint = 'bprop weights'
rval = dnn_conv(img, kern,
border_mode=border_mode, subsample=subsample,
direction_hint=direction_hint)
if node.outputs[0].broadcastable != rval.broadcastable:
rval = tensor.patternbroadcast(
rval, node.outputs[0].type.broadcastable)
return [rval]
conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
@local_optimizer([GpuDnnConv], inplace=True)
def local_dnn_conv_inplace(node):
if type(node.op) != GpuDnnConv or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConv(workmem=node.op.workmem, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True)
def local_dnn_convgw_inplace(node):
if type(node.op) != GpuDnnConvGradW or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradW(inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True)
def local_dnn_convgi_inplace(node):
if type(node.op) != GpuDnnConvGradI or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradI(inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace,
local_dnn_convgi_inplace,
name="local_dnn_conv_inplace"),
70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn')
@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
def local_dnn_conv_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convw_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convi_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
return [GpuDnnConvGradI()(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW()(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI()(*inputs)]
@register_opt('cudnn')
@op_lifter([DownsampleFactorMax])
def local_pool_dnn_alternative(node):
if not dnn_available():
return
if not node.op.ignore_border:
return
img, = node.inputs
ds = node.op.ds
stride = node.op.st
pad = node.op.padding
mode = node.op.mode
return dnn_pool(gpu_contiguous(img.owner.inputs[0]),
ds, stride=stride, pad=pad, mode=mode)
@register_opt('cudnn')
@op_lifter([DownsampleFactorMaxGrad])
def local_pool_dnn_grad_stride(node):
if not dnn_available():
return
if not node.op.ignore_border:
return
inp, out, inp_grad = node.inputs
ds = node.op.ds
st = node.op.st
pad = node.op.padding
mode = node.op.mode
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
return GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
desc)
@register_opt('cudnn')
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(node):
if not dnn_available():
return
if isinstance(node.op, GpuSoftmax):
ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
ins = gpu_contiguous(ins)
out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(ins)
out = as_gpuarray_variable(out.dimshuffle(0, 1))
return [out]
class NoCuDNNRaise(Optimizer):
def apply(self, fgraph):
""" Raise a RuntimeError if cudnn can't be used"""
if not dnn_available():
# Make an assert error as we want Theano to fail, not
# just skip this optimization.
raise AssertionError(
"cuDNN optimization was enabled, but Theano was not able"
" to use it. We got this error: \n" +
dnn_available.msg)
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn')
@op_lifter([SoftmaxGrad])
def local_softmax_dnn_grad(node):
if not dnn_available():
return
ins = []
for n in node.inputs:
if isinstance(n.owner.op, HostFromGpu):
n = n.owner.inputs[0]
if n.ndim != 2:
return
ins.append(n.dimshuffle(0, 1, 'x', 'x'))
out = GpuDnnSoftmaxGrad('bc01', 'accurate', 'channel')(
gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
return [out.dimshuffle(0, 1)]
#section support_code
static cudnnHandle_t _handle = NULL;
static int
c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
cudnnDataType_t dt;
size_t ds;
switch (var->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d");
return -1;
}
ds = gpuarray_get_elsize(var->ga.typecode);
int str0, str1, str2, str3;
  // cudnn does not like 0s in strides
str3 = PyGpuArray_STRIDES(var)[3]?PyGpuArray_STRIDES(var)[3]/ds:1;
str2 = PyGpuArray_STRIDES(var)[2]?PyGpuArray_STRIDES(var)[2]/ds:PyGpuArray_DIMS(var)[3];
str1 = PyGpuArray_STRIDES(var)[1]?PyGpuArray_STRIDES(var)[1]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3];
str0 = PyGpuArray_STRIDES(var)[0]?PyGpuArray_STRIDES(var)[0]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3]*PyGpuArray_DIMS(var)[1];
cudnnStatus_t err = cudnnSetTensor4dDescriptorEx(
desc, dt,
PyGpuArray_DIM(var, 0), PyGpuArray_DIM(var, 1),
PyGpuArray_DIM(var, 2), PyGpuArray_DIM(var, 3),
str0, str1, str2, str3);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set tensor4d descriptor: %s",
cudnnGetErrorString(err));
return -1;
}
return 0;
}
static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
cudnnDataType_t dt;
if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
PyErr_SetString(PyExc_ValueError,
"Only contiguous filters (kernels) are supported.");
return -1;
}
switch (var->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
return -1;
}
cudnnStatus_t err = cudnnSetFilter4dDescriptor(
desc, dt,
PyGpuArray_DIMS(var)[0], PyGpuArray_DIMS(var)[1],
PyGpuArray_DIMS(var)[2], PyGpuArray_DIMS(var)[3]);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set filter descriptor: %s.",
cudnnGetErrorString(err));
return -1;
}
return 0;
}
#section init_code
{
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
#if PY_MAJOR_VERSION >= 3
return NULL;
#else
return;
#endif
}
}
#section support_code_struct
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;
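/* The descriptors are created once per apply node here and are filled
   in with the actual shapes at each call. */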
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
if (APPLY_SPECIFIC(kerns) != NULL)
cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
#section support_code_struct
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyGpuArrayObject *om,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
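  /* cuDNN takes alpha/beta as host pointers whose type must match the
     data type, so float copies (af, bf) are kept for GA_FLOAT data. */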
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
switch (input->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER,
pygpu_default_context()) != 0)
return 1;
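  /* With beta != 0 the existing output contributes to the result, so
     copy om into the freshly prepared buffer first. */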
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
#endif
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
{
size_t worksize;
gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
CONV_ALGO,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
    /*
     * This is less than ideal since we need to free the workspace
     * afterwards (which introduces a synchronization point), but we
     * don't have a module to place a nice get_work_mem() function in.
     */
if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
return 1;
}
}
err = cudnnConvolutionForward(
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, CONV_ALGO,
worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
if (worksize != 0)
c->ops->buffer_release(workspace);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
}
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
switch (im->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
#else
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER,
pygpu_default_context()) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*input, im))
return 1;
#endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
err = cudnnConvolutionBackwardData(
_handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc,
beta_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
return 1;
switch (input->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
#else
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER,
pygpu_default_context()) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*kerns, km))
return 1;
#endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
err = cudnnConvolutionBackwardFilter(
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc,
beta_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
......@@ -42,4 +42,9 @@ static PyGpuArrayObject *theano_try_copy(PyGpuArrayObject *out,
return out;
}
/* This is guaranteed to work and return the raw CUDA/OpenCL object on
 * all recent (as of June 2015) versions of libgpuarray. It is also
 * promised to keep working in future versions. */
#define PyGpuArray_DEV_DATA(ary) (*(void **)((ary)->ga.data))
#endif
......@@ -12,11 +12,13 @@ from theano import tensor, scalar, gof
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant
from .basic_ops import (host_from_gpu, gpu_from_host,
HostFromGpu, GpuFromHost,
......@@ -39,6 +41,10 @@ gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()
# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
......@@ -689,6 +695,9 @@ def local_gpu_conv(node):
out.values_eq_approx = values_eq_approx
return [out]
# Register this here so that it goes after 'local_gpu_conv'
register_opt()(conv_groupopt)
@register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda])
......
......@@ -7,10 +7,10 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, host_from_gpu
from .basic_ops import GpuFromHost, HostFromGpu
from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
_one = scal.constant(numpy.asarray(1.0, dtype='float64'))
def grab_cpu_scalar(v, nd):
......@@ -18,10 +18,10 @@ def grab_cpu_scalar(v, nd):
n = v.owner
if (isinstance(n.op, GpuDimShuffle) and
n.op.new_order == ('x',) * nd):
return host_from_gpu(n.inputs[0])
            return grab_cpu_scalar(n.inputs[0], nd=nd)
elif (isinstance(n.op, DimShuffle) and
n.op.new_order == ('x',) * nd):
return n.inputs[0]
            return grab_cpu_scalar(n.inputs[0], nd=nd)
elif isinstance(n.op, GpuFromHost):
return grab_cpu_scalar(n.inputs[0], nd=nd)
else:
......@@ -37,7 +37,7 @@ def find_node(v, cls, ignore_clients=False):
# that has the op class specified. If ignore_clients is False (the
# default) it will only dig through nodes that have a single
# client.
if v.owner is not None and (ignore_clients or v.clients == 1):
if v.owner is not None and (ignore_clients or len(v.clients) == 1):
if isinstance(v.owner.op, cls):
return v.owner
elif (isinstance(v.owner.op, GpuFromHost) and
......
import logging
from nose.plugins.skip import SkipTest
import numpy
from itertools import product
import theano
from six import StringIO
import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.sandbox.neighbours import images2neibs
from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
from .. import dnn
from ..basic_ops import GpuAllocEmpty
from .test_basic_ops import mode_with_gpu, mode_without_gpu
from . import test_nnet
def test_dnn_conv_desc_merge():
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img_shp = T.as_tensor_variable(
numpy.asarray([2, 1, 8, 8]).astype('int64'))
kern_shp = T.as_tensor_variable(
numpy.asarray([3, 1, 2, 2]).astype('int64'))
desc1 = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(2, 2),
conv_mode='conv')(img_shp, kern_shp)
desc2 = dnn.GpuDnnConvDesc(border_mode='full', subsample=(1, 1),
conv_mode='cross')(img_shp, kern_shp)
# CDataType is not DeepCopyable so this will crash if we don't use
# borrow=True
f = theano.function([], [theano.Out(desc1, borrow=True),
theano.Out(desc2, borrow=True)])
d1, d2 = f()
    # d1 and d2 would compare equal if the descriptors had been merged,
    # which would be bad.
assert d1 != d2
def test_dnn_conv_merge():
    # This tests that we correctly merge multiple dnn_conv.
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img_shp = [2, 5, 6, 8]
kern_shp = [3, 5, 5, 6]
img = T.ftensor4('img')
kern = T.ftensor4('kern')
out = T.ftensor4('out')
desc = dnn.GpuDnnConvDesc(
border_mode='valid')(img.shape, kern.shape)
# Test forward op
o1 = dnn.dnn_conv(img, kern)
o2 = dnn.dnn_conv(img, kern)
f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
d1, d2 = f(numpy.random.rand(*img_shp).astype('float32'),
numpy.random.rand(*kern_shp).astype('float32'))
topo = f.maker.fgraph.toposort()
assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConv)]) == 1
# Test grad w op
o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradW)]) == 1
# Test grad i op
o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradI)]) == 1
def test_dnn_conv_inplace():
"""This test that we have inplace work correctly even when
GpuAllocEmpty get merged together.
"""
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img_shp = [2, 5, 6, 8]
kern_shp = [3, 5, 5, 6]
img = T.ftensor4('img')
kern = T.ftensor4('kern')
out = T.ftensor4('out')
desc1 = dnn.GpuDnnConvDesc(border_mode='valid', conv_mode='conv')(
img.shape, kern.shape)
desc2 = dnn.GpuDnnConvDesc(
border_mode='valid', conv_mode='cross')(img.shape, kern.shape)
# Test forward op
o1 = dnn.dnn_conv(img, kern, conv_mode='conv')
o2 = dnn.dnn_conv(img, kern, conv_mode='cross')
f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
d1, d2 = f(numpy.random.rand(*img_shp).astype('float32'),
numpy.random.rand(*kern_shp).astype('float32'))
topo = f.maker.fgraph.toposort()
convs = [n for n in topo if isinstance(n.op, dnn.GpuDnnConv)]
assert len(convs) == 2
assert all([node.op.inplace for node in convs])
assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
# Test grad w op
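    # GpuAllocEmpty provides an uninitialized buffer the grad op can
    # overwrite inplace.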
out = GpuAllocEmpty(kern.dtype)(*kern.shape)
o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc1)
o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc2)
f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
convs = [n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradW)]
assert len(convs) == 2
assert all([node.op.inplace for node in convs])
assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
# Test grad i op
out = GpuAllocEmpty(img.dtype)(*img.shape)
o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc1)
o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc2)
f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
convs = [n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradI)]
assert len(convs) == 2
assert all([node.op.inplace for node in convs])
assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
def pool_2d_i2n(input, ds=(2, 2), strides=None,
pad=(0, 0),
pool_function=T.max, mode='ignore_borders'):
if strides is None:
strides = ds
if strides[0] > ds[0] or strides[1] > ds[1]:
raise RuntimeError(
"strides should be smaller than or equal to ds,"
" strides=(%d, %d) and ds=(%d, %d)" %
(strides + ds))
shape = input.shape
if pad != (0, 0):
assert pool_function is T.max
pad_x = pad[0]
pad_y = pad[1]
a = T.alloc(-numpy.inf, shape[0], shape[1], shape[2] + pad_x * 2,
shape[3] + pad_y * 2)
input = T.set_subtensor(a[:, :,
pad_x:pad_x + shape[2],
pad_y:pad_y + shape[3]],
input)
shape = input.shape
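    # images2neibs lays out one pooling window per row; reducing over
    # axis 1 therefore pools each window.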
neibs = images2neibs(input, ds, strides, mode=mode)
pooled_neibs = pool_function(neibs, axis=1)
output_width = (shape[2] - ds[0]) // strides[0] + 1
output_height = (shape[3] - ds[1]) // strides[1] + 1
pooled_output = pooled_neibs.reshape((shape[0], shape[1],
output_width, output_height))
return pooled_output
def test_pooling():
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
x = T.ftensor4()
for mode, pad in product(('max', 'average_inc_pad', 'average_exc_pad'),
                             ((0, 0), (1, 0), (0, 1), (2, 3), (3, 2))):
if mode == 'max':
func = T.max
else:
func = T.mean
if pad != (0, 0) and dnn.version() == -1:
continue
if pad != (0, 0) and func is T.mean:
continue
for ws in (4, 2, 5):
for stride in (2, 3):
if stride > ws:
continue
if pad[0] > stride or pad[1] > stride:
# Not implemented
continue
# We will check that the opt introduced it.
out1 = max_pool_2d(x, (ws, ws),
st=(stride, stride),
ignore_border=True,
padding=pad, mode=mode)
out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride),
pad=pad,
pool_function=func)
mode_without_gpu2 = mode_without_gpu.including()
mode_without_gpu2.check_isfinite = False
f1 = theano.function([x], out1, mode=mode_with_gpu)
assert any([isinstance(node.op, dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
f2 = theano.function([x], out2, mode=mode_without_gpu2)
assert not any([isinstance(node.op, dnn.GpuDnnPool)
for node in f2.maker.fgraph.apply_nodes])
for shp in [(1, 10, 100, 100),
(1, 3, 99, 99),
(32, 1, 147, 197),
]:
data = numpy.random.normal(0, 1, shp).astype("float32")
a = f1(data).__array__()
b = f2(data).__array__()
assert numpy.allclose(a, b,
atol=numpy.finfo(numpy.float32).eps)
# Test the grad
for shp in [(1, 1, 2, 2),
(1, 1, 3, 3)]:
data = numpy.random.normal(0, 1, shp).astype("float32") * 10
ws = 2
stride = 2
if pad[0] > stride or pad[1] > stride:
# Not implemented
continue
            # This tests the CPU grad + opt + GPU implementation
def fn(x):
return max_pool_2d(x, (ws, ws), ignore_border=True,
padding=pad, mode=mode)
theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that the opt would have inserted it.
fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
# Test the GPU grad + GPU implementation
def fn(x):
dnn_op = dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
pad=pad,
mode=mode)
return dnn_op
theano.tests.unittest_tools.verify_grad(
fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that we get the good op.
fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
g_out = fg(data)
            # Compare against the CPU result
out = max_pool_2d(x, (ws, ws),
padding=pad,
ignore_border=True, mode=mode)
fc = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu)
assert any([isinstance(node.op, DownsampleFactorMaxGrad)
for node in fc.maker.fgraph.toposort()])
c_out = fc(data)
assert numpy.allclose(c_out, g_out)
def test_pooling_opt():
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
x = T.ftensor4()
f = theano.function(
[x],
max_pool_2d(x, ds=(2, 2), ignore_border=True),
mode=mode_with_gpu)
assert any([isinstance(n.op, dnn.GpuDnnPool)
for n in f.maker.fgraph.toposort()])
f = theano.function(
[x],
T.grad(max_pool_2d(x, ds=(2, 2), ignore_border=True).sum(), x),
mode=mode_with_gpu.including("cudnn"))
assert any([isinstance(n.op, dnn.GpuDnnPoolGrad)
for n in f.maker.fgraph.toposort()])
def test_dnn_tag():
"""
    Test that if cuDNN isn't available we raise an error, and that if it
    is available, we use it.
"""
x = T.ftensor4()
old = theano.config.on_opt_error
theano.config.on_opt_error = "raise"
sio = StringIO()
handler = logging.StreamHandler(sio)
logging.getLogger('theano.compile.tests.test_dnn').addHandler(handler)
    # Silence original handler when intentionally generating warning messages
logging.getLogger('theano').removeHandler(theano.logging_default_handler)
raised = False
try:
f = theano.function(
[x],
max_pool_2d(x, ds=(2, 2), ignore_border=True),
mode=mode_with_gpu.including("cudnn"))
except (AssertionError, RuntimeError):
assert not dnn.dnn_available()
raised = True
finally:
theano.config.on_opt_error = old
logging.getLogger(
'theano.compile.tests.test_dnn').removeHandler(handler)
logging.getLogger('theano').addHandler(theano.logging_default_handler)
if not raised:
assert dnn.dnn_available()
assert any([isinstance(n.op, dnn.GpuDnnPool)
for n in f.maker.fgraph.toposort()])
class TestDnnInferShapes(utt.InferShapeTester):
def setUp(self):
super(TestDnnInferShapes, self).setUp()
self.mode = mode_with_gpu
def test_softmax(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
t = T.ftensor4('t')
rand_tensor = numpy.asarray(
numpy.random.rand(5, 4, 3, 2),
dtype='float32'
)
self._compile_and_check(
[t],
[dnn.GpuDnnSoftmax('bc01', 'accurate', 'channel')(t)],
[rand_tensor],
dnn.GpuDnnSoftmax
)
self._compile_and_check(
[t],
[
T.grad(
dnn.GpuDnnSoftmax(
'bc01',
'accurate',
'channel'
)(t).mean(),
t
)
],
[rand_tensor],
dnn.GpuDnnSoftmaxGrad
)
def test_conv(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
kerns = T.ftensor4('kerns')
out = T.ftensor4('out')
img_val = numpy.asarray(
numpy.random.rand(7, 2, 6, 4),
dtype='float32'
)
kern_vals = numpy.asarray(
numpy.random.rand(8, 2, 4, 3),
dtype='float32'
)
for params in product(
['valid', 'full'],
[(1, 1), (2, 2)],
['conv', 'cross']
):
out_vals = numpy.zeros(
dnn.GpuDnnConv.get_out_shape(img_val.shape, kern_vals.shape,
border_mode=params[0],
subsample=params[1]),
dtype='float32')
desc = dnn.GpuDnnConvDesc(
border_mode=params[0],
subsample=params[1],
conv_mode=params[2]
)(img.shape, kerns.shape)
conv = dnn.GpuDnnConv()(img, kerns, out, desc)
self._compile_and_check(
[img, kerns, out],
[conv],
[img_val, kern_vals, out_vals],
dnn.GpuDnnConv
)
def test_conv_gradw(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
kerns = T.ftensor4('kerns')
out = T.ftensor4('out')
img_val = numpy.asarray(
numpy.random.rand(2, 5, 6, 8),
dtype='float32'
)
kern_vals = numpy.asarray(
numpy.random.rand(2, 1, 5, 6),
dtype='float32'
)
for params in product(
['valid', 'full'],
[(1, 1)], # strides besides (1, 1)
['conv', 'cross']
):
temp_img = img.dimshuffle(1, 0, 2, 3)
temp_kerns = kerns
if params[2] == 'conv':
temp_kerns = temp_kerns[:, :, ::-1, ::-1]
temp_kerns = temp_kerns.dimshuffle(1, 0, 2, 3)
shape = (
kern_vals.shape[1], img_val.shape[1],
img_val.shape[2] - kern_vals.shape[2] + 1,
img_val.shape[3] - kern_vals.shape[3] + 1
)
out_vals = numpy.zeros(shape, dtype='float32')
desc = dnn.GpuDnnConvDesc(
border_mode=params[0],
subsample=params[1],
conv_mode=params[2]
)(temp_img.shape, out.shape)
conv_grad_w = dnn.GpuDnnConvGradW()(
temp_img,
temp_kerns,
out,
desc,
)
self._compile_and_check(
[temp_img, temp_kerns, out],
[conv_grad_w],
[img_val, kern_vals, out_vals],
dnn.GpuDnnConvGradW
)
def test_conv_gradi(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
kerns = T.ftensor4('kerns')
out = T.ftensor4('out')
img_val = numpy.asarray(
numpy.random.rand(3, 4, 5, 6),
dtype='float32'
)
kern_vals = numpy.asarray(
numpy.random.rand(13, 14, 15, 16),
dtype='float32'
)
for params in product(
['valid'], # Should this work for 'full'?
[(1, 1)],
['conv', 'cross']
):
temp_kerns = kerns.dimshuffle(1, 0, 2, 3)
shape = (
img_val.shape[0], kern_vals.shape[1],
img_val.shape[2] + kern_vals.shape[2] - 1,
img_val.shape[3] + kern_vals.shape[3] - 1
)
out_vals = numpy.zeros(shape, dtype='float32')
desc = dnn.GpuDnnConvDesc(
border_mode=params[0],
subsample=params[1],
conv_mode=params[2]
)(out.shape, temp_kerns.shape)
conv_grad_i = dnn.GpuDnnConvGradI()(
temp_kerns,
img,
out,
desc,
)
self._compile_and_check(
[temp_kerns, img, out],
[conv_grad_i],
[kern_vals, img_val, out_vals],
dnn.GpuDnnConvGradI
)
def test_pool(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
img_val = numpy.asarray(
numpy.random.rand(2, 3, 4, 5),
dtype='float32'
)
for params in product(
[(1, 1), (2, 2), (3, 3)],
[(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad', 'average_exc_pad']
):
desc = dnn.GpuDnnPoolDesc(
ws=params[0],
stride=params[1],
mode=params[2]
)()
self._compile_and_check(
[img],
[dnn.GpuDnnPool()(img, desc)],
[img_val],
dnn.GpuDnnPool
)
def test_pool_grad(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
img_grad = T.ftensor4('img_grad')
out = T.ftensor4('out')
img_val = numpy.asarray(
numpy.random.rand(2, 3, 4, 5),
dtype='float32'
)
img_grad_val = numpy.asarray(
numpy.random.rand(2, 3, 4, 5),
dtype='float32'
)
out_val = numpy.asarray(
numpy.random.rand(2, 3, 4, 5),
dtype='float32'
)
for params in product(
[(1, 1), (2, 2), (3, 3)],
[(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad']
):
desc = dnn.GpuDnnPoolDesc(
ws=params[0],
stride=params[1],
mode=params[2]
)()
pool_grad = dnn.GpuDnnPoolGrad()(
img,
out,
img_grad,
desc
)
self._compile_and_check(
[img, img_grad, out],
[pool_grad],
[img_val, img_grad_val, out_val],
dnn.GpuDnnPoolGrad
)
# this has been a problem in the past
def test_dnn_conv_border_mode():
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4()
kern = T.ftensor4()
dnn.dnn_conv(img, kern, border_mode=1)
dnn.dnn_conv(img, kern, border_mode=(2, 3))
dnn.dnn_conv(img, kern, border_mode='full')
dnn.dnn_conv(img, kern, border_mode='valid')
def test_dnn_conv_alpha_output_merge():
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4()
kern = T.ftensor4()
out = T.ftensor4()
b = 1
c = 4
f = 3
ih = 5
iw = 8
kh = 2
kw = 6
img_val = numpy.random.random((b, c, ih, iw)).astype('float32')
kern_val = numpy.random.random((f, c, kh, kw)).astype('float32')
out_val = numpy.random.random((b, f, ih - kh + 1,
iw - kw + 1)).astype('float32')
conv = dnn.dnn_conv(img, kern)
gw = theano.grad(conv.sum(), kern)
gi = theano.grad(conv.sum(), img)
lr = numpy.asarray(0.05, dtype='float32')
if dnn.version() == -1:
# Can't merge alpha with cudnn v1
fr = conv + out
wr = kern + gw
ir = img + gi
else:
fr = lr * (conv + out)
wr = kern + lr * gw
ir = img + lr * gi
f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
dnn.GpuDnnConv)
assert isinstance(f1.maker.fgraph.outputs[1].owner.inputs[0].owner.op,
dnn.GpuDnnConvGradW)
assert isinstance(f1.maker.fgraph.outputs[2].owner.inputs[0].owner.op,
dnn.GpuDnnConvGradI)
mode = mode_with_gpu
mode = mode.excluding('local_dnn_conv_alpha_merge')
mode = mode.excluding('local_dnn_convw_alpha_merge')
mode = mode.excluding('local_dnn_convi_alpha_merge')
mode = mode.excluding('local_dnn_conv_output_merge')
mode = mode.excluding('local_dnn_convw_output_merge')
mode = mode.excluding('local_dnn_convi_output_merge')
f2 = theano.function([img, kern, out], [fr, wr, ir], mode=mode)
assert not isinstance(f2.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
dnn.GpuDnnConv)
assert not isinstance(f2.maker.fgraph.outputs[1].owner.inputs[0].owner.op,
dnn.GpuDnnConvGradW)
assert not isinstance(f2.maker.fgraph.outputs[2].owner.inputs[0].owner.op,
dnn.GpuDnnConvGradI)
out_f1 = f1(img_val, kern_val, out_val)
out_f2 = f2(img_val, kern_val, out_val)
assert len(out_f1) == len(out_f2)
for v1, v2 in zip(out_f1, out_f2):
utt.assert_allclose(v1, v2)
def test_dnn_conv_grad():
if not dnn.dnn_available() or dnn.version() == -1:
raise SkipTest('alpha != 1.0 not supported in cudnn v1')
b = 1
c = 4
f = 3
ih = 2
iw = 8
kh = 2
kw = 2
img_val = numpy.random.random((b, c, ih, iw)).astype('float32')
kern_val = numpy.random.random((f, c, kh, kw)).astype('float32')
    out_val = numpy.random.random((b, f, ih - kh + 1,
iw - kw + 1)).astype('float32')
def dconv(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConv()(img, kern, out, desc, alpha=0.5, beta=0.75)
def dconvi(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConvGradI()(kern, out, img, desc, alpha=-1.0,
beta=0.0)
def dconvw(img, kern, out):
desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='conv')(img.shape, kern.shape)
return dnn.GpuDnnConvGradW()(img, out, kern, desc, alpha=0.75,
beta=-1.0)
utt.verify_grad(dconv, [img_val, kern_val, out_val])
utt.verify_grad(dconvi, [img_val, kern_val, out_val])
utt.verify_grad(dconvw, [img_val, kern_val, out_val])
def test_version():
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
assert isinstance(dnn.version(), (int, tuple))
class test_SoftMax(test_nnet.test_SoftMax):
gpu_op = dnn.GpuDnnSoftmax
gpu_grad_op = dnn.GpuDnnSoftmaxGrad
mode = mode_with_gpu
def test_softmax_shape_0(self):
raise SkipTest("Cudnn do not suport 0 shapes")
def test_softmax_grad(self):
def cmp(n, m, f, f_gpu):
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
gdata = numpy.asarray(data)[:, :, None, None]
out = f(data)
gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
x = T.matrix('x', 'float32')
x_gpu = T.tensor4('x_gpu', 'float32')
f_z = T.nnet.softmax_op
f_gpu = dnn.GpuDnnSoftmax(
'bc01',
'accurate',
'channel'
)
# Verify the grad operation
dims = (2, 3, 4, 5)
gdata = numpy.arange(
numpy.product(dims),
dtype='float32'
).reshape(dims)
T.verify_grad(f_gpu, [gdata], rng=numpy.random,
mode=mode_with_gpu)
# Verify that the CPU and GPU implementations return the same results
# up to a tolerance.
self._test_softmax(
x,
x_gpu,
f_z,
f_gpu,
cmp
)
self._test_softmax(
x, x, f_z, f_z, self._cmp
)
# Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad
# optimization is applied when cudnn is required
y = T.fvector('y')
f = theano.function(
[y],
T.grad(T.nnet.softmax(y).mean(), y),
mode=mode_with_gpu
)
sorted_f = f.maker.fgraph.toposort()
assert(len([i
for i in sorted_f
if isinstance(
i.op,
self.gpu_grad_op
)]) == 1)
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.tensor.nnet.SoftmaxGrad
)]) == 0)
# Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad
# optimization is not applied when cudnn is excluded or not
# available
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
y = T.fvector('y')
f = theano.function(
[y],
T.grad(T.nnet.softmax(y).mean(), y),
mode=mode_wo_cudnn
)
sorted_f = f.maker.fgraph.toposort()
assert(len([i
for i in sorted_f
if isinstance(
i.op,
self.gpu_grad_op
)]) == 0)
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.tensor.nnet.SoftmaxGrad
)]) == 1)
# Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad do not
# crash with manual graph
y = T.fvector('y')
o = theano.tensor.nnet.SoftmaxGrad()(y, y*2)
f = theano.function([y], o, mode=mode_with_gpu)
sorted_f = f.maker.fgraph.toposort()
assert(len([i
for i in sorted_f
if isinstance(
i.op,
self.gpu_grad_op
)]) == 1)
assert(len([i
for i in sorted_f
if isinstance(
i.op,
theano.tensor.nnet.SoftmaxGrad
)]) == 0)
from __future__ import print_function
from nose.plugins.skip import SkipTest
import numpy
import unittest
import theano
import theano.tensor as T
......@@ -11,12 +12,13 @@ from theano.sandbox import gpuarray
# We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
from ..nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax)
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
"""
......@@ -290,3 +292,96 @@ def softmax_unittest_template(dtypeInput):
cmp(2, 10000)
cmp(128, 16 * 1024)
cmp(128, 64 * 1024)
class test_SoftMax(unittest.TestCase):
gpu_op = GpuSoftmax
mode = mode_wo_cudnn
def _test_softmax(
self,
x,
x_gpu,
f_z,
f_gpu_z,
cmp
):
"""
        This is a basic test for GpuSoftmax and GpuDnnSoftmax.
        We check that we loop when there are too many blocks and that we
        use the slower code path when there isn't enough shared memory.
"""
f_z_out = f_z(x)
f_gpu_z_out = f_gpu_z(x_gpu)
f = theano.function([x], f_z_out, mode=mode_without_gpu)
f_gpu = theano.function([x_gpu], f_gpu_z_out, mode=self.mode)
self._check_types(f, f_gpu, T.nnet.Softmax, self.gpu_op)
# we need to test n>32*1024 to check that we make the block loop.
cmp(1, 5, f, f_gpu)
cmp(2, 5, f, f_gpu)
cmp(10, 5, f, f_gpu)
cmp(100, 5, f, f_gpu)
cmp(1000, 5, f, f_gpu)
cmp(10000, 5, f, f_gpu)
cmp(4074, 400, f, f_gpu)
cmp(784, 784, f, f_gpu)
cmp(4, 1000, f, f_gpu)
cmp(4, 1024, f, f_gpu)
cmp(4, 2000, f, f_gpu)
cmp(4, 2024, f, f_gpu)
        # The GTX285 doesn't have enough shared memory.
cmp(4, 4074, f, f_gpu)
        # The GTX580, 680 and Kepler cards don't have enough shared memory.
cmp(2, 10000, f, f_gpu)
cmp(128, 16 * 1024, f, f_gpu)
cmp(128, 64 * 1024, f, f_gpu)
        # cuDNN permits no more than 2^16 - 1 rows
cmp((2 << 15) - 1, 5, f, f_gpu)
cmp(5, 2 << 15, f, f_gpu)
return f, f_gpu
def _cmp(self, n, m, f, f_gpu):
# print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data)
gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
def _check_types(self, graph, graph_gpu, f_type, f_gpu_type):
assert isinstance(graph.maker.fgraph.toposort()[-1].op, f_type)
assert len([node for node in graph_gpu.maker.fgraph.toposort()
if isinstance(node.op, f_gpu_type)]) == 1
def test_softmax(self):
x = T.fmatrix('x')
z = T.nnet.softmax_op
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp
)
# cuDNN R1 cannot handle these test cases but the Theano softmax can so
# we test them only for the Theano softmax.
self._cmp(2 << 15, 5, f, f_gpu)
def test_softmax_shape_0(self):
x = T.fmatrix('x')
z = T.nnet.softmax_op
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp
)
        # Theano can handle this case, but cuDNN can't.
self._cmp(0, 10, f, f_gpu)
......@@ -593,7 +593,7 @@ def get_scalar_constant_value(orig_v, elemwise=True,
# mess with the stabilization optimization and be too slow.
# We put all the scalar Ops used by get_canonical_form_slice()
# to allow it to determine the broadcast pattern correctly.
elif isinstance(v.owner.op, ScalarFromTensor):
elif isinstance(v.owner.op, (ScalarFromTensor, TensorFromScalar)):
return get_scalar_constant_value(v.owner.inputs[0])
elif isinstance(v.owner.op, scal.ScalarOp):
if isinstance(v.owner.op, scal.Second):
......