Commit 709c14cc authored by Frédéric Bastien

Merge pull request #3091 from nouiz/abergeron-gpuarray_dnn

gpuarray in new back-end
......@@ -56,7 +56,7 @@ if pygpu:
init_dev(config.device)
import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
optdb.add_tags('gpuarray', 'fast_run', 'fast_compile')
elif config.gpuarray.init_device != '':
init_dev(config.gpuarray.init_device)
......
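For reference, a minimal sketch of how the new back-end is initialized by hand. It assumes a working pygpu install and a CUDA device named 'cuda0' (both assumptions, not shown in this diff); setting the gpuarray.init_device flag read above does the same thing automatically at import time.

from theano.sandbox import gpuarray

# init_dev() is the entry point called above with config.device or
# config.gpuarray.init_device; it selects and initializes the device.
# The import-time code above additionally registers
# gpuarray_shared_constructor and adds the 'gpuarray' optimization tags
# when config.device points at a GPU.
gpuarray.init_dev('cuda0')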
......@@ -783,6 +783,10 @@ if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
raise NotImplementedError("grad disabled")
def empty_like(var):
return GpuAllocEmpty(var.type.dtype)(*var.shape)
class GpuContiguous(Op):
"""
Always return a c contiguous output. Copy the input only if it is
......
......@@ -29,12 +29,12 @@ class NVCC_compiler(NVCC_base):
# exist in the past
numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
if bool(numpy_ver < [1, 7]):
flags.append("-D NPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
flags.append("-D NPY_ARRAY_ALIGNED=NPY_ALIGNED")
flags.append("-D NPY_ARRAY_WRITEABLE=NPY_WRITEABLE")
flags.append("-D NPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL")
flags.append("-D NPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS")
flags.append("-D NPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS")
flags.append("-DNPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
flags.append("-DNPY_ARRAY_ALIGNED=NPY_ALIGNED")
flags.append("-DNPY_ARRAY_WRITEABLE=NPY_WRITEABLE")
flags.append("-DNPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL")
flags.append("-DNPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS")
flags.append("-DNPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS")
# If the user didn't specify architecture flags add them
if not any(['-arch=sm_' in f for f in flags]):
......
#ifndef CUDNN_HELPER_H
#define CUDNN_HELPER_H
#include <cudnn.h>
#ifndef CUDNN_VERSION
#include <assert.h>
// Here we define the R2 API in terms of functions in the R1 interface
// This covers only the functions we use
static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
switch (err) {
case CUDNN_STATUS_SUCCESS:
return "The operation completed successfully.";
case CUDNN_STATUS_NOT_INITIALIZED:
return "The handle was not initialized(Is your driver recent enought?).";
case CUDNN_STATUS_ALLOC_FAILED:
return "Ressource allocation failed inside the library.";
case CUDNN_STATUS_BAD_PARAM:
return "An incorrect value was passed in.";
case CUDNN_STATUS_ARCH_MISMATCH:
return "The current GPU does not support the required features (only cc 3.0+ are supported).";
case CUDNN_STATUS_MAPPING_ERROR:
return "An access to GPU memory space failed (probably due to a failure to bind texture).";
case CUDNN_STATUS_EXECUTION_FAILED:
return "A kernel failed to execute.";
case CUDNN_STATUS_INTERNAL_ERROR:
return "An internal cuDNN operation failed.";
case CUDNN_STATUS_NOT_SUPPORTED:
return "The combination of parameters is not currently supported.";
default:
return "Unknown error code.";
}
}
// some macros to help support cudnn R1 while using R2 code.
#define cudnnCreateTensorDescriptor cudnnCreateTensor4dDescriptor
#define cudnnDestroyTensorDescriptor cudnnDestroyTensor4dDescriptor
#define cudnnSetFilter4dDescriptor cudnnSetFilterDescriptor
typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
static inline cudnnStatus_t
cudnnGetConvolution2dForwardOutputDim(
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t inputTensorDesc,
const cudnnFilterDescriptor_t filterDesc,
int *n,
int *c,
int *h,
int *w) {
return cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD,
n, c, h, w);
}
typedef int cudnnConvolutionFwdAlgo_t;
typedef int cudnnConvolutionFwdPreference_t;
#define CUDNN_CONVOLUTION_FWD_NO_WORKSPACE 0
static inline cudnnStatus_t
cudnnGetConvolutionForwardAlgorithm(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t destDesc,
cudnnConvolutionFwdPreference_t preference,
size_t memoryLimitInbytes,
cudnnConvolutionFwdAlgo_t *algo) {
*algo = 0;
return CUDNN_STATUS_SUCCESS;
}
static inline cudnnStatus_t
cudnnGetConvolutionForwardWorkspaceSize(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensor4dDescriptor_t destDesc,
cudnnConvolutionFwdAlgo_t algo,
size_t *sizeInBytes) {
*sizeInBytes = 0;
return CUDNN_STATUS_SUCCESS;
}
static inline cudnnStatus_t
cudnnConvolutionForward_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnFilterDescriptor_t filterDesc,
const void *filterData,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionFwdAlgo_t algo,
void *workSpace,
size_t workSpaceSizeInBytes,
const void *beta,
const cudnnTensorDescriptor_t destDesc,
void *destData) {
assert(*(float *)alpha == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionForward(handle, srcDesc, srcData,
filterDesc, filterData,
convDesc, destDesc, destData,
r);
}
#define cudnnConvolutionForward cudnnConvolutionForward_v2
static inline cudnnStatus_t
cudnnConvolutionBackwardFilter_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
const void *beta,
const cudnnFilterDescriptor_t gradDesc,
void *gradData) {
assert(*(float *)alpha == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
diffDesc, diffData,
convDesc, gradDesc, gradData,
r);
}
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
static inline cudnnStatus_t
cudnnConvolutionBackwardData_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnFilterDescriptor_t filterDesc,
const void *filterData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
const void *beta,
const cudnnTensorDescriptor_t gradDesc,
void *gradData) {
assert(*(float *)alpha == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
/* These casts are needed because the R1 function's parameters are not
   declared const. */
return cudnnConvolutionBackwardData(handle,
(cudnnFilterDescriptor_t)filterDesc,
filterData,
(cudnnTensorDescriptor_t)diffDesc,
diffData,
(cudnnConvolutionDescriptor_t)convDesc,
(cudnnTensorDescriptor_t)gradDesc,
gradData,
r);
}
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
// Needed for R2 rc2
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
#else
// r2 rc1 and rc2 do not have the same macro defined
// I didn't check whether this is the right combination, but since we do not wrap the padding interface, it is fine for now.
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ((cudnnPoolingMode_t)1)
#endif
#endif
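The wrappers above expose the cuDNN R2 convention dest = alpha * result + beta * dest on top of the R1 calls, which only support accumulate / no-accumulate, hence the asserts restricting alpha to 1.0 and beta to 0.0 or 1.0. A plain numpy sketch of that blending convention (an illustration only, not cuDNN itself):

import numpy

def blend(result, dest, alpha, beta):
    # cuDNN R2 convention: dest = alpha * result + beta * dest
    return alpha * result + beta * dest

res = numpy.ones((2, 2), dtype='float32')
dest = numpy.full((2, 2), 3, dtype='float32')
print(blend(res, dest, 1.0, 0.0))  # overwrite   -> CUDNN_RESULT_NO_ACCUMULATE
print(blend(res, dest, 1.0, 1.0))  # accumulate  -> CUDNN_RESULT_ACCUMULATE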
#section support_code
static cudnnHandle_t _handle = NULL;
static int
c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
cudnnDataType_t dt;
size_t ds;
switch (var->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d");
return -1;
}
ds = gpuarray_get_elsize(var->ga.typecode);
int str0, str1, str2, str3;
// cuDNN does not accept 0 strides (used for broadcastable dims); substitute C-contiguous defaults
str3 = PyGpuArray_STRIDES(var)[3]?PyGpuArray_STRIDES(var)[3]/ds:1;
str2 = PyGpuArray_STRIDES(var)[2]?PyGpuArray_STRIDES(var)[2]/ds:PyGpuArray_DIMS(var)[3];
str1 = PyGpuArray_STRIDES(var)[1]?PyGpuArray_STRIDES(var)[1]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3];
str0 = PyGpuArray_STRIDES(var)[0]?PyGpuArray_STRIDES(var)[0]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3]*PyGpuArray_DIMS(var)[1];
cudnnStatus_t err = cudnnSetTensor4dDescriptorEx(
desc, dt,
PyGpuArray_DIM(var, 0), PyGpuArray_DIM(var, 1),
PyGpuArray_DIM(var, 2), PyGpuArray_DIM(var, 3),
str0, str1, str2, str3);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set tensor4d descriptor: %s",
cudnnGetErrorString(err));
return -1;
}
return 0;
}
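As an aside, a small Python sketch (an illustration only, not part of the patch; fixed_strides is a hypothetical helper name) of the stride fix-up c_set_tensor4d performs above: zero strides, which pygpu uses for broadcastable dimensions, are replaced by C-contiguous defaults expressed in elements rather than bytes.

def fixed_strides(dims, strides_bytes, elsize):
    n, c, h, w = dims
    s0, s1, s2, s3 = strides_bytes
    return (s0 // elsize if s0 else c * h * w,
            s1 // elsize if s1 else h * w,
            s2 // elsize if s2 else w,
            s3 // elsize if s3 else 1)

# A (2, 1, 3, 3) float32 array broadcast along dim 1 (byte strides (36, 0, 12, 4)):
print(fixed_strides((2, 1, 3, 3), (36, 0, 12, 4), 4))  # -> (9, 9, 3, 1)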
static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
cudnnDataType_t dt;
if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
PyErr_SetString(PyExc_ValueError,
"Only contiguous filters (kernels) are supported.");
return -1;
}
switch (var->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
return -1;
}
cudnnStatus_t err = cudnnSetFilter4dDescriptor(
desc, dt,
PyGpuArray_DIMS(var)[0], PyGpuArray_DIMS(var)[1],
PyGpuArray_DIMS(var)[2], PyGpuArray_DIMS(var)[3]);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set filter descriptor: %s.",
cudnnGetErrorString(err));
return -1;
}
return 0;
}
#section init_code
{
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
#if PY_MAJOR_VERSION >= 3
return NULL;
#else
return;
#endif
}
}
#section support_code_struct
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
if (APPLY_SPECIFIC(kerns) != NULL)
cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
#section support_code_struct
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyGpuArrayObject *om,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
switch (input->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER,
pygpu_default_context()) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
#endif
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
{
size_t worksize;
gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
CONV_ALGO,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
/*
 * This is less than ideal since we need to free the workspace afterwards
 * (which introduces a synchronization point), but we don't have a module
 * in which to place a nice get_work_mem() function.
 */
if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
return 1;
}
}
err = cudnnConvolutionForward(
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, CONV_ALGO,
worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
if (worksize != 0)
c->ops->buffer_release(workspace);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
}
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
switch (im->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
#else
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER,
pygpu_default_context()) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*input, im))
return 1;
#endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
err = cudnnConvolutionBackwardData(
_handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc,
beta_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
return 1;
switch (input->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
#else
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER,
pygpu_default_context()) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*kerns, km))
return 1;
#endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
err = cudnnConvolutionBackwardFilter(
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc,
beta_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
......@@ -42,4 +42,9 @@ static PyGpuArrayObject *theano_try_copy(PyGpuArrayObject *out,
return out;
}
/* This is guaranteed to work and return the raw CUDA/OpenCL object on
* all recent (as of June 2015) versions of libgpuarray. This is also
* promised to keep working in future versions. */
#define PyGpuArray_DEV_DATA(ary) (*(void **)((ary)->ga.data))
#endif
......@@ -12,11 +12,13 @@ from theano import tensor, scalar, gof
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant
from .basic_ops import (host_from_gpu, gpu_from_host,
HostFromGpu, GpuFromHost,
......@@ -39,6 +41,10 @@ gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()
# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
......@@ -689,6 +695,9 @@ def local_gpu_conv(node):
out.values_eq_approx = values_eq_approx
return [out]
# Register this here so that it goes after 'local_gpu_conv'
register_opt()(conv_groupopt)
@register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda])
......
......@@ -7,10 +7,10 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, host_from_gpu
from .basic_ops import GpuFromHost, HostFromGpu
from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
_one = scal.constant(numpy.asarray(1.0, dtype='float64'))
def grab_cpu_scalar(v, nd):
......@@ -18,10 +18,10 @@ def grab_cpu_scalar(v, nd):
n = v.owner
if (isinstance(n.op, GpuDimShuffle) and
n.op.new_order == ('x',) * nd):
return host_from_gpu(n.inputs[0])
return grab_cpu_scalar(n.inputs[0])
elif (isinstance(n.op, DimShuffle) and
n.op.new_order == ('x',) * nd):
return n.inputs[0]
return grab_cpu_scalar(n.inputs[0])
elif isinstance(n.op, GpuFromHost):
return grab_cpu_scalar(n.inputs[0], nd=nd)
else:
......@@ -37,7 +37,7 @@ def find_node(v, cls, ignore_clients=False):
# that has the op class specified. If ignore_clients is False (the
# default) it will only dig through nodes that have a single
# client.
if v.owner is not None and (ignore_clients or v.clients == 1):
if v.owner is not None and (ignore_clients or len(v.clients) == 1):
if isinstance(v.owner.op, cls):
return v.owner
elif (isinstance(v.owner.op, GpuFromHost) and
......
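The len(v.clients) == 1 fix above matters because a clients list never compares equal to the integer 1, so the old test silently disabled digging through single-client nodes. A plain Python illustration (the client tuple shown is made up):

clients = [('some_apply_node', 0)]   # hypothetical single client
print(clients == 1)        # False: the old condition was never true
print(len(clients) == 1)   # True: the intended single-client check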
from __future__ import print_function
from nose.plugins.skip import SkipTest
import numpy
import unittest
import theano
import theano.tensor as T
......@@ -11,12 +12,13 @@ from theano.sandbox import gpuarray
# We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
from ..nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax)
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
"""
......@@ -290,3 +292,96 @@ def softmax_unittest_template(dtypeInput):
cmp(2, 10000)
cmp(128, 16 * 1024)
cmp(128, 64 * 1024)
class test_SoftMax(unittest.TestCase):
gpu_op = GpuSoftmax
mode = mode_wo_cudnn
def _test_softmax(
self,
x,
x_gpu,
f_z,
f_gpu_z,
cmp
):
"""
This is a basic test for GpuSoftmax and GpuDnnSoftmax.
We check that we loop when there are too many blocks,
and that we fall back to slower code when there isn't enough shared memory.
"""
f_z_out = f_z(x)
f_gpu_z_out = f_gpu_z(x_gpu)
f = theano.function([x], f_z_out, mode=mode_without_gpu)
f_gpu = theano.function([x_gpu], f_gpu_z_out, mode=self.mode)
self._check_types(f, f_gpu, T.nnet.Softmax, self.gpu_op)
# we need to test n>32*1024 to check that we make the block loop.
cmp(1, 5, f, f_gpu)
cmp(2, 5, f, f_gpu)
cmp(10, 5, f, f_gpu)
cmp(100, 5, f, f_gpu)
cmp(1000, 5, f, f_gpu)
cmp(10000, 5, f, f_gpu)
cmp(4074, 400, f, f_gpu)
cmp(784, 784, f, f_gpu)
cmp(4, 1000, f, f_gpu)
cmp(4, 1024, f, f_gpu)
cmp(4, 2000, f, f_gpu)
cmp(4, 2024, f, f_gpu)
# The GTX 285 doesn't have enough shared memory.
cmp(4, 4074, f, f_gpu)
# The GTX 580, 680 and Kepler cards don't have enough shared memory.
cmp(2, 10000, f, f_gpu)
cmp(128, 16 * 1024, f, f_gpu)
cmp(128, 64 * 1024, f, f_gpu)
# cudnn permits no more than 2^15 - 1 rows
cmp((2 << 15) - 1, 5, f, f_gpu)
cmp(5, 2 << 15, f, f_gpu)
return f, f_gpu
def _cmp(self, n, m, f, f_gpu):
# print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data)
gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
def _check_types(self, graph, graph_gpu, f_type, f_gpu_type):
assert isinstance(graph.maker.fgraph.toposort()[-1].op, f_type)
assert len([node for node in graph_gpu.maker.fgraph.toposort()
if isinstance(node.op, f_gpu_type)]) == 1
def test_softmax(self):
x = T.fmatrix('x')
z = T.nnet.softmax_op
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp
)
# cuDNN R1 cannot handle these test cases, but the Theano softmax can, so
# we test them only for the Theano softmax.
self._cmp(2 << 15, 5, f, f_gpu)
def test_softmax_shape_0(self):
x = T.fmatrix('x')
z = T.nnet.softmax_op
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp
)
# Theano can handle that case, but cudnn can't
self._cmp(0, 10, f, f_gpu)
......@@ -593,7 +593,7 @@ def get_scalar_constant_value(orig_v, elemwise=True,
# mess with the stabilization optimization and be too slow.
# We put all the scalar Ops used by get_canonical_form_slice()
# to allow it to determine the broadcast pattern correctly.
elif isinstance(v.owner.op, ScalarFromTensor):
elif isinstance(v.owner.op, (ScalarFromTensor, TensorFromScalar)):
return get_scalar_constant_value(v.owner.inputs[0])
elif isinstance(v.owner.op, scal.ScalarOp):
if isinstance(v.owner.op, scal.Second):
......
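A minimal sketch of what the added TensorFromScalar case enables, assuming scalar_from_tensor and tensor_from_scalar are exposed in theano.tensor.basic (an assumption, not shown in this diff): a constant round-tripped through the scalar/tensor conversion ops should still be recognized by get_scalar_constant_value.

import numpy
import theano.tensor as T
from theano.tensor import get_scalar_constant_value
from theano.tensor.basic import scalar_from_tensor, tensor_from_scalar  # assumed location

c = T.constant(numpy.asarray(3, dtype='int64'))
v = tensor_from_scalar(scalar_from_tensor(c))
# With the change above, the TensorFromScalar node is looked through as well:
assert get_scalar_constant_value(v) == 3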