Major adaptation to handle explicit context activation.

3a59bd8c · Arnaud Bergeron · babe6f1b · 3a59bd8c · 3a59bd8c · 3a59bd8c
--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
--- a/theano/sandbox/gpuarray/dnn_base.c
+++ b/theano/sandbox/gpuarray/dnn_base.c
 #section support_code
-static cudnnHandle_t _handle = NULL;

 static int
 c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
@@ -99,15 +98,21 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {

 #section init_code

-{
-  cudnnStatus_t err;
-  if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
+setup_ext_cuda();
+
+#section support_code_struct
+
+cudnnHandle_t _handle;
+
+#section init_code_struct
+
+cuda_enter(pygpu_default_context()->ctx);
+cudnnStatus_t err;
+_handle = NULL;
+if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
               cudnnGetErrorString(err));
-#if PY_MAJOR_VERSION >= 3
-    return NULL;
-#else
-    return;
-#endif
-  }
+  cuda_exit(pygpu_default_context()->ctx);
+  FAIL;
 }
+cuda_exit(pygpu_default_context()->ctx);
--- a/theano/sandbox/gpuarray/dnn_fwd.c
+++ b/theano/sandbox/gpuarray/dnn_fwd.c
@@ -10,6 +10,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
+  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
@@ -43,8 +44,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  Py_INCREF(*output);
 #else
  if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
-                         om->ga.typecode, GA_C_ORDER,
-                         pygpu_default_context()) != 0)
+                         om->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*output, om))
    return 1;
@@ -55,6 +55,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,

  cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;

+  cuda_enter(c->ctx);
 #ifdef CHOOSE_ALGO
  /* Static variables are only initialized once so this will not
   * reset the previous algo every time */
@@ -86,6 +87,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
      return 1;
    }
    algo = choice.algo;
@@ -96,6 +98,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU: %s\n",
                   cudaGetErrorString(err2));
+      cuda_exit(c->ctx);
      return 1;
    }

@@ -107,6 +110,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
      return 1;
    }
 #endif
@@ -145,6 +149,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
      return 1;
    }

@@ -167,6 +172,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                      "are padded such that the padded inputs are larger "
                      "than the kernels. Update your installation of CuDNN "
                      "to V3 or more recent to solve the issue.");
+      cuda_exit(c->ctx);
      return 1;
    }
  }
@@ -175,7 +181,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  {
    size_t worksize;
    gpudata *workspace;
-    PyGpuContextObject *c;
    err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
@@ -187,6 +192,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
      PyErr_Format(PyExc_RuntimeError,
                   "error getting worksize: %s",
                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
      return 1;
    }

@@ -196,11 +202,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
     * to place a nice get_work_mem() function in.
     */
    if (worksize != 0) {
-      c = pygpu_default_context();
      workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
      if (workspace == NULL) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Could not allocate working memory");
+        cuda_exit(c->ctx);
        return 1;
      }
    }
@@ -218,6 +224,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    if (worksize != 0)
      c->ops->buffer_release(workspace);
  }
+  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",

--- a/theano/sandbox/gpuarray/dnn_gi.c
+++ b/theano/sandbox/gpuarray/dnn_gi.c
@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
+  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
@@ -42,8 +43,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  Py_INCREF(*input);
 #else
  if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
-                         im->ga.typecode, GA_C_ORDER,
-                         pygpu_default_context()) != 0)
+                         im->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*input, im))
    return 1;
@@ -54,6 +54,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,

  cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;

+  cuda_enter(c->ctx);
+
 #ifdef CHOOSE_ALGO
  static int reuse_algo = 0;
  static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
@@ -83,6 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
      return 1;
    }

@@ -94,6 +97,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
      cudaGetLastError();
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
                   "information on the GPU: %s\n", cudaGetErrorString(err2));
+      cuda_exit(c->ctx);
      return 1;
    }

@@ -104,6 +108,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
      return 1;
    }
 #endif
@@ -136,6 +141,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
      return 1;
    }

@@ -149,7 +155,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,

  size_t worksize;
  gpudata *workspace;
-  PyGpuContextObject *c;

  err = cudnnGetConvolutionBackwardDataWorkspaceSize(
    _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
@@ -158,15 +163,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                 cudnnGetErrorString(err));
+    cuda_exit(c->ctx);
    return 1;
  }

  if (worksize != 0) {
-    c = pygpu_default_context();
    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "Could not allocate working memory");
+      cuda_exit(c->ctx);
      return 1;
    }
  }
@@ -183,6 +189,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  if (worksize != 0)
    c->ops->buffer_release(workspace);

+  cuda_exit(c->ctx);
+
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));

--- a/theano/sandbox/gpuarray/dnn_gw.c
+++ b/theano/sandbox/gpuarray/dnn_gw.c
@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
+  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
@@ -42,8 +43,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  Py_INCREF(*kerns);
 #else
  if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
-                         km->ga.typecode, GA_C_ORDER,
-                         pygpu_default_context()) != 0)
+                         km->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*kerns, km))
    return 1;
@@ -54,6 +54,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,

  cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;

+  cuda_enter(c->ctx);
+
 #ifdef CHOOSE_ALGO
  static int reuse_algo = 0;
  static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;
@@ -84,6 +86,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
      return 1;
    }

@@ -95,6 +98,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
      cudaGetLastError();
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
                   "information on the GPU: %s\n", cudaGetErrorString(err2));
+      cuda_exit(c->ctx);
      return 1;
    }

@@ -106,6 +110,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
      return 1;
    }
 #endif
@@ -138,6 +143,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
      return 1;
    }

@@ -151,7 +157,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,

  size_t worksize;
  gpudata *workspace;
-  PyGpuContextObject *c;

  err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
    _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
@@ -160,14 +165,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                 cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
    return 1;
  }

  if (worksize != 0) {
-    c = pygpu_default_context();
    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
+      cuda_exit(c->ctx);
      return 1;
    }
  }
@@ -184,6 +190,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  if (worksize != 0)
    c->ops->buffer_release(workspace);

+  cuda_exit(c->ctx);
+
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));

--- a/theano/sandbox/gpuarray/dnn_pool.c
+++ b/theano/sandbox/gpuarray/dnn_pool.c
+#section support_code_struct
+
+/* cuDNN tensor descriptors for the pooling input and output. */
+cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
+
+#section init_code_struct
+
+cudnnStatus_t APPLY_SPECIFIC(err);
+
+/* Start from NULL so the cleanup section only destroys descriptors that
+ * were actually created. */
+APPLY_SPECIFIC(input) = NULL;
+APPLY_SPECIFIC(output) = NULL;
+
+APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input));
+if (APPLY_SPECIFIC(err) != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
+               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
+  FAIL;
+}
+
+APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output));
+if (APPLY_SPECIFIC(err) != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
+               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
+  FAIL;
+}
+
+#section cleanup_code_struct
+
+/* Release whichever descriptors the init section managed to create. */
+if (APPLY_SPECIFIC(input) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
+if (APPLY_SPECIFIC(output) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
+
+
+#section support_code_struct
+
+/*
+ * Run cuDNN forward pooling on `img` as described by `desc`.
+ *
+ * img:  C-contiguous input array; its rank must equal the number of
+ *       pooling dimensions reported by `desc` (2 or 3) plus two.
+ * desc: pooling descriptor; mode, window, padding and stride are read
+ *       back from it to compute the output shape.
+ * out:  output array, prepared through theano_prep_output().
+ *
+ * Returns 0 on success, or 1 with a Python exception set.
+ */
+int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
+                             cudnnPoolingDescriptor_t desc,
+                             PyGpuArrayObject **out) {
+  cudnnStatus_t err;
+  size_t dims[5];
+  PyGpuContextObject *c = pygpu_default_context();
+
+  if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
+    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
+    return 1;
+  }
+
+  if (c_set_tensorNd(img, APPLY_SPECIFIC(input)) != 0)
+    return 1;
+
+  cudnnPoolingMode_t mode;
+  int w[3];
+  int p[3];
+  int s[3];
+  int ndims;
+
+  err = cudnnGetPoolingNdDescriptor(desc, 3, &mode, &ndims, w, p, s);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "error doing cudnnGetPoolingDescriptor operation: %s",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+
+  /* Only three window/padding/stride entries were requested and dims[]
+   * has five slots, so only 2D and 3D pooling can be handled here. */
+  if (ndims != 2 && ndims != 3) {
+    PyErr_Format(PyExc_ValueError,
+                 "GpuDnnPool: unsupported number of pooling dimensions: %d",
+                 ndims);
+    return 1;
+  }
+
+  /* The shape computation below reads PyGpuArray_DIM(img, i) for every
+   * i < ndims + 2; reject inputs whose rank does not match. */
+  if (PyGpuArray_NDIM(img) != (unsigned int)(ndims + 2)) {
+    PyErr_Format(PyExc_ValueError,
+                 "GpuDnnPool: expected a %d-dimensional input, got %u",
+                 ndims + 2, (unsigned int)PyGpuArray_NDIM(img));
+    return 1;
+  }
+
+  /* Batch and channel sizes are kept; each pooled axis becomes
+   * (size + 2*pad - window) / stride + 1. */
+  dims[0] = PyGpuArray_DIM(img, 0);
+  dims[1] = PyGpuArray_DIM(img, 1);
+  dims[2] = (PyGpuArray_DIM(img, 2) + (p[0]*2) - w[0]) / s[0] + 1;
+  dims[3] = (PyGpuArray_DIM(img, 3) + (p[1]*2) - w[1]) / s[1] + 1;
+  if (ndims == 3)
+    dims[4] = (PyGpuArray_DIM(img, 4) + (p[2]*2) - w[2]) / s[2] + 1;
+
+  if (theano_prep_output(out, ndims+2, dims, img->ga.typecode,
+                         GA_C_ORDER, c) != 0)
+    return 1;
+
+  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
+    return 1;
+
+  {
+    const float alpha = 1;
+    const float beta = 0;
+
+    /* The cuDNN call is bracketed by cuda_enter()/cuda_exit() on the
+     * context `c`. */
+    cuda_enter(c->ctx);
+    err = cudnnPoolingForward(
+      _handle, desc,
+      &alpha,
+      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
+      &beta,
+      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));
+    cuda_exit(c->ctx);
+  }
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "GpuDnnPool: error doing cudnnPoolingForward operation: %s",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/dnn_pool_grad.c
+++ b/theano/sandbox/gpuarray/dnn_pool_grad.c
+#section support_code_struct
+
+/* cuDNN tensor descriptors for the four arrays used by the pooling
+ * gradient. */
+cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);
+
+#section init_code_struct
+
+/* Start from NULL so the cleanup section only destroys descriptors that
+ * were actually created. */
+APPLY_SPECIFIC(input) = NULL;
+APPLY_SPECIFIC(input_grad) = NULL;
+APPLY_SPECIFIC(output) = NULL;
+APPLY_SPECIFIC(output_grad) = NULL;
+
+{
+  cudnnStatus_t err;
+
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError,
+                 "could not allocate tensor descriptor (input): %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input_grad));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError,
+                 "could not allocate tensor descriptor (input_grad): %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError,
+                 "could not allocate tensor descriptor (output): %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output_grad));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError,
+                 "could not allocate tensor descriptor (output_grad): %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+}
+
+#section cleanup_code_struct
+
+/* Release whichever descriptors the init section managed to create. */
+if (APPLY_SPECIFIC(input) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
+if (APPLY_SPECIFIC(input_grad) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad));
+if (APPLY_SPECIFIC(output) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
+if (APPLY_SPECIFIC(output_grad) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad));
+
+
+#section support_code_struct
+
+/*
+ * Run cuDNN backward pooling.
+ *
+ * inp:      C-contiguous input that was given to the forward pooling.
+ * out:      C-contiguous output of the forward pooling.
+ * out_grad: C-contiguous gradient with respect to `out`.
+ * desc:     pooling descriptor passed through to cuDNN.
+ * inp_grad: receives the gradient with respect to `inp`; prepared with the
+ *           shape of `inp` and the typecode of `out`.
+ *
+ * Returns 0 on success, or 1 with a Python exception set.
+ */
+int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
+                                  PyGpuArrayObject *out,
+                                  PyGpuArrayObject *out_grad,
+                                  cudnnPoolingDescriptor_t desc,
+                                  PyGpuArrayObject **inp_grad) {
+  cudnnStatus_t err;
+  PyGpuContextObject *c = pygpu_default_context();
+
+  if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
+    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
+    return 1;
+  }
+
+  if (!GpuArray_IS_C_CONTIGUOUS(&out_grad->ga)) {
+    PyErr_SetString(PyExc_ValueError, "Only contiguous input gradients are supported.");
+    return 1;
+  }
+
+  if (!GpuArray_IS_C_CONTIGUOUS(&out->ga)) {
+    PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
+    return 1;
+  }
+
+  if (c_set_tensorNd(inp, APPLY_SPECIFIC(input)) != 0)
+    return 1;
+  if (c_set_tensorNd(out_grad, APPLY_SPECIFIC(output_grad)) != 0)
+    return 1;
+  if (c_set_tensorNd(out, APPLY_SPECIFIC(output)) != 0)
+    return 1;
+
+  /* Prepare the result with the same context object `c` that is entered
+   * for the cuDNN call below, rather than looking the default context up
+   * a second time. */
+  if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
+                         PyGpuArray_DIMS(inp), out->ga.typecode,
+                         GA_C_ORDER, c) != 0) {
+    return 1;
+  }
+
+  if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
+    return 1;
+
+  {
+    const float alpha = 1;
+    const float beta = 0;
+
+    /* The cuDNN call is bracketed by cuda_enter()/cuda_exit() on the
+     * context `c`. */
+    cuda_enter(c->ctx);
+    err = cudnnPoolingBackward(
+      _handle, desc,
+      &alpha,
+      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
+      APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
+      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(inp),
+      &beta,
+      APPLY_SPECIFIC(input_grad), PyGpuArray_DEV_DATA(*inp_grad)
+      );
+    cuda_exit(c->ctx);
+  }
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s.",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/dnn_softmax.c
+++ b/theano/sandbox/gpuarray/dnn_softmax.c
+#section support_code_struct
+
+/* cuDNN tensor descriptors for the softmax input and output. */
+cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
+
+#section init_code_struct
+
+/* Start from NULL so the cleanup section only destroys descriptors that
+ * were actually created. */
+APPLY_SPECIFIC(input) = NULL;
+APPLY_SPECIFIC(output) = NULL;
+
+{
+  cudnnStatus_t err;
+
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+}
+
+#section cleanup_code_struct
+
+/* Release whichever descriptors the init section managed to create. */
+if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
+if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
+
+
+#section support_code_struct
+
+/*
+ * Compute the cuDNN softmax of `x` into `*out`.
+ *
+ * x:   input array.
+ * out: output array, prepared with the shape and typecode of `x`.
+ *
+ * The algorithm and mode come from the SOFTMAX_ALGO and SOFTMAX_MODE
+ * macros.  Returns 0 on success, or 1 with a Python exception set.
+ */
+int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
+                            PyGpuArrayObject **out) {
+  PyGpuContextObject *gpu_ctx = pygpu_default_context();
+  const float scale_result = 1.;
+  const float scale_prior = 0.;
+  cudnnStatus_t status;
+
+  if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
+    return 1;
+  if (theano_prep_output(out, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
+                         x->ga.typecode, GA_C_ORDER, gpu_ctx) != 0)
+    return 1;
+  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
+    return 1;
+
+  /* The cuDNN call is bracketed by cuda_enter()/cuda_exit() on the
+   * context. */
+  cuda_enter(gpu_ctx->ctx);
+  status = cudnnSoftmaxForward(_handle, SOFTMAX_ALGO, SOFTMAX_MODE,
+                               (void *)&scale_result,
+                               APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(x),
+                               (void *)&scale_prior,
+                               APPLY_SPECIFIC(output),
+                               PyGpuArray_DEV_DATA(*out));
+  cuda_exit(gpu_ctx->ctx);
+
+  if (status == CUDNN_STATUS_SUCCESS)
+    return 0;
+
+  PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
+               cudnnGetErrorString(status));
+  return 1;
+}
--- a/theano/sandbox/gpuarray/dnn_softmax_grad.c
+++ b/theano/sandbox/gpuarray/dnn_softmax_grad.c
+#section support_code_struct
+
+/* cuDNN tensor descriptors for the softmax gradient arrays (dy, sm, out). */
+cudnnTensorDescriptor_t APPLY_SPECIFIC(dy);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(sm);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(out);
+
+#section init_code_struct
+
+/* Start from NULL so the cleanup section only destroys descriptors that
+ * were actually created. */
+APPLY_SPECIFIC(dy) = NULL;
+APPLY_SPECIFIC(sm) = NULL;
+APPLY_SPECIFIC(out) = NULL;
+
+{
+  cudnnStatus_t err;
+
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dy))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(sm))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(out))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+}
+
+#section cleanup_code_struct
+
+/* Release whichever descriptors the init section managed to create. */
+if (APPLY_SPECIFIC(dy) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dy)); }
+if (APPLY_SPECIFIC(sm) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(sm)); }
+if (APPLY_SPECIFIC(out) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(out)); }
+
+
+#section support_code_struct
+
+/*
+ * Compute the cuDNN softmax gradient into `*out`.
+ *
+ * dy:  first input array; `*out` takes its shape and typecode.
+ * sm:  second input array; handed to cudnnSoftmaxBackward ahead of `dy`.
+ * out: output array.
+ *
+ * The algorithm and mode come from the SOFTMAX_ALGO and SOFTMAX_MODE
+ * macros.  Returns 0 on success, or 1 with a Python exception set.
+ */
+int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
+                                 PyGpuArrayObject *sm,
+                                 PyGpuArrayObject **out) {
+  PyGpuContextObject *gpu_ctx = pygpu_default_context();
+  const float scale_result = 1.;
+  const float scale_prior = 0.;
+  cudnnStatus_t status;
+
+  if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
+    return 1;
+  if (c_set_tensorNd(sm, APPLY_SPECIFIC(sm)) != 0)
+    return 1;
+  if (theano_prep_output(out, PyGpuArray_NDIM(dy), PyGpuArray_DIMS(dy),
+                         dy->ga.typecode, GA_C_ORDER, gpu_ctx) != 0)
+    return 1;
+  if (c_set_tensorNd(*out, APPLY_SPECIFIC(out)) != 0)
+    return 1;
+
+  /* The cuDNN call is bracketed by cuda_enter()/cuda_exit() on the
+   * context. */
+  cuda_enter(gpu_ctx->ctx);
+  status = cudnnSoftmaxBackward(_handle, SOFTMAX_ALGO, SOFTMAX_MODE,
+                                (void *)&scale_result,
+                                APPLY_SPECIFIC(sm), PyGpuArray_DEV_DATA(sm),
+                                APPLY_SPECIFIC(dy), PyGpuArray_DEV_DATA(dy),
+                                (void*) &scale_prior,
+                                APPLY_SPECIFIC(out),
+                                PyGpuArray_DEV_DATA(*out));
+  cuda_exit(gpu_ctx->ctx);
+
+  if (status == CUDNN_STATUS_SUCCESS)
+    return 0;
+
+  PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
+               cudnnGetErrorString(status));
+  return 1;
+}
--- a/theano/sandbox/gpuarray/tests/test_dnn.py
+++ b/theano/sandbox/gpuarray/tests/test_dnn.py
@@ -207,11 +207,10 @@ def test_pooling():
                            (32, 1, 147, 197),
                            ]:
                    data = numpy.random.normal(0, 1, shp).astype("float32")
-                    a = f1(data).__array__()
+                    a = f1(data)
+                    b = f2(data)

-                    b = f2(data).__array__()
-                    assert numpy.allclose(a, b,
-                                          atol=numpy.finfo(numpy.float32).eps)
+                    utt.assert_allclose(a, b)

        # Test the grad
        for shp in [(1, 1, 2, 2),
@@ -228,7 +227,7 @@ def test_pooling():
            def fn(x):
                return max_pool_2d(x, (ws, ws), ignore_border=True,
                                   padding=pad, mode=mode)
-            theano.tests.unittest_tools.verify_grad(fn, [data],
+            utt.verify_grad(fn, [data],
                            cast_to_output_type=False,
                            mode=mode_with_gpu)
            # Confirm that the opt would have inserted it.
@@ -245,8 +244,7 @@ def test_pooling():
                    pad=pad,
                    mode=mode)
                return dnn_op
-            theano.tests.unittest_tools.verify_grad(
-                fn, [data],
+            utt.verify_grad(fn, [data],
                            cast_to_output_type=False,
                            mode=mode_with_gpu)
            # Confirm that we get the good op.
@@ -256,7 +254,7 @@ def test_pooling():
                        for node in fg.maker.fgraph.toposort()])
            g_out = fg(data)

-            # Compare again the CPU result
+            # Compare against the CPU result
            out = max_pool_2d(x, (ws, ws),
                              padding=pad,
                              ignore_border=True, mode=mode)
@@ -269,7 +267,7 @@ def test_pooling():
                assert any([isinstance(node.op, AveragePoolGrad)
                            for node in fc.maker.fgraph.toposort()])
            c_out = fc(data)
-            assert numpy.allclose(c_out, g_out)
+            utt.assert_allclose(c_out, g_out)


 def test_pooling_opt():
@@ -703,7 +701,7 @@ class test_SoftMax(test_nnet.test_SoftMax):

            out = f(data)
            gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
-            assert numpy.allclose(out, gout), numpy.absolute(out - gout)
+            utt.assert_allclose(out, gout)

        x = T.matrix('x', 'float32')
        x_gpu = T.tensor4('x_gpu', 'float32')