Commit 4b94a811 authored by Arnaud Bergeron

Create the cudnn handle on context initialization and pass it around.

Parent 3f536fb7
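For orientation, the change works like this: init_dev() now creates one cuDNN handle per GPU context (via the new dnn._make_handle()) and stores it in that context's property dictionary, and the cuDNN ops later fetch the shared handle from those properties instead of each op creating and destroying its own handle in its C struct code. A minimal sketch of that flow, assuming a plain dict as a stand-in for the per-context property store and a make_handle callback standing in for dnn._make_handle(context); only the property names ('dev', 'cudnn_version', 'cudnn_handle') and the overall shape mirror the diff below:

# Illustrative sketch only -- not the real Theano code, just the flow this commit sets up.
_context_props = {}  # stand-in for the per-context property store behind _get_props()

def _get_props(name):
    return _context_props.setdefault(name, {})

def init_dev(dev, name, make_handle):
    # make_handle stands in for dnn._make_handle(context).
    ctx_props = _get_props(name)
    ctx_props['dev'] = dev
    if dev.startswith('cuda') and 'cudnn_handle' not in ctx_props:
        # One cuDNN handle per context, created once at initialization.
        ctx_props['cudnn_handle'] = make_handle()

def get_params(name):
    # What each cuDNN op now does: reuse the context's cached handle.
    return _get_props(name)['cudnn_handle']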
@@ -26,7 +26,7 @@ except ImportError:
 # This is for documentation not to depend on the availability of pygpu
 from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                    GpuArraySharedVariable, gpuarray_shared_constructor,
-                   reg_context, get_context, ContextNotDefined)
+                   reg_context, get_context, ContextNotDefined, _get_props)
 from .basic_ops import as_gpuarray_variable
 from . import fft, dnn, opt, nerv, extra_ops, multinomial
@@ -89,19 +89,24 @@ def init_dev(dev, name=None):
               (name, dev, context.devname),
               file=sys.stderr)
     pygpu_activated = True
+    ctx_props = _get_props(name)
+    ctx_props['dev'] = dev
     if dev.startswith('cuda'):
-        try:
-            cudnn_version = dnn.version()
-            # 5200 should not print warning with cudnn 5.1 final.
-            if cudnn_version >= 5200:
-                warnings.warn("Your cuDNN version is more recent than Theano."
-                              " If you see problems, try updating Theano or"
-                              " downgrading cuDNN to version 5.1.")
-            if config.print_active_device:
-                print("Using cuDNN version %d on context %s" %
-                      (cudnn_version, name), file=sys.stderr)
-        except Exception:
-            pass
+        if 'cudnn_version' not in ctx_props:
+            try:
+                ctx_props['cudnn_version'] = dnn.version()
+                # 5200 should not print warning with cudnn 5.1 final.
+                if ctx_props['cudnn_version'] >= 5200:
+                    warnings.warn("Your cuDNN version is more recent than "
+                                  "Theano. If you encounter problems, try "
+                                  "updating Theano or downgrading cuDNN to "
+                                  "version 5.1.")
+                if config.print_active_device:
+                    print("Using cuDNN version %d on context %s" %
+                          (ctx_props['cudnn_version'], name), file=sys.stderr)
+                ctx_props['cudnn_handle'] = dnn._make_handle(context)
+            except Exception:
+                pass
 
 # This maps things like 'cuda0' to the context object on that device.
 init_dev.devmap = {}
...
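Other code can read these context properties back later. A hedged sketch of such a lookup: get_prop is the accessor imported from the same .type module in the next file's diff, and KeyError is the exception it raises for a missing key there; the helper function itself and the absolute import path are hypothetical.

# Hypothetical helper, for illustration only (import path assumed).
from theano.gpuarray.type import get_prop

def context_cudnn_version(context_name):
    try:
        return get_prop(context_name, 'cudnn_version')
    except KeyError:
        return None  # cuDNN was not detected for this context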
@@ -26,7 +26,8 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
 from theano.tensor.signal.pool import (
     Pool, MaxPoolGrad, AveragePoolGrad)
 from . import pygpu
-from .type import get_context, gpu_context_type, list_contexts
+from .type import (get_context, gpu_context_type, list_contexts,
+                   get_prop, set_prop)
 from .basic_ops import (as_gpuarray_variable, infer_context_name,
                         gpu_contiguous, gpu_alloc_empty,
                         empty_like, GpuArrayType)
@@ -42,6 +43,44 @@ from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, u
 from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
 
+try:
+    from pygpu import gpuarray
+except ImportError:
+    pass
+
+
+def _dnn_lib():
+    if _dnn_lib.handle is None:
+        import ctypes.util
+
+        lib_name = ctypes.util.find_library('cudnn')
+        if lib_name is None and sys.platform == 'win32':
+            # Update these names when new versions of cudnn are supported.
+            lib_name = ctypes.util.find_library('cudnn64_5.dll')
+            if lib_name is None:
+                lib_name = ctypes.util.find_library('cudnn64_4.dll')
+        if lib_name is None:
+            raise RuntimeError('Could not find cudnn library')
+        _dnn_lib.handle = ctypes.cdll.LoadLibrary(lib_name)
+        cudnn = _dnn_lib.handle
+        cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
+        cudnn.cudnnCreate.restype = ctypes.c_int
+        cudnn.cudnnDestroy.argtypes = [ctypes.c_void_p]
+        cudnn.cudnnDestroy.restype = ctypes.c_int
+    return _dnn_lib.handle
+
+_dnn_lib.handle = None
+
+
+def _make_handle(ctx):
+    cudnn = _dnn_lib()
+    handle = ctypes.c_void_p()
+    with ctx:
+        err = cudnn.cudnnCreate(ctypes.byref(handle))
+        if err != 0:
+            raise RuntimeError("error creating cudnn handle")
+    return handle
+
+
 def raise_no_cudnn(msg="cuDNN is required for convolution and pooling"):
     raise RuntimeError(msg)
@@ -144,6 +183,12 @@ def dnn_available(context_name):
 dnn_available.msg = None
 
+handle_type = CDataType('cudnnHandle_t', 'cudnnDestroy',
+                        headers=['cudnn.h'],
+                        header_dirs=[config.dnn.include_path],
+                        libraries=['cudnn'],
+                        lib_dirs=[config.dnn.library_path])
+
 
 class DnnBase(COp):
@@ -154,10 +199,20 @@ class DnnBase(COp):
     # dnn does not know about broadcasting, so we do not need to assert
     # the input broadcasting pattern.
     check_broadcast = False
-    params_type = gpu_context_type
+    params_type = handle_type
+
+    def dnn_context(self, node):
+        return node.outputs[0].type.context_name
 
     def get_params(self, node):
-        return node.outputs[0].type.context
+        try:
+            return get_prop(self.dnn_context(node), 'cudnn_handle_param')
+        except KeyError:
+            pass
+        ptr = get_prop(self.dnn_context(node), 'cudnn_handle').value
+        res = handle_type.make_value(ptr)
+        set_prop(self.dnn_context(node), 'cudnn_handle_param', res)
+        return res
 
     def __init__(self, files=None, c_func=None):
         if files is None:
@@ -165,9 +220,10 @@ class DnnBase(COp):
         COp.__init__(self, ["dnn_base.c"] + files, c_func)
 
     def c_headers(self):
-        return ['cudnn.h', 'cudnn_helper.h', 'gpuarray_helper.h',
-                'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/util.h',
-                'gpuarray/ext_cuda.h', 'gpuarray_api.h', 'numpy_compat.h']
+        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/kernel.h',
+                'gpuarray/util.h', 'gpuarray/ext_cuda.h', 'gpuarray_api.h',
+                'numpy_compat.h', 'cudnn.h', 'cudnn_helper.h',
+                'gpuarray_helper.h']
 
     def c_header_dirs(self):
         return [os.path.dirname(__file__), pygpu.get_include(),
@@ -183,7 +239,7 @@ class DnnBase(COp):
         return ['-Wl,-rpath,' + config.dnn.library_path]
 
     def c_code_cache_version(self):
-        return (super(DnnBase, self).c_code_cache_version(), version())
+        return (super(DnnBase, self).c_code_cache_version(), version(), 1)
 
 
 class DnnVersion(Op):
...
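One detail worth noting in DnnBase.get_params() above: the raw ctypes handle stored by init_dev ('cudnn_handle') is wrapped once per context into a handle_type value ('cudnn_handle_param') and then reused, so every apply of every cuDNN op on a given context shares the same params object. A hedged sketch of that caching pattern, with stand-in names rather than Theano APIs:

# Illustrative caching pattern only; `wrap` stands in for handle_type.make_value()
# and the dict stands in for the per-context property store (get_prop/set_prop).
_wrapped = {}

def shared_handle(context_name, raw_ptr, wrap):
    # Wrap the raw cudnnHandle_t pointer once and cache it for this context,
    # mirroring the 'cudnn_handle' -> 'cudnn_handle_param' step in get_params().
    if context_name not in _wrapped:
        _wrapped[context_name] = wrap(raw_ptr)
    return _wrapped[context_name]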
@@ -149,41 +149,3 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
 #section init_code
 
 setup_ext_cuda();
-
-#section support_code_struct
-
-PyGpuContextObject *ctx;
-cudnnHandle_t APPLY_SPECIFIC(_handle);
-
-#section init_code_struct
-
-{
-  // We need to keep a reference here to have it available in the destructor.
-  ctx = PARAMS;
-  Py_INCREF(ctx);
-
-  cuda_enter(PARAMS->ctx);
-
-  cudnnStatus_t err;
-  APPLY_SPECIFIC(_handle) = NULL;
-
-  if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
-                 cudnnGetErrorString(err));
-    cuda_exit(PARAMS->ctx);
-    FAIL;
-  }
-
-  if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
-                            cuda_get_stream(PARAMS->ctx))) != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "Could not set cudnn stream: %s",
-                 cudnnGetErrorString(err));
-    cuda_exit(PARAMS->ctx);
-    FAIL;
-  }
-  cuda_exit(PARAMS->ctx);
-}
-
-#section cleanup_code_struct
-
-cuda_enter(ctx->ctx);
-cudnnDestroy(APPLY_SPECIFIC(_handle));
-cuda_exit(ctx->ctx);
-Py_DECREF((PyObject *)ctx);
@@ -3,7 +3,9 @@
 int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                      PyGpuArrayObject *bias, npy_float64 epsilon,
                      PyGpuArrayObject **outp, PyGpuArrayObject **x_mean,
-                     PyGpuArrayObject **x_invstd, PyGpuContextObject *c) {
+                     PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) {
+  PyGpuContextObject *c = inp->context;
+
   if (c_set_tensorNd(inp, bn_input) != 0)
     return 1;
   if (c_set_tensorNd(scale, bn_params) != 0)
@@ -37,7 +39,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
     beta = (void *)&fbeta;
   }
   cudnnStatus_t err = cudnnBatchNormalizationForwardTraining(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     MODE,
     alpha,
     beta,
...
@@ -24,7 +24,9 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
                        PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
                        PyGpuArrayObject *x_invstd, npy_float64 epsilon,
                        PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
-                       PyGpuArrayObject **dbias, PyGpuContextObject *c) {
+                       PyGpuArrayObject **dbias, cudnnHandle_t _handle) {
+  PyGpuContextObject *c = inp->context;
+
   if (c_set_tensorNd(inp, bn_input) != 0)
     return 1;
   if (c_set_tensorNd(doutp, bn_doutput) != 0)
@@ -66,7 +68,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
     betaParam = (void *)&fbeta;
   }
   cudnnStatus_t err = cudnnBatchNormalizationBackward(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     MODE,
     alphaData,
     betaData,
...
@@ -3,7 +3,9 @@
 int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                      PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
                      PyGpuArrayObject *est_var, npy_float64 epsilon,
-                     PyGpuArrayObject **outp, PyGpuContextObject *c) {
+                     PyGpuArrayObject **outp, cudnnHandle_t _handle) {
+  PyGpuContextObject *c = inp->context;
+
   if (c_set_tensorNd(inp, bn_input) != 0)
     return 1;
   if (c_set_tensorNd(scale, bn_params) != 0)
@@ -33,7 +35,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
     beta = (void *)&fbeta;
   }
   cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     MODE,
     alpha,
     beta,
...
@@ -26,11 +26,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                          cudnnConvolutionDescriptor_t desc,
                          double alpha, double beta,
                          PyGpuArrayObject **output,
-                         PyGpuContextObject *c) {
-  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
-  float af = alpha, bf = beta;
+                         cudnnHandle_t _handle) {
+  PyGpuContextObject *c = input->context;
   void *alpha_p;
   void *beta_p;
+  float af = alpha, bf = beta;
+  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
 
   if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
     PyErr_SetString(PyExc_ValueError,
@@ -92,7 +93,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
     int count;
     cudnnConvolutionFwdAlgoPerf_t choice;
     err = cudnnFindConvolutionForwardAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
+      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
       desc, APPLY_SPECIFIC(output), 1, &count, &choice);
 
     if (err != CUDNN_STATUS_SUCCESS) {
@@ -115,7 +116,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
     }
     err = cudnnGetConvolutionForwardAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
+      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
       desc, APPLY_SPECIFIC(output),
       CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
     if (err != CUDNN_STATUS_SUCCESS) {
@@ -198,7 +199,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
   {
     size_t worksize;
     gpudata *workspace;
-    err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
+    err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
                                                   APPLY_SPECIFIC(input),
                                                   APPLY_SPECIFIC(kerns),
                                                   desc,
@@ -211,7 +212,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
       // TODO: Print a warning
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-      err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
+      err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
                                                     APPLY_SPECIFIC(input),
                                                     APPLY_SPECIFIC(kerns),
                                                     desc,
@@ -248,7 +249,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
   cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnConvolutionForward(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     alpha_p,
     APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
     APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
...
@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                         PyGpuArrayObject *im,
                         cudnnConvolutionDescriptor_t desc,
                         double alpha, double beta, PyGpuArrayObject **input,
-                        PyGpuContextObject *c) {
-  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
-  float af = alpha, bf = beta;
+                        cudnnHandle_t _handle) {
+  PyGpuContextObject *c = kerns->context;
   void *alpha_p;
   void *beta_p;
+  float af = alpha, bf = beta;
+  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
 
   if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
     PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
     cudnnConvolutionBwdDataAlgoPerf_t choice;
     err = cudnnFindConvolutionBackwardDataAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
+      _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
       APPLY_SPECIFIC(input), 1, &count, &choice);
 
     if (err != CUDNN_STATUS_SUCCESS) {
@@ -116,7 +117,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
     }
     err = cudnnGetConvolutionBackwardDataAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
+      _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
       desc, APPLY_SPECIFIC(input),
       CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
     if (err != CUDNN_STATUS_SUCCESS) {
@@ -193,7 +194,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
   gpudata *workspace;
   err = cudnnGetConvolutionBackwardDataWorkspaceSize(
-    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
+    _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
     APPLY_SPECIFIC(input), algo, &worksize);
   if (err != CUDNN_STATUS_SUCCESS) {
@@ -218,7 +219,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
   cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnConvolutionBackwardData(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     alpha_p,
     APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
     APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
...
@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                         PyGpuArrayObject *km,
                         cudnnConvolutionDescriptor_t desc,
                         double alpha, double beta, PyGpuArrayObject **kerns,
-                        PyGpuContextObject *c) {
-  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
-  float af = alpha, bf = beta;
+                        cudnnHandle_t _handle) {
+  PyGpuContextObject *c = input->context;
   void *alpha_p;
   void *beta_p;
+  float af = alpha, bf = beta;
+  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
 
   if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
     PyErr_SetString(PyExc_ValueError,
@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
     cudnnConvolutionBwdFilterAlgoPerf_t choice;
     err = cudnnFindConvolutionBackwardFilterAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
+      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
      APPLY_SPECIFIC(kerns), 1, &count, &choice);
 
     if (err != CUDNN_STATUS_SUCCESS) {
@@ -117,7 +118,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
     }
     err = cudnnGetConvolutionBackwardFilterAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
+      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
       desc, APPLY_SPECIFIC(kerns),
       CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
     if (err != CUDNN_STATUS_SUCCESS) {
@@ -181,7 +182,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
   gpudata *workspace;
   err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
-    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
+    _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
     APPLY_SPECIFIC(kerns), algo, &worksize);
   if (err != CUDNN_STATUS_SUCCESS) {
@@ -205,7 +206,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
   cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnConvolutionBackwardFilter(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     alpha_p,
     APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
     APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
...
@@ -42,9 +42,10 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
                              PyArrayObject *stride,
                              PyArrayObject *pad,
                              PyGpuArrayObject **out,
-                             PyGpuContextObject *c) {
-  cudnnStatus_t err;
+                             cudnnHandle_t _handle) {
+  PyGpuContextObject *c = img->context;
   size_t dims[5];
+  cudnnStatus_t err;
 
   if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
     PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
@@ -122,7 +123,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
   cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnPoolingForward(
-    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
+    _handle, APPLY_SPECIFIC(pool),
     alpha,
     APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
     beta,
...
@@ -64,7 +64,8 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                   PyArrayObject *stride,
                                   PyArrayObject *pad,
                                   PyGpuArrayObject **inp_grad,
-                                  PyGpuContextObject *c) {
+                                  cudnnHandle_t _handle) {
+  PyGpuContextObject *c = inp->context;
   cudnnStatus_t err;
 
   if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
@@ -153,7 +154,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
   cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnPoolingBackward(
-    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
+    _handle, APPLY_SPECIFIC(pool),
     alpha,
     APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
     APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
...
@@ -35,7 +35,8 @@ if (APPLY_SPECIFIC(output) != NULL)
 int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
                             PyGpuArrayObject **out,
-                            PyGpuContextObject *c) {
+                            cudnnHandle_t _handle) {
+  PyGpuContextObject *c = x->context;
   cudnnStatus_t err;
 
   if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
@@ -77,7 +78,7 @@ int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
   cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnSoftmaxForward(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     SOFTMAX_ALGO,
     SOFTMAX_MODE,
     alpha,
...
@@ -46,7 +46,8 @@ if (APPLY_SPECIFIC(dx) != NULL)
 int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
                                  PyGpuArrayObject *sm,
                                  PyGpuArrayObject **dx,
-                                 PyGpuContextObject *c) {
+                                 cudnnHandle_t _handle) {
+  PyGpuContextObject *c = dy->context;
   cudnnStatus_t err;
 
   if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
@@ -91,7 +92,7 @@ int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
   cuda_wait((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
   err = cudnnSoftmaxBackward(
-    APPLY_SPECIFIC(_handle),
+    _handle,
     SOFTMAX_ALGO,
     SOFTMAX_MODE,
     alpha,
...