提交 8c9b612b authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Type context for dnn.py

上级 e4a14f54
......@@ -16,7 +16,8 @@ from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
from . import pygpu, init_dev
from .basic_ops import (as_gpuarray_variable,
from .type import get_context, gpu_context_type
from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, HostFromGpu,
GpuAllocEmpty, empty_like)
from .elemwise import GpuElemwise
......@@ -28,29 +29,14 @@ from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge, inplace_allocempty
def dnn_available():
if dnn_available.avail is not None:
return dnn_available.avail
if pygpu is None:
dnn_available.msg = "PyGPU not available"
dnn_available.avail = False
return False
if not init_dev.device.startswith('cuda'):
dnn_available.msg = "Not on a CUDA device. Got %s." % init_dev.device
dnn_available.avail = False
return False
# This is a hack because bin_id is in the form of
# "sm_<major><minor>" for cuda devices.
if pygpu.get_default_context().bin_id[:-2] < '30':
dnn_available.msg = "Device not supported by cuDNN"
dnn_available.avail = False
def _dnn_check_compile():
preambule = """
#include <stdio.h>
#include <cudnn.h>
#include <cudnn_helper.h>
"""
# No need for the context in here since we won't execute that code
body = """
cudnnHandle_t _handle = NULL;
cudnnStatus_t err;
......@@ -70,33 +56,64 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
exclusive mode, this causes bad detection.
comp, out, err = GCC_compiler.try_flags(
avail, out, err = GCC_compiler.try_flags(
params, preambule=preambule, body=body,
try_run=False, output=True)
dnn_available.avail = comp
if not dnn_available.avail:
dnn_available.msg = (
"Theano cannot compile with cuDNN. We got this error:\n" +
str(err))
else:
# If we can compile, check that we can import and run.
if not avail:
return False, ("Theano cannot compile with cuDNN. "
"We got this error:\n" + str(err))
return True, None
def _dnn_check_version():
v = version()
if v < 2000:
dnn_available.avail = False
dnn_available.msg = (
return False, (
"You have an old release of CuDNN (or a release candidate) "
"that isn't supported. Please update to at least v2 final "
"version.")
raise RuntimeError(dnn_available.msg)
if v >= 3000 and v < 3007:
dnn_available.avail = False
dnn_available.msg = (
return False, (
"You have installed a release candidate of CuDNN v3. This "
"isn't supported. Please update to v3 final version.")
return True, None
def dnn_available(context_name):
if dnn_available.avail is False:
return False
if pygpu is None:
dnn_available.msg = "PyGPU not available"
dnn_available.avail = False
return False
# If we haven't checked yet, check if we can compile.
if dnn_available.avail is None:
dnn_available.avail, dnn_available.msg = _dnn_check_compile()
if dnn_available.avail:
dnn_available.avail, dnn_available.msg = _dnn_check_version()
if not dnn_available.avail:
raise RuntimeError(dnn_available.msg)
if not dnn_available.avail:
return False
# Don't cache these checks since they depend on the context
ctx = get_context(context_name)
if not ctx.kind == 'cuda':
dnn_available.msg = "Not on a CUDA device."
return False
# This is a hack because bin_id is in the form of
# "<something>_<major><minor>" for cuda devices.
if ctx.bin_id[:-2] < '30':
dnn_available.msg = "Device not supported by cuDNN"
return False
return dnn_available.avail
return True
dnn_available.avail = None
dnn_available.msg = None
......@@ -110,6 +127,10 @@ class DnnBase(COp):
# dnn does not know about broadcasting, so we do not need to assert
# the input broadcasting pattern.
check_broadcast = False
context_type = gpu_context_type
def get_context(self, node):
return node.outputs[0].type.context
def __init__(self, files=None, c_func=None):
if files is None:
......@@ -181,7 +202,9 @@ def version():
This also does a check that the header version matches the runtime version.
"""
if not dnn_available():
if dnn_available.avail is None:
raise RuntimeError("called version() before dnn_available()")
if not dnn_available.avail:
raise Exception(
"We can't determine the cudnn version as it is not available",
dnn_available.msg)
......@@ -390,9 +413,10 @@ class GpuDnnConv(DnnBase):
return defs
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img)
kern = as_gpuarray_variable(kern)
output = as_gpuarray_variable(output)
ctx_name = infer_context_name(img, kern, output)
img = as_gpuarray_variable(img, ctx_name)
kern = as_gpuarray_variable(kern, ctx_name)
output = as_gpuarray_variable(output, ctx_name)
if img.type.ndim not in (4, 5):
raise TypeError('img must be 4D or 5D tensor')
if kern.type.ndim not in (4, 5):
......@@ -574,9 +598,10 @@ class GpuDnnConvGradW(DnnBase):
return defs
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img)
topgrad = as_gpuarray_variable(topgrad)
output = as_gpuarray_variable(output)
ctx_name = infer_context_name(img, topgrad, output)
img = as_gpuarray_variable(img, ctx_name)
topgrad = as_gpuarray_variable(topgrad, ctx_name)
output = as_gpuarray_variable(output, ctx_name)
if img.type.ndim not in (4, 5):
raise TypeError('img must be 4D or 5D tensor')
if topgrad.type.ndim not in (4, 5):
......@@ -689,9 +714,10 @@ class GpuDnnConvGradI(DnnBase):
return defs
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_gpuarray_variable(kern)
topgrad = as_gpuarray_variable(topgrad)
output = as_gpuarray_variable(output)
ctx_name = infer_context_name(kern, topgrad, output)
kern = as_gpuarray_variable(kern, ctx_name)
topgrad = as_gpuarray_variable(topgrad, ctx_name)
output = as_gpuarray_variable(output, ctx_name)
if kern.type.ndim not in (4, 5):
raise TypeError('kern must be 4D or 5D tensor')
if topgrad.type.ndim not in (4, 5):
......@@ -770,6 +796,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
warnings.warn("workmem is deprecated, use algo instead", stacklevel=2)
algo = workmem
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
ctx_name = infer_context_name(img, kerns)
if (border_mode == 'valid' and subsample == (1, 1) and
direction_hint == 'bprop weights'):
# Special case: We are asked to use GpuDnnConvGradW. We need to set
......@@ -782,12 +809,13 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = GpuAllocEmpty(img.dtype)(shape_i(kerns, 1, fgraph),
out = GpuAllocEmpty(img.dtype, ctx_name)(
shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3))
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
elif (border_mode == 'full' and subsample == (1, 1) and
direction_hint != 'forward!'):
......@@ -799,7 +827,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = GpuAllocEmpty(img.dtype)(shape_i(img, 0, fgraph),
out = GpuAllocEmpty(img.dtype, ctx_name)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
......@@ -817,7 +845,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
desc_op.subsample)
out = GpuAllocEmpty(img.dtype)(*out_shp)
out = GpuAllocEmpty(img.dtype, ctx_name)(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, out, desc)
......@@ -948,7 +976,7 @@ class GpuDnnPool(DnnBase):
DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")
def make_node(self, img, desc):
img = as_gpuarray_variable(img)
img = as_gpuarray_variable(img, infer_context_name(img))
if desc.owner is not None:
e_ndim = desc.owner.op.get_ndim() + 2
......@@ -1002,7 +1030,7 @@ class GpuDnnPoolGrad(DnnBase):
The input of the pooling.
out
The output of the pooling in the forward.
inp_grad
out_grad
Same size as out, but is the corresponding gradient information.
desc
The pooling descriptor.
......@@ -1016,9 +1044,10 @@ class GpuDnnPoolGrad(DnnBase):
"APPLY_SPECIFIC(dnn_pool_grad)")
def make_node(self, inp, out, out_grad, desc):
inp = as_gpuarray_variable(inp)
out_grad = as_gpuarray_variable(out_grad)
out = as_gpuarray_variable(out)
ctx_name = infer_context_name(inp, out, out_grad)
inp = as_gpuarray_variable(inp, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name)
out = as_gpuarray_variable(out, ctx_name)
if desc.owner is not None:
nd = desc.owner.op.get_ndim() + 2
......@@ -1147,7 +1176,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
c_func = "APPLY_SPECIFIC(softmax)"
def make_node(self, x):
x = as_gpuarray_variable(x)
x = as_gpuarray_variable(x, infer_context_name(x))
assert x.ndim == 4
return Apply(self, [x], [x.type()])
......@@ -1181,8 +1210,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
c_func = "APPLY_SPECIFIC(softmax_grad)"
def make_node(self, dy, sm):
dy = as_gpuarray_variable(dy)
sm = as_gpuarray_variable(sm)
ctx_name = infer_context_name(dy, sm)
dy = as_gpuarray_variable(dy, ctx_name)
sm = as_gpuarray_variable(sm, ctx_name)
assert dy.ndim == 4
assert sm.ndim == 4
return Apply(self, [dy, sm], [sm.type()])
......@@ -1191,9 +1221,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
# @register_opt('cudnn') # this optimizer is registered in opt.py instead.
@local_optimizer([GpuConv])
def local_conv_dnn(node):
if not dnn_available():
return
if isinstance(node.op, GpuConv):
if not dnn_available(node.outputs[0].type.context_name):
return
if node.op.border_mode not in ['full', 'valid']:
return
img, kern = node.inputs
......@@ -1211,9 +1241,9 @@ def local_conv_dnn(node):
# because for some input/kernel shape configurations, this is faster.
@local_optimizer([GpuConv])
def local_conv_dnn_alternative(node):
if not dnn_available():
return
if isinstance(node.op, GpuConv):
if not dnn_available(node.outputs[0].type.context_name):
return
border_mode = node.op.border_mode
subsample = node.op.subsample
if border_mode not in ['full', 'valid'] or subsample != (1, 1):
......@@ -1304,8 +1334,8 @@ def local_dnn_convi_output_merge(node, *inputs):
@register_opt('cudnn')
@op_lifter([DownsampleFactorMax])
def local_pool_dnn_alternative(node):
if not dnn_available():
def local_pool_dnn_alternative(node, ctx_name):
if not dnn_available(ctx_name):
return
if not node.op.ignore_border:
return
......@@ -1320,8 +1350,8 @@ def local_pool_dnn_alternative(node):
@register_opt('cudnn')
@op_lifter([MaxPoolGrad])
def local_pool_dnn_grad_stride(node):
if not dnn_available():
def local_pool_dnn_grad_stride(node, ctx_name):
if not dnn_available(ctx_name):
return
if not node.op.ignore_border:
return
......@@ -1340,8 +1370,8 @@ def local_pool_dnn_grad_stride(node):
@register_opt('cudnn')
@op_lifter([AveragePoolGrad])
def local_avg_pool_dnn_grad_stride(node):
if not dnn_available():
def local_avg_pool_dnn_grad_stride(node, ctx_name):
if not dnn_available(ctx_name):
return
if not node.op.ignore_border:
return
......@@ -1363,20 +1393,21 @@ def local_avg_pool_dnn_grad_stride(node):
@register_opt('cudnn')
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(node):
if not dnn_available():
return
if isinstance(node.op, GpuSoftmax):
if not dnn_available(node.outputs[0].type.context_name):
return
ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
ins = gpu_contiguous(ins)
out = GpuDnnSoftmax('accurate', 'channel')(ins)
out = as_gpuarray_variable(out.dimshuffle(0, 1))
out = as_gpuarray_variable(out.dimshuffle(0, 1), out.type.context_name)
return [out]
@register_opt('cudnn')
@local_optimizer([GpuElemwise])
def local_log_softmax_dnn(node):
if not dnn_available() or version() < 3000:
# This looks for GpuDnnSoftmax so we know that we have cudnn.
if version() < 3000:
# No log-softmax before cudnn v3
return
if (isinstance(node.op, GpuElemwise) and
......@@ -1395,7 +1426,14 @@ class NoCuDNNRaise(Optimizer):
Raise a RuntimeError if cudnn can't be used.
"""
if not dnn_available():
try:
dnn_available(None)
except ValueError:
# This is most likely due to get_context()
pass
# This means we will have a problem no matter what context.
if not dnn_available.avail:
# Make an assert error as we want Theano to fail, not
# just skip this optimization.
raise AssertionError(
......@@ -1408,8 +1446,8 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn')
@op_lifter([SoftmaxGrad])
def local_softmax_dnn_grad(node):
if not dnn_available():
def local_softmax_dnn_grad(node, ctx_name):
if not dnn_available(ctx_name):
return
ins = []
for n in node.inputs:
......
......@@ -107,14 +107,14 @@ cudnnHandle_t APPLY_SPECIFIC(_handle);
#section init_code_struct
{
cuda_enter(pygpu_default_context()->ctx);
cuda_enter(CONTEXT->ctx);
cudnnStatus_t err;
APPLY_SPECIFIC(_handle) = NULL;
if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
cuda_exit(pygpu_default_context()->ctx);
cuda_exit(CONTEXT->ctx);
FAIL;
}
cuda_exit(pygpu_default_context()->ctx);
cuda_exit(CONTEXT->ctx);
}
......@@ -5,12 +5,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyGpuArrayObject *om,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output) {
PyGpuArrayObject **output,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
......
......@@ -4,12 +4,12 @@ int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input) {
double alpha, double beta, PyGpuArrayObject **input,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
......
......@@ -4,12 +4,12 @@ int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns) {
double alpha, double beta, PyGpuArrayObject **kerns,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError,
......
......@@ -29,10 +29,10 @@ if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFI
int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cudnnPoolingDescriptor_t desc,
PyGpuArrayObject **out) {
PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnStatus_t err;
size_t dims[5];
PyGpuContextObject *c = pygpu_default_context();
if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
......
......@@ -53,9 +53,9 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyGpuArrayObject *out,
PyGpuArrayObject *out_grad,
cudnnPoolingDescriptor_t desc,
PyGpuArrayObject **inp_grad) {
PyGpuArrayObject **inp_grad,
PyGpuContextObject *c) {
cudnnStatus_t err;
PyGpuContextObject *c = pygpu_default_context();
if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
......@@ -81,7 +81,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
PyGpuArray_DIMS(inp), inp->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0) {
GA_C_ORDER, c) != 0) {
return 1;
}
......
......@@ -34,9 +34,9 @@ if (APPLY_SPECIFIC(output) != NULL)
#section support_code_struct
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
PyGpuArrayObject **out) {
PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnStatus_t err;
PyGpuContextObject *c = pygpu_default_context();
if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
return 1;
......
......@@ -45,9 +45,9 @@ if (APPLY_SPECIFIC(dx) != NULL)
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
PyGpuArrayObject *sm,
PyGpuArrayObject **dx) {
PyGpuArrayObject **dx,
PyGpuContextObject *c) {
cudnnStatus_t err;
PyGpuContextObject *c = pygpu_default_context();
if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
return 1;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论