Type context for elemwise.py

a28251c6 · Arnaud Bergeron · 58371141 · a28251c6
--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
@@ -20,8 +20,8 @@ try:
 except ImportError:
    pass
-from .basic_ops import (as_gpuarray_variable, HideC,
+from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
-                        GpuKernelBase, Kernel)
+                        infer_context_name)
 from .type import GpuArrayType
 from .fp16_help import load_w, write_w
@@ -67,12 +67,14 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)
    def make_node(self, *inputs):
+        ctx_name = infer_context_name(*inputs)
        res = Elemwise.make_node(self, *inputs)
        outputs = [GpuArrayType(broadcastable=o.type.broadcastable,
+                                context_name=ctx_name,
                                dtype=o.type.dtype)() for o in res.outputs]
        if len(outputs) > 1:
            raise NotImplementedError()
-        inputs = [as_gpuarray_variable(i) for i in inputs]
+        inputs = [as_gpuarray_variable(i, ctx_name) for i in inputs]
        node = Apply(self, inputs, outputs)
        # Try to generate the kernel to catch SupportCodeErrors
@@ -99,6 +101,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        return node
+    def get_context(self, node):
+        return node.inputs[0].type.context
    def generate_kernel(self, node, nodename):
        inps = [make_argument(i, 'i%d' % (n,)) for n, i in
                enumerate(node.inputs)]
@@ -177,8 +182,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        return self.scalar_op.c_support_code()
    def _gpu_kernel_code(self, node, nodename):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
        # This is useless by itself, but will serve an eventual c_code
        # implementation
        k = self.generate_kernel(node, nodename)
@@ -191,8 +194,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        return '\n'.join(res)
    def gpu_kernels(self, node, nodename):
-        if pygpu.get_default_context().kind == 'opencl':
-            raise MethodNotDefined('cuda only')
        src = self._gpu_kernel_code(node, nodename)
        nd = node.outputs[0].ndim
        params = ['uintp']
@@ -214,12 +215,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                       objvar='elem_%d_%s' % (nd, nodename))]
    def c_code(self, node, name, inputs, outputs, sub):
-        if pygpu.get_default_context().kind == 'opencl':
+        if node.inputs[0].type.context.kind != 'cuda':
            raise MethodNotDefined('cuda only')
        nd = node.outputs[0].ndim
        fail = sub["fail"]
        initial_dims = ','.join('1' for i in xrange(nd))
        opname = str(self.scalar_op)
+        ctx = sub['context']
        # check that all inputs have valid dimensions
        emitted_inames = {}
@@ -264,7 +266,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
            if iname in emitted_inames:
                continue
            code += """
-        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
        {
            PyErr_Format(PyExc_TypeError,
@@ -279,7 +280,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                 PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
                (dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
            {
-                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Input dimension mis-match. Input"
                             " %(idx)d (indices start at 0) has shape[%%i] == %%i"
@@ -314,15 +314,11 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        {
            %(oname)s = pygpu_empty(%(nd)d, dims,
                            %(typecode)s, GA_C_ORDER,
-                            pygpu_default_context(), Py_None);
+                            %(ctx)s, Py_None);
            if (!%(oname)s) {
-                        //TODO, this check don't seam good.
+                %(fail)s
-                        //TODO, set exception?
-                            %(fail)s
            }
        }
-        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
-        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
            else:
                input_idx = self.inplace_pattern[idx]
@@ -348,8 +344,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                %(fail)s;
            }
        }
-        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
-        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
        z = outputs[0]
        code += """numEls = PyGpuArray_SIZE(%(z)s);
@@ -367,7 +361,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        if (threads_per_block * n_blocks < numEls)
            threads_per_block = std::min(numEls/n_blocks, (size_t) 256);
-                //std::cerr << "calling callkernel returned\\n";
        """ % locals()
        kname = 'elem_%d_%s' % (nd, name)
@@ -588,7 +581,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
    quite possible that the GPU might be slower for some cases.
    """
+    __props__ = ('axis', 'reduce_mask', 'dtype', 'acc_dtype', 'scalar_op',
+                 'pre_scalar_op')
    _f16_ok = True
    def __init__(self, scalar_op, axis=None,
@@ -607,24 +601,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        if pre_scalar_op:
            assert pre_scalar_op.nin == 1
-    def __eq__(self, other):
-        return (type(self) == type(other) and
-                self.axis == other.axis and
-                self.reduce_mask == other.reduce_mask and
-                self.dtype == other.dtype and
-                self.acc_dtype == other.acc_dtype and
-                self.scalar_op == other.scalar_op and
-                self.pre_scalar_op == other.pre_scalar_op)
-    def __hash__(self):
-        return (hash(type(self)) ^
-                hash(self.axis) ^
-                hash(self.reduce_mask) ^
-                hash(self.dtype) ^
-                hash(self.acc_dtype) ^
-                hash(type(self.scalar_op)) ^
-                hash(type(self.pre_scalar_op)))
    def __str__(self):
        pre = ""
        if self.pre_scalar_op:
@@ -641,7 +617,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            self.pre_scalar_op = None
    def make_node(self, x):
-        x = as_gpuarray_variable(x)
+        x = as_gpuarray_variable(x, infer_context_name(x))
+        if x.type.context.kind != 'cuda':
+            raise TypeError("GpuCAReduceCuda doesn't work for non-cuda devices")
        ret = super(GpuCAReduceCuda, self).make_node(x)
        self = copy.copy(self)
        self.axis = ret.op.axis
@@ -666,7 +644,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                "complex" in self._acc_dtype(x.dtype)):
            raise NotImplementedError("We don't support complex in gpu reduction")
        return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype,
-                                              ret.outputs[0].type.broadcastable)()])
+                                              ret.outputs[0].type.broadcastable,
+                                              context_name=x.type.context_name)()])
+    def get_context(self, node):
+        return node.inputs[0].type.context
    def perform(self, node, inp, out):
        raise MethodNotDefined("")
@@ -1914,7 +1896,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        version = [17]  # the version corresponding to the c code in this Op
        # now we insert versions for the ops on which we depend...
-        version.extend(self.scalar_op.c_code_cache_version())
+        scalar_node = Apply(
+            self.scalar_op,
+            [Scalar(dtype=input.type.dtype)() for input in node.inputs],
+            [Scalar(dtype=output.type.dtype)() for output in node.outputs])
+        version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node))
        for i in node.inputs + node.outputs:
            version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
        if all(version):
@@ -2676,8 +2662,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        # cache the kernel object
        self.get_kernel_cache(node)
-        return super(GpuCAReduceCPY, self).make_thunk(node, storage_map,
+        return super(GpuCAReduceCPY, self).make_thunk(
-                                                      compute_map, no_recycling)
+            node, storage_map, compute_map, no_recycling)
    def get_kernel_cache(self, node):
        attr = '@cache_reduction_k'
@@ -2776,33 +2762,33 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
                    j += 1
            code += """
         if (need_out) {
-             %(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, pygpu_default_context(), Py_None);
+             %(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, %(ctx)s, Py_None);
             if (!%(output)s) {
                 %(fail)s
             }
         }
        """ % dict(output=output, nd_out=nd_out, fail=sub['fail'],
+                   ctx=sub['context'],
                   out_type=dtype_to_typecode(node.outputs[0].type.dtype))
        else:
            code += """
        if (%(output)s == NULL || %(output)s->ga.nd != 0) {
            Py_XDECREF(%(output)s);
            %(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
-                                     pygpu_default_context(), Py_None);
+                                     %(ctx)s, Py_None);
            if (!%(output)s) {
                %(fail)s
            }
        }
-        """ % dict(output=output, fail=sub['fail'],
+        """ % dict(output=output, fail=sub['fail'], ctx=sub['context'],
                   out_type=dtype_to_typecode(node.outputs[0].type.dtype))
        if acc_dtype != node.outputs[0].type.dtype:
            code += """
        tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
-                          %(acc_type)s, GA_C_ORDER, pygpu_default_context(),
+                          %(acc_type)s, GA_C_ORDER, %(ctx)s, Py_None);
-                          Py_None);
        if (!tmp) %(fail)s
-        """ % dict(output=output, fail=sub['fail'],
+        """ % dict(output=output, fail=sub['fail'], ctx=sub['context'],
                   acc_type=dtype_to_typecode(acc_dtype))
        else:
            code += """
@@ -2893,7 +2879,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
            reduce_expr = "a * b"
        else:
            raise NotImplementedError()
-        return ReductionKernel(pygpu.get_default_context(), odtype,
+        return ReductionKernel(node.inputs[0].type.context, odtype,
                               self.scalar_op.identity, reduce_expr, redux,
                               arguments=[make_argument(node.inputs[0], 'a')],
                               init_nd=node.inputs[0].ndim)