Commit f5ceb43d authored by Arnaud Bergeron

Some fixes from tests.

Parent 175d3b15
......@@ -243,10 +243,13 @@ class GpuKernelBase(object):
cleanups = '\n'.join(self._generate_kernel_cleanup(k) for k in kernels)
return cleanups
def _GpuKernelBase_version(self):
return (3,)
# This is a shorthand for if your op only has a fixed version
# You can reimplement it, but make sure to call kernel_version()
def c_code_cache_version_apply(self, node):
return (self.c_code_cache_version(), self.kernel_version(node))
GpuKernelBase_version = property(_GpuKernelBase_version)
def kernel_version(self, node):
return (3, node.get_context().bin_id)
class HostFromGpu(Op):
......@@ -1044,4 +1047,4 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
return s
def c_code_cache_version(self):
return (5, self.GpuKernelBase_version)
return (5,)
......@@ -1408,14 +1408,16 @@ def local_softmax_dnn(node):
@local_optimizer([GpuElemwise])
def local_log_softmax_dnn(node):
# This looks for GpuDnnSoftmax so we know that we have cudnn.
if version() < 3000:
# No log-softmax before cudnn v3
return
if (isinstance(node.op, GpuElemwise) and
isinstance(node.op.scalar_op, Log) and
node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, GpuDnnSoftmax) and
len(node.inputs[0].clients) == 1):
# Don't move this call to version outside the condition, it
# needs to be here.
if version() < 3000:
# No log-softmax before cudnn v3
return
softmax_node = node.inputs[0].owner
new_softmax = GpuDnnSoftmax('log', softmax_node.op.mode)
return [new_softmax(softmax_node.inputs[0])]
......
......@@ -776,7 +776,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Py_XDECREF(%(z)s);
%(z)s = pygpu_empty(%(nd_out)s, new_dims,
%(out_typecode)s, GA_C_ORDER,
pygpu_default_context(), Py_None);
%(ctx)s, Py_None);
if (NULL == %(z)s)
{
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
......@@ -1896,7 +1896,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_cache_version_apply(self, node):
version = [17] # the version corresponding to the c code in this Op
version = [18] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(
......@@ -1906,6 +1906,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node))
for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
version.extend(self.kernel_version(node))
if all(version):
return tuple(version)
else:
......
......@@ -14,7 +14,8 @@ PyGpuArrayObject *rand_buf;
int gemm16(PyGpuArrayObject *C, float alpha,
PyGpuArrayObject *A, PyGpuArrayObject *B,
float beta, PyGpuArrayObject **out) {
float beta, PyGpuArrayObject **out,
PyGpuContextObject *c) {
PyGpuArrayObject *_A = NULL;
PyGpuArrayObject *_B = NULL;
GpuKernel *gk;
......
......@@ -145,7 +145,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
@opt.register_opt()
@opt.op_lifter([tensor.Dot])
def local_dot_to_gemm16(node):
def local_dot_to_gemm16(node, ctx_name):
if nerv is None:
return
A = node.inputs[0]
......@@ -153,7 +153,6 @@ def local_dot_to_gemm16(node):
if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph
ctx_name = infer_context_name(A, B)
C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0)
......
......@@ -969,7 +969,7 @@ def local_scan_to_gpua(node, context_name):
context_name=context_name)
nw_op = scan_op.Scan(scan_ins, scan_outs, info,
typebuild=typebuild).make_node(*nw_ins)
typeConstructor=typebuild).make_node(*nw_ins)
return nw_op.outputs
......
......@@ -320,7 +320,8 @@ def inplace_allocempty(op, idx):
if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1):
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype)
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype,
alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs)
return opt
......
......@@ -57,7 +57,8 @@ def rand_gpuarray(*shape, **kwargs):
cls = kwargs.pop('cls', None)
if len(kwargs) != 0:
raise TypeError('Unexpected argument %s', list(kwargs.keys())[0])
return gpuarray.array(r, dtype=dtype, cls=cls)
return gpuarray.array(r, dtype=dtype, cls=cls,
context=get_context(test_ctx_name))
def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
......
......@@ -14,8 +14,8 @@ from theano import tensor
from theano.tests.unittest_tools import seed_rng
# We let that import do the init of the back-end if needed.
from .config import mode_with_gpu
from ..type import GpuArrayType
from .config import mode_with_gpu, test_ctx_name
from ..type import GpuArrayType, get_context
from ..conv import GpuConv
from theano.sandbox.gpuarray import dnn
......@@ -28,7 +28,7 @@ try:
except ImportError:
pass
gftensor4 = GpuArrayType('float32', [False] * 4)
gftensor4 = GpuArrayType('float32', [False] * 4, context_name=test_ctx_name)
def py_conv_valid_numpy(img, kern):
......@@ -135,8 +135,8 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
npy_kern = -(theano._asarray(numpy.arange(
numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
img = pygpu.array(npy_img)
kern = pygpu.array(npy_kern)
img = pygpu.array(npy_img, context=get_context(test_ctx_name))
kern = pygpu.array(npy_kern, context=get_context(test_ctx_name))
# we take the stride after the transfer as we make c_contiguous
# data on the GPU.
......
......@@ -116,7 +116,7 @@ class test_GpuCAReduceCPY(test_elemwise.test_CAReduce):
def test_infer_shape(self):
for dtype in self.dtypes:
super(test_GpuCAReduceCPY, self).test_infer_shape(self, dtype)
super(test_GpuCAReduceCPY, self).test_infer_shape(dtype)
class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
......@@ -195,7 +195,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
op = GpuCAReduceCuda
reds = [scalar.add, scalar.mul,
scalar.maximum, scalar.minimum]
pre_scalar_op = scalar.sqr
pre_scalar_op = None
def test_perform(self):
return
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment