Fix problems uncovered by test_basic_ops.py

32a13f0e · Arnaud Bergeron · dd51383f · 32a13f0e · 32a13f0e · 32a13f0e
--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
@@ -54,7 +54,7 @@ if pygpu:
        from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
                                GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
-        from .basic_ops import host_from_gpu, gpu_from_host
+        from .basic_ops import host_from_gpu, GpuFromHost
        from .elemwise import GpuElemwise
        from .subtensor import (GpuSubtensor, GpuIncSubtensor,
                                GpuAdvancedIncSubtensor1)

--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -181,7 +181,7 @@ class GpuKernelBase(object):
                dict(cname=k.codevar, code=code))
    def _generate_kernel_vars(self, k):
-        return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
+        return """GpuKernel %(kname)s;""" % dict(kname=k.objvar)
    def c_support_code(self):
        return """
@@ -215,11 +215,11 @@ class GpuKernelBase(object):
  if (GpuKernel_init(&%(ovar)s, %(ctx)s->ops, %(ctx)s->ctx, 1, &bcode, &sz,
                     "%(kname)s", %(numargs)u, types, GA_USE_BINARY, NULL)
      != GA_NO_ERROR) {
-    if ((%(err)s = GpuKernel_init(&%(ovar)s, %(ctx)s->ops, %(ctx)s->ctx, 1,
+    if ((err = GpuKernel_init(&%(ovar)s, %(ctx)s->ops, %(ctx)s->ctx, 1,
-                                  &%(cname)s, NULL, "%(kname)s", %(numargs)u,
+                              &%(cname)s, NULL, "%(kname)s", %(numargs)u,
-                                  types, %(flags)s, NULL)) != GA_NO_ERROR) {
+                              types, %(flags)s, NULL)) != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
-                   %(err)s, Gpu_error(%(ctx)s->ops, %(ctx)s->ctx, %(err)s));
+                   err, Gpu_error(%(ctx)s->ops, %(ctx)s->ctx, err));
      %(fail)s
    }
  }
@@ -371,7 +371,7 @@ class GpuFromHost(Op):
    def c_code(self, node, name, inputs, outputs, sub):
        return """
-        PyGpuArrayObject *%(name)s_tmp;
+        PyArrayObject *%(name)s_tmp;
        %(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
        if (%(name)s_tmp == NULL)
          %(fail)s
@@ -551,9 +551,9 @@ class GpuAlloc(HideC, Alloc):
                if (err != GA_NO_ERROR)
                {
                    PyErr_Format(PyExc_MemoryError,
-                                 "GpuAlloc: Error memsetting %%d"
+                                 "GpuAlloc: Error memsetting %%llu"
                                 " element of device memory to 0.",
-                                 PyGpuArray_SIZE(%(zz)s));
+                                 (unsigned long long)PyGpuArray_SIZE(%(zz)s));
                    %(fail)s;
                }
            }

--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
@@ -173,7 +173,8 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                        ("npy_float64", "ga_double"),
                        ]:
            kop = kop.replace(npy, ga)
-        return ElemwiseKernel(None, inps + outs, kop, preamble=support_code)
+        return ElemwiseKernel(self.get_context(node), inps + outs, kop,
+                              preamble=support_code)
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']
@@ -269,7 +270,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
        if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
        {
            PyErr_Format(PyExc_TypeError,
-                         "need %(nd)s dims, not %%i",
+                         "need %(nd)s dims, not %%u",
                         PyGpuArray_NDIM(%(iname)s));
            %(fail)s;
        }
@@ -282,11 +283,11 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Input dimension mis-match. Input"
-                             " %(idx)d (indices start at 0) has shape[%%i] == %%i"
+                             " %(idx)d (indices start at 0) has shape[%%d] == %%llu"
-                             ", but the output's size on that axis is %%i.",
+                             ", but the output's size on that axis is %%llu.",
                             i,
-                             PyGpuArray_DIMS(%(iname)s)[i],
+                             (unsigned long long)PyGpuArray_DIMS(%(iname)s)[i],
-                             dims[i]
+                             (unsigned long long)dims[i]
                            );
                %(fail)s;
            }
@@ -333,11 +334,11 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Output dimension mis-match. Output"
                             " %(idx)d (indices start at 0), working inplace"
-                             " on input %(input_idx)s, has shape[%%i] == %%i"
+                             " on input %(input_idx)s, has shape[%%i] == %%llu"
-                             ", but the output's size on that axis is %%i.",
+                             ", but the output's size on that axis is %%llu.",
                             i,
-                             PyGpuArray_DIMS(%(oname)s)[i],
+                             (unsigned long long)PyGpuArray_DIMS(%(oname)s)[i],
-                             dims[i]
+                             (unsigned long long)dims[i]
                            );
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
@@ -446,10 +447,12 @@ class GpuDimShuffle(HideC, DimShuffle):
    _f16_ok = True
    def make_node(self, input):
+        ctx_name = infer_context_name(input)
        res = DimShuffle.make_node(self, input)
        otype = GpuArrayType(dtype=res.outputs[0].type.dtype,
-                             broadcastable=res.outputs[0].type.broadcastable)
+                             broadcastable=res.outputs[0].type.broadcastable,
-        input = as_gpuarray_variable(input)
+                             context_name=ctx_name)
+        input = as_gpuarray_variable(input, ctx_name)
        return Apply(self, [input], [otype()])
    def __str__(self):
@@ -650,7 +653,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
    def get_context(self, node):
        return node.inputs[0].type.context
-    def perform(self, node, inp, out):
+    def perform(self, node, inp, out, ctx):
        raise MethodNotDefined("")
    def supports_c_code(self, inputs):
@@ -680,7 +683,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        inp = ['fake_input_name_%d' % i for i in xrange(len(inputs))]
        out = ['fake_output_name_%d' % i for i in xrange(len(node.outputs))]
-        sub = {'fail': 'fake failure code'}
+        sub = {'fail': 'fake failure code', 'context': 'fake context'}
        try:
            self.c_code(node, name, inp, out, sub)
@@ -715,7 +718,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        if (PyGpuArray_NDIM(%(x)s) != %(nd_in)s)
        {
            PyErr_Format(PyExc_TypeError,
-                         "required nd=%(nd_in)s, got nd=%%i", PyGpuArray_NDIM(%(x)s));
+                         "required nd=%(nd_in)s, got nd=%%u", PyGpuArray_NDIM(%(x)s));
            %(fail)s;
        }
        """ % locals(), file=sio)
@@ -1320,8 +1323,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                     (void *)%(z)s->ga.data,
                                     (void *)&%(z)s->ga.offset};
            if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
-                                " n_threads=%%lu, size=%%lu, ndim=%%d\\n",
+                                " n_threads=%%llu, size=%%llu, ndim=%%u\\n",
-                                n_threads,numEls,
+                                n_threads, numEls,
                                PyGpuArray_NDIM(%(x)s));
            size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
            int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
@@ -1503,9 +1506,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 4096), 1};
            if (verbose) {
              fprintf(stderr,
-                "running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n",
+                "running kernel_reduce_10_%(name)s n_blocks=(%%llu,%%llu)\\n",
-                n_blocks[0],
+                (unsigned long long)n_blocks[0],
-                n_blocks[1]);
+                (unsigned long long)n_blocks[1]);
            }
            assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
            size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0];

--- a/theano/sandbox/gpuarray/subtensor.py
+++ b/theano/sandbox/gpuarray/subtensor.py
@@ -215,7 +215,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        iadd_node = gop(xview, y).owner
        self.iadd_node = iadd_node
-    def perform(self, node, inputs, out_):
+    def perform(self, node, inputs, out_, ctx):
        out, = out_
        x, y = inputs[:2]
        indices = list(reversed(inputs[2:]))
@@ -326,7 +326,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
                                  %(view_ndim)s,
                                  dims,
                                  xview_strides,
-                                  %(x)s->ctx,
+                                  %(x)s->context,
                                  1,
                                  (PyObject *)%(x)s,
                                  (PyObject *)&PyGpuArrayType);
@@ -360,10 +360,10 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        """
        return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()
-    def c_support_code_apply(self, node, nodename):
+    def c_support_code_struct(self, node, nodename):
        gop = self.iadd_node.op
        sub_name = nodename + "_add_to_zview"
-        ret = gop.c_support_code_apply(self.iadd_node, sub_name)
+        ret = gop.c_support_code_struct(self.iadd_node, sub_name)
        ret += """
        PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
                                                    PyGpuArrayObject* src){
@@ -371,10 +371,11 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        """ % locals()
        inputs = ["dst", "src"]
        outputs = ["ret"]
-        sub = {"fail": "return NULL;", "context": "dst->ctx"}
+        sub = {"fail": "return NULL;", "context": "dst->context"}
        ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
        ret += """
            return ret;
        }
        """
        return ret
@@ -608,7 +609,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
    def get_context(self, node):
-        return self.node.outputs[0].type.context
+        return node.outputs[0].type.context
    def c_code_cache_version(self):
        return (6,)
@@ -617,6 +618,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
        return ['<numpy_compat.h>', '<gpuarray_helper.h>',
                '<gpuarray/types.h>']
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
    def c_code(self, node, name, inputs, outputs, sub):
        ctx = self.get_context(node)
        if ctx.kind != 'cuda':
@@ -754,7 +758,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
        return [Kernel(code=code, name=kname, params=params,
                       flags=flags, objvar=k_var)]
-    def c_support_code_apply(self, node, nodename):
+    def c_support_code_struct(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
@@ -765,7 +769,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
        itemsize_out = numpy.dtype(dtype_out).itemsize
        k_var = "k_vector_add_fast_" + nodename
-        return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """
+        return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
        int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
                                     PyGpuArrayObject* py_other,
                                     PyGpuArrayObject *indices_arr)