testgroup / pytensor · Commits

Commit a536464a
Authored Apr 19, 2016 by Frédéric Bastien

Merge pull request #4323 from abergeron/gpua_newelem

Use the new GpuElemwise from libgpuarray

Parents: 57ffd6a0, 0dbb97c6

Showing 5 changed files with 176 additions and 299 deletions.
theano/sandbox/gpuarray/__init__.py  (+1 −1)
theano/sandbox/gpuarray/elemwise.py  (+98 −179)
theano/sandbox/gpuarray/subtensor.py  (+37 −73)
theano/sandbox/gpuarray/tests/test_elemwise.py  (+1 −23)
theano/tensor/tests/test_elemwise.py  (+39 −23)
theano/sandbox/gpuarray/__init__.py

```diff
@@ -42,7 +42,7 @@ register_transfer(transfer)
 def init_dev(dev, name=None):
     v = pygpu.gpuarray.api_version()
-    if v[0] != -10000:
+    if v[0] != -9999:
         raise RuntimeError("Wrong major API version for gpuarray:", v[0],
                            "Make sure Theano and libgpuarray/pygpu "
                            "are in sync.")
```
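The only functional change in this file is the libgpuarray API generation that Theano expects. A minimal sketch of the guard as it reads after the commit (assuming `pygpu` is importable; the constant comes straight from the hunk above):

```python
import pygpu

# pygpu reports the libgpuarray API generation it was built against;
# after this commit Theano requires -9999 (it was -10000 before).
v = pygpu.gpuarray.api_version()
if v[0] != -9999:
    raise RuntimeError("Wrong major API version for gpuarray:", v[0],
                       "Make sure Theano and libgpuarray/pygpu are in sync.")
```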
theano/sandbox/gpuarray/elemwise.py
浏览文件 @
a536464a
from
__future__
import
absolute_import
,
print_function
,
division
import
copy
from
theano.compat
import
izip
import
numpy
import
theano
from
theano
import
Apply
,
scalar
,
config
from
theano
import
scalar
as
scal
from
theano
import
Apply
,
scalar
,
config
,
Op
from
six.moves
import
StringIO
,
xrange
from
theano.gof.utils
import
MethodNotDefined
from
theano.scalar
import
Scalar
...
...
```diff
@@ -14,41 +12,20 @@ from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
 try:
     import pygpu
     from pygpu import gpuarray
-    from pygpu.tools import ScalarArg, ArrayArg
-    from pygpu.elemwise import ElemwiseKernel
+    from pygpu.tools import ArrayArg
     from pygpu.reduction import ReductionKernel
-    from pygpu.gpuarray import dtype_to_typecode, dtype_to_ctype
+    from pygpu.gpuarray import dtype_to_typecode
 except ImportError:
     pass

 from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
                         infer_context_name)
-from .type import GpuArrayType
+from .type import GpuArrayType, gpu_context_type
 from .fp16_help import load_w, write_w


-def _is_scalar(v):
-    False
-
-
-def make_argument(v, name):
-    if _is_scalar(v):
-        return ScalarArg(numpy.dtype(v.type.dtype), name)
-    else:
-        return ArrayArg(numpy.dtype(v.type.dtype), name)
-
-
-def ensure_allocated(storage, shape, dtype, ctx):
-    odat = storage[0]
-    if odat is not None:
-        if odat.shape != shape:
-            # It is unsafe to try to resize odat,
-            # we have to allocate output storage.
-            odat = None
-    if odat is None:
-        odat = pygpu.empty(shape, dtype=dtype, context=ctx)
-        storage[0] = odat
-    return odat
-    return ArrayArg(numpy.dtype(v.type.dtype), name)


 def as_C_string_const(s):
```
```diff
@@ -56,11 +33,12 @@ def as_C_string_const(s):
                      for l in s.split('\n'))


-class GpuElemwise(GpuKernelBase, HideC, Elemwise):
+class GpuElemwise(HideC, Elemwise):
     """
     Elemwise on the GPU.

     """
+    params_type = gpu_context_type
     nin = property(lambda self: self.scalar_op.nin)
     nout = property(lambda self: self.scalar_op.nout)
     _f16_ok = True
```
```diff
@@ -109,20 +87,21 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
     def get_params(self, node):
         return node.inputs[0].type.context

-    def generate_kernel(self, node, nodename):
-        inps = [make_argument(i, 'i%d' % (n,)) for n, i in
-                enumerate(node.inputs)]
-        scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
+    def _get_vnames(self, node):
+        inps = ['i%d' % (n,) for n, _ in enumerate(node.inputs)]
+        outs = ['o%d' % (n,) for n, _ in enumerate(node.outputs)
+                if n not in self.inplace_pattern]
+        return inps, outs

-        outs = [make_argument(o, 'o%d' % (n,)) for n, o in
-                enumerate(node.outputs) if n not in self.inplace_pattern]
+    def _generate_op_string(self, node):
+        scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
+        scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
+        inps, outs = self._get_vnames(node)

         fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins],
                           [o() for o in scal_v_outs])

-        scal_in = [i.name + '[i]' if i.dtype != 'float16' else
-                   '__half2float(' + i.name + '[i])' for i in inps]
+        scal_in = [i if si.dtype != 'float16' else 'load_half(&' + i + ')'
+                   for i, si in zip(inps, scal_v_ins)]

         scal_out = []
         oi = 0
```
```diff
@@ -133,13 +112,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
             else:
                 arg = outs[oi]
                 oi += 1
-            if arg.dtype == 'float16':
+            if node.outputs[n].dtype == 'float16':
                 scal_f16.append(('tmpf16%i' % (len(scal_f16),), arg))
                 scal_out.append(scal_f16[-1][0])
             else:
-                scal_out.append(arg.name + '[i]')
+                scal_out.append(arg)

-        kop = self.scalar_op.c_code(fake_node, nodename + '_scalar',
+        kop = self.scalar_op.c_code(fake_node, 'elem_scalar',
                                     scal_in, scal_out,
                                     dict(fail='return;'))
```
```diff
@@ -154,7 +133,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
             # variables inthe middle are float32
             code.append(kop.replace('npy_float16', 'ga_float'))
             for f in scal_f16:
-                code.append('%s[i] = __float2half_rn(%s);' % (f[1].name, f[0]))
+                code.append('store_half(&%s, %s);' % (f[1], f[0]))
             code.append('}')
             kop = '\n'.join(code)
```
```diff
@@ -178,76 +157,74 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                          ("npy_float64", "ga_double"),
                          ]:
             kop = kop.replace(npy, ga)
-        return ElemwiseKernel(self.get_params(node), inps + outs, kop,
-                              preamble=support_code)
+        return support_code, kop

     def c_headers(self):
-        return ['<numpy_compat.h>', '<gpuarray/types.h>']
-
-    def c_support_code(self):
-        return self.scalar_op.c_support_code()
-
-    def _gpu_kernel_code(self, node, nodename):
-        # This is useless by itself, but will serve an eventual c_code
-        # implementation
-        k = self.generate_kernel(node, nodename)
-        nd = node.inputs[0].type.ndim
-        res = []
-        for i in range(0, nd + 1):
-            res.append(k.render_basic(i, name="elem_" + str(i)) + ';')
-        res.append(k.contig_src + ';')
-        return '\n'.join(res)
-
-    def gpu_kernels(self, node, nodename):
-        src = self._gpu_kernel_code(node, nodename)
-        nd = node.outputs[0].ndim
-        params = ['uintp']
-        params.extend('uintp' for _ in range(nd))
-        num_inputs = len(node.inputs)
-        num_outputs = len(node.outputs)
-        for n in range(num_inputs + num_outputs):
-            if (n - len(node.inputs)) in self.inplace_pattern:
-                continue
-            params.extend([gpuarray.GpuArray, 'uintp'])
-            params.extend('intp' for _ in range(nd))
-        acc_dtype = getattr(self, 'acc_dtype', None)
-        if acc_dtype is None:
-            acc_dtype = node.outputs[0].type.dtype
-        return [Kernel(code=src, name="elem_%d" % nd, params=params,
-                       flags=Kernel.get_flags(node.inputs[0].type.dtype,
-                                              acc_dtype,
-                                              node.outputs[0].type.dtype),
-                       objvar='elem_%d_%s' % (nd, nodename))]
+        return ['<numpy_compat.h>', '<gpuarray/types.h>',
+                '<gpuarray/elemwise.h>']
+
+    def c_support_code_struct(self, node, name):
+        return "\nGpuElemwise *ge;\n"
+
+    def c_init_code_struct(self, node, name, sub):
+        inps, outs = self._get_vnames(node)
+        nargs = len(inps) + len(outs)
+        support_code, kop = self._generate_op_string(node)
+        res = """
+        gpuelemwise_arg args[%(nargs)s] = {{0}};
+        """ % dict(nargs=nargs)
+        for n, (i, name) in enumerate(zip(node.inputs, inps)):
+            res += """
+        args[%(n)s].name = %(name)s;
+        args[%(n)s].typecode = %(typecode)s;
+        args[%(n)s].flags = GE_READ;
+        """ % dict(n=n, name='"%s"' % (name,), typecode=i.type.typecode)
+        p = 0
+        for n, o in enumerate(node.outputs):
+            if n in self.inplace_pattern:
+                assert(len(node.outputs) == 1)
+                res += "\nargs[%(n)s].flags |= GE_WRITE;\n" % dict(
+                    n=self.inplace_pattern[n])
+            else:
+                nn = len(inps) + p
+                name = outs[p]
+                p += 1
+                res += """
+        args[%(n)s].name = %(name)s;
+        args[%(n)s].typecode = %(typecode)s;
+        args[%(n)s].flags = GE_WRITE;
+        """ % dict(n=nn, name='"%s"' % (name,), typecode=o.type.typecode)
+        res += """
+        ge = GpuElemwise_new(%(ctx)s->ops, %(ctx)s->ctx, %(support)s, %(kop)s,
+                             %(nargs)s, args, %(nd)s, 0);
+        if (ge == NULL) {
+          PyErr_SetString(PyExc_RuntimeError, "Could not initialize elemwise support");
+          %(fail)s
+        }
+        """ % dict(nargs=nargs, ctx=sub['params'], fail=sub['fail'],
+                   support=as_C_string_const(support_code),
+                   kop=as_C_string_const(kop),
+                   nd=node.inputs[0].ndim)
+        return res

     def c_code(self, node, name, inputs, outputs, sub):
         if node.inputs[0].type.context.kind != 'cuda':
             raise MethodNotDefined('cuda only')
         nd = node.outputs[0].ndim
         fail = sub["fail"]
         initial_dims = ','.join('1' for i in xrange(nd))
         opname = str(self.scalar_op)
+        ctx = sub['params']
+        nargs = (len(node.inputs) + len(node.outputs) -
+                 len(self.inplace_pattern))

         # check that all inputs have valid dimensions
         emitted_inames = {}
-        num_kernel_params = 1 + nd + len(inputs + outputs) * (2 + nd)
         code = """
-        size_t n_blocks = 0;
-        size_t threads_per_block = 0;
-        size_t numEls = 0;
-        const ssize_t zero = 0;
-        void *kernel_params[%(num_kernel_params)d] = {0};
         int err;
+        // +1 is so that MSVC is happy when nd == 0
+        size_t dims[%(nd)s+1] = {%(initial_dims)s};
+        void *rargs[%(nargs)s] = {0};
         """ % locals()
-        if nd > 0:
-            code += """
-        size_t dims[%(nd)s] = {%(initial_dims)s};
-        """ % locals()
-        else:
-            code += """
-        size_t *dims = NULL;
-        """
         for idx, iname in enumerate(inputs):
             if iname in emitted_inames:
                 assert emitted_inames[iname] is node.inputs[idx]
```
```diff
@@ -256,19 +233,15 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
             broadcasts = map(int, node.inputs[idx].broadcastable)
             broadcasts = ', '.join(map(str, broadcasts))
             nd = node.inputs[idx].ndim
-            if nd > 0:
-                code += """
-        int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
-        """ % locals()
-            else:
-                code += """
-        int *broadcasts_%(iname)s = NULL;
-        """ % locals()
+            code += """
+        int broadcasts_%(iname)s[%(nd)s+1] = {%(broadcasts)s};
+        """ % locals()
             emitted_inames[iname] = node.inputs[idx]

         # check that all inputs have valid dimensions
         emitted_inames = {}
         for idx, iname in enumerate(inputs):
+            code += "rargs[%(idx)s] = &%(iname)s->ga;\n" % dict(idx=idx,
+                                                                iname=iname)
             if iname in emitted_inames:
                 continue
             code += """
```
@@ -300,6 +273,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
"""
%
locals
()
emitted_inames
[
iname
]
=
True
# check that all outputs have valid dimensions
p
=
len
(
node
.
inputs
)
for
idx
,
oname
in
enumerate
(
outputs
):
typecode
=
dtype_to_typecode
(
node
.
outputs
[
idx
]
.
dtype
)
if
idx
not
in
self
.
inplace_pattern
.
keys
():
...
...
```diff
@@ -325,7 +299,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
                %(fail)s
            }
        }
+       rargs[%(p)s] = &%(oname)s->ga;
        """ % locals()
+               p += 1
            else:
                input_idx = self.inplace_pattern[idx]
                iname = inputs[input_idx]
```
```diff
@@ -351,92 +327,35 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
            }
        }
        """ % locals()

-        z = outputs[0]
-        code += """numEls = PyGpuArray_SIZE(%(z)s);
-        //first use at least a full warp
-        threads_per_block = std::min(numEls, (size_t)32); //WARP SIZE
-        //next start adding multiprocessors
-        // UP TO NUMBER OF MULTIPROCESSORS, use 30 for now.
-        n_blocks = std::min(numEls/threads_per_block +
-                            (numEls %% threads_per_block?1:0),
-                            (size_t)30);
-        // next start adding more warps per multiprocessor
-        if (threads_per_block * n_blocks < numEls)
-            threads_per_block = std::min(numEls/n_blocks, (size_t) 256);
-        """ % locals()
-
-        kname = 'elem_%d_%s' % (nd, name)
-        param = ["(void *)&numEls"]
-        for i in range(nd):
-            param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" %
-                         dict(z=outputs[0], i=i))
-        for n, (name, var) in enumerate(zip(inputs + outputs,
-                                            node.inputs + node.outputs)):
-            if (n - len(inputs)) in self.inplace_pattern:
-                continue
-            dtype = dtype_to_ctype(var.dtype)
-            param.append("(void *)%(name)s->ga.data" % locals())
-            param.append("(void *)&%(name)s->ga.offset" % locals())
-            for i in range(nd):
-                param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? "
-                             "(void *)&zero: "
-                             "(void *)&PyGpuArray_STRIDES(%(name)s)[%(i)d]" %
-                             locals())
-        for n, p in enumerate(param):
-            code += "kernel_params[%(n)d] = %(p)s;\n" % locals()

         code += """
-        err = GpuKernel_call(&%(kname)s, 1, &threads_per_block, &n_blocks, 0, kernel_params);
-        if (err != GA_NO_ERROR) {
-            PyErr_Format(PyExc_RuntimeError,
-                         "gpuarray error: %(kname)s: %%s.",
-                         GpuKernel_error(&%(kname)s, err));
-            %(fail)s;
+        if (GpuElemwise_call(ge, rargs, GE_BROADCAST) != GA_NO_ERROR) {
+          PyErr_SetString(PyExc_RuntimeError, "Error in the elemwise call");
+          %(fail)s
         }
-        """ % dict(kname=kname, fail=fail)
+        """ % dict(fail=sub['fail'])

         if config.gpuarray.sync:
+            z = outputs[0]
             code += """
             err = GpuArray_sync(&%(z)s->ga);
             if (err != GA_NO_ERROR) {
                 PyErr_Format(PyExc_RuntimeError,
-                             "gpuarray error: %(kname)s: %%s.",
-                             GpuKernel_error(&%(kname)s, err));
+                             "gpuarray error: %%s.",
+                             GpuArray_error(&%(z)s->ga, err));
                 %(fail)s;
             }
             """ % locals()

         return str(code)

-    def perform(self, node, inputs, output_storage, ctx):
-        # Try to reuse the kernel from a previous call to hopefully
-        # avoid recompiling
-        if not hasattr(node, '_cache_elemwise_k'):
-            node._cache_elemwise_k = self.generate_kernel(node, "kcode")
-
-        out_shape = []
-        for values in izip(*[input.shape for input in inputs]):
-            if any(v == 0 for v in values):
-                # All non-broadcasted dimensions should be zero
-                assert max(values) <= 1
-                out_shape.append(0)
-            else:
-                out_shape.append(max(values))
-        out_shape = tuple(out_shape)
-
-        args = copy.copy(inputs)
-        for n, (stor, out) in enumerate(izip(output_storage, node.outputs)):
-            if n in self.inplace_pattern:
-                stor[0] = inputs[self.inplace_pattern[n]]
-            else:
-                args.append(ensure_allocated(stor, out_shape,
-                                             out.type.dtype, ctx))
-
-        node._cache_elemwise_k(*args, broadcast=True)
-        if config.gpuarray.sync:
-            output_storage[0][0].sync()
+    # To disable the superclass perform.
+    perform = Op.perform

     def c_code_cache_version(self):
         ver = self.scalar_op.c_code_cache_version()
         if ver:
-            return (4, ver)
+            return (6, ver)
         else:
             return ver
```
```diff
@@ -585,7 +504,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
     This op was recently upgraded from just GpuSum a general CAReduce. Not
     many code cases are supported for scalar_op being anything other than
-    scal.Add instances yet.
+    scalar.Add instances yet.

     Important note: if you implement new cases for this op, be sure to
     benchmark them and make sure that they actually result in a speedup.
```
```diff
@@ -735,7 +654,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
         # It might be nice to use a property of the op class to do this,
         # but tensor.elemwise.CAReduce has this exact same check so I guess
         # this is OK to do
-        if self.scalar_op in [scal.minimum, scal.maximum]:
+        if self.scalar_op in [scalar.minimum, scalar.maximum]:
             conds = ["(PyGpuArray_DIMS(%s)[%d] == 0)" % (x, i)
                      for i in xrange(nd_in)
                      if self.reduce_mask[i]]
```
```diff
@@ -1060,13 +979,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
         if hasattr(self.scalar_op, 'identity'):
             return str(self.scalar_op.identity)
         else:
-            assert isinstance(self.scalar_op, (scal.Maximum, scal.Minimum))
+            assert isinstance(self.scalar_op, (scalar.Maximum, scalar.Minimum))
             if self.pre_scalar_op:
                 # TODO: multiple dtypes
                 # dtype = node.inputs[0].dtype
                 dtype = 'float32'
-                dummy_var = scal.Scalar(dtype=dtype)()
+                dummy_var = scalar.Scalar(dtype=dtype)()
                 dummy_node = self.pre_scalar_op.make_node(dummy_var)
```
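The heart of this file's change shows up in the C that `GpuElemwise` now emits: the hand-generated kernel source, launch-geometry computation and `kernel_params` array are gone, and the op instead builds one libgpuarray `GpuElemwise` object at struct-init time and calls it at run time. Below is a hedged before/after sketch, written as Python string constants the same way the op assembles its C. The argument names `i0`/`o0`, the `GA_FLOAT` typecodes, the fixed argument count of 2 and the kernel name `elem_2_name` are made-up placeholders; the calls and flags (`GpuElemwise_new`, `GpuElemwise_call`, `GE_READ`, `GE_WRITE`, `GE_BROADCAST`) are taken from the diff above.

```python
# Hypothetical, condensed rendering of the generated C for a 1-input/1-output
# float32 elemwise node, mirroring c_init_code_struct() and c_code() above.

OLD_RUNTIME_C = """
numEls = PyGpuArray_SIZE(o0);
threads_per_block = std::min(numEls, (size_t)32);   /* first a full warp */
n_blocks = std::min(numEls / threads_per_block +
                    (numEls % threads_per_block ? 1 : 0), (size_t)30);
/* kernel_params[] was filled by hand with dims, data pointers and strides */
err = GpuKernel_call(&elem_2_name, 1, &threads_per_block, &n_blocks, 0,
                     kernel_params);
"""

NEW_INIT_C = """
gpuelemwise_arg args[2] = {{0}};
args[0].name = "i0"; args[0].typecode = GA_FLOAT; args[0].flags = GE_READ;
args[1].name = "o0"; args[1].typecode = GA_FLOAT; args[1].flags = GE_WRITE;
ge = GpuElemwise_new(ctx->ops, ctx->ctx, support_code, kop, 2, args, nd, 0);
"""

NEW_RUNTIME_C = """
rargs[0] = &i0->ga;
rargs[1] = &o0->ga;
if (GpuElemwise_call(ge, rargs, GE_BROADCAST) != GA_NO_ERROR) {
  PyErr_SetString(PyExc_RuntimeError, "Error in the elemwise call");
}
"""
```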
theano/sandbox/gpuarray/subtensor.py

```diff
 from __future__ import absolute_import, print_function, division

 import os
-import copy

 import numpy
 from six import integer_types
 from six.moves import StringIO

 import theano
 from theano import tensor, gof
 from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
-import theano.tensor.inplace

 try:
     import pygpu
```
```diff
@@ -18,10 +15,9 @@ try:
 except ImportError:
     pass

-from .type import GpuArrayType
+from .type import GpuArrayType, gpu_context_type
 from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
                         infer_context_name)
-from .elemwise import GpuElemwise


 class GpuSubtensor(HideC, Subtensor):
```
```diff
@@ -168,7 +164,7 @@ class GpuSubtensor(HideC, Subtensor):
         return (6,)


-class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
+class GpuIncSubtensor(IncSubtensor):
     """
     Implement IncSubtensor on the gpu.
```
```diff
@@ -181,45 +177,20 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
     :meth:`copy_of_x`, etc. specialize the c_code for this Op.

     """
-    @property
-    def _f16_ok(self):
-        return self.iadd_node.op._f16_ok
-
-    def c_headers(self):
-        return self.iadd_node.op.c_headers()
-
-    def c_init_code(self):
-        return self.iadd_node.op.c_init_code()
-
-    def gpu_kernels(self, node, nodename):
-        subname = nodename + "_add_to_zview"
-        return self.iadd_node.op.gpu_kernels(self.iadd_node, subname)
+    _f16_ok = True
+    params_type = gpu_context_type

     def make_node(self, x, y, *inputs):
         ctx_name = infer_context_name(x, y)
         x = as_gpuarray_variable(x, ctx_name)
         y = as_gpuarray_variable(y, ctx_name)
         rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
-        op = copy.copy(self)
-        ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()])
-        op.create_iadd_node(ret)
+        ret = gof.Apply(self, [x, y] + rval.inputs[2:], [x.type()])
         return ret

+    def get_params(self, node):
+        return node.outputs[0].type.context
+
-    def create_iadd_node(self, node):
-        # We store a iadd_node in the op that contain the info needed
-        # for the inplace add.
-        cop = theano.tensor.inplace.add_inplace
-        gop = GpuElemwise(cop.scalar_op, copy.copy(cop.inplace_pattern),
-                          "Gpu" + cop.name, cop.nfunc_spec)
-        y = node.inputs[1]
-        xview = y.type()
-        iadd_node = gop(xview, y).owner
-        self.iadd_node = iadd_node
-
     def perform(self, node, inputs, out_, ctx):
         out, = out_
         x, y = inputs[:2]
```
```diff
@@ -261,18 +232,6 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
             x.__setitem__(cdata, y)
         out[0] = x

-    def __setstate__(self, d):
-        self.__dict__.update(d)
-        owner = getattr(self, "owner", None)
-        if owner:
-            self.create_iadd_node(owner)
-
-    def __getstate__(self):
-        d = copy.copy(self.__dict__)
-        if "iadd_node" in d:
-            d.pop('iadd_node')
-        return d
-
     def do_type_checking(self, node):
         """
         Should raise NotImplementedError if c_code does not support
```
```diff
@@ -365,47 +324,52 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
         """
         return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()

+    def c_headers(self):
+        return ['<numpy_compat.h>', '<gpuarray/error.h>', '<gpuarray/array.h>',
+                '<gpuarray/elemwise.h>']
+
     def c_support_code_struct(self, node, nodename):
-        gop = self.iadd_node.op
-        sub_name = nodename + "_add_to_zview"
-        ret = gop.c_support_code_struct(self.iadd_node, sub_name)
-        ret += """
-        PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
-                                                    PyGpuArrayObject* src){
-            PyGpuArrayObject* ret = NULL;
-        """ % locals()
-        inputs = ["dst", "src"]
-        outputs = ["ret"]
-        sub = {"fail": "return NULL;", "params": "dst->context"}
-        ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
-        ret += """
-        return ret;
-        }
-        """
-        return ret
+        return "\nGpuElemwise *iadd;\n"
+
+    def c_init_code_struct(self, node, name, sub):
+        return """
+        gpuelemwise_arg args[2] = {{0}};
+        args[0].name = "a";
+        args[0].typecode = %(type1)s;
+        args[0].flags = GE_READ|GE_WRITE;
+        args[1].name = "b";
+        args[1].typecode = %(type2)s;
+        args[1].flags = GE_READ;
+        iadd = GpuElemwise_new(%(ctx)s->ops, %(ctx)s->ctx, "", "a += b",
+                               2, args, %(nd)s, 0);
+        if (iadd == NULL) {
+          PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
+          %(fail)s
+        }
+        """ % dict(ctx=sub['params'], fail=sub['fail'],
+                   type1=node.inputs[0].type.typecode,
+                   type2=node.inputs[1].type.typecode,
+                   nd=node.inputs[1].ndim)

     def add_to_zview(self, nodename, x, fail):
         return """
-        PyGpuArrayObject * add_result = inc_sub_iadd_%(nodename)s(zview, %(x)s);
-
-        if (! add_result )
         {
+        void *args[2];
+        args[0] = &zview->ga;
+        args[1] = &%(x)s->ga;
+        if (GpuElemwise_call(iadd, args, GE_BROADCAST) != GA_NO_ERROR) {
+          PyErr_SetString(PyExc_RuntimeError, "Error doing inplace add");
           Py_DECREF(zview);
           %(fail)s;
+        }
-        else
-        {
-          Py_DECREF(add_result);
-          %(fail)s
-        }
         }
         """ % locals()

     def c_code_cache_version(self):
         parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
-        elemwise_version = self.iadd_node.c_code_cache_version()
-        if not parent_version or not elemwise_version:
+        if not parent_version:
             return
-        return parent_version + elemwise_version + (3,)
+        return parent_version + (5,)


 class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
```
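The same pattern replaces `GpuIncSubtensor`'s Python-level `iadd_node`: one libgpuarray elemwise with the expression `"a += b"` is created once in the op's C struct and reused for every inplace add. A condensed, hypothetical rendering of the emitted C is sketched below; the `GA_FLOAT` typecodes and the variable name `x` stand in for the real `%(type1)s`/`%(type2)s`/`%(x)s` substitutions in the code above.

```python
# Condensed from the strings built by c_init_code_struct() and add_to_zview()
# in the new GpuIncSubtensor code above.

IADD_INIT_C = """
gpuelemwise_arg args[2] = {{0}};
args[0].name = "a"; args[0].typecode = GA_FLOAT; args[0].flags = GE_READ|GE_WRITE;
args[1].name = "b"; args[1].typecode = GA_FLOAT; args[1].flags = GE_READ;
iadd = GpuElemwise_new(ctx->ops, ctx->ctx, "", "a += b", 2, args, nd, 0);
"""

IADD_CALL_C = """
void *args[2];
args[0] = &zview->ga;
args[1] = &x->ga;
if (GpuElemwise_call(iadd, args, GE_BROADCAST) != GA_NO_ERROR) {
  PyErr_SetString(PyExc_RuntimeError, "Error doing inplace add");
  Py_DECREF(zview);
}
"""
```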
theano/sandbox/gpuarray/tests/test_elemwise.py

```diff
@@ -18,40 +18,18 @@ from pygpu import ndgpuarray as gpuarray

 # This is acutally a test for GpuElemwise
 class test_gpu_Broadcast(test_elemwise.test_Broadcast):
     op = GpuElemwise
     type = GpuArrayType
+    cop = GpuElemwise
+    ctype = GpuArrayType
     # The order is important
     linkers = [gof.PerformLinker, gof.CLinker]

-    def setUp(self):
-        if get_context(test_ctx_name).kind != 'cuda':
-            self.linkers = [gof.PerformLinker]
-
     def rand_val(self, shp):
         return rand_gpuarray(*shp, **dict(cls=gpuarray))

     def rand_cval(self, shp):
         return rand_gpuarray(*shp, **dict(cls=gpuarray))

-    def test_c(self):
-        if get_context(test_ctx_name).kind != 'cuda':
-            raise SkipTest("Cuda specific tests")
-        super(test_gpu_Broadcast, self).test_c()
-
-    def test_c_inplace(self):
-        if get_context(test_ctx_name).kind != 'cuda':
-            raise SkipTest("Cuda specific tests")
-        super(test_gpu_Broadcast, self).test_c_inplace()
-

 def test_elemwise_pow():
     # Test that GpuElemwise(pow) can compile with any combination of integer
     # or float input dtype.
-    if get_context(test_ctx_name).kind != 'cuda':
-        raise SkipTest("Cuda specific tests")
     dtypes = ["uint8", "uint16", "uint32", "uint64",
               "int8", "int16", "int32", "int64",
               "float16", "float32", "float64"]
```

```diff
@@ -65,10 +43,10 @@ def test_elemwise_pow():
             output = base ** exp
             f = theano.function([base, exp], output)

-            # Call the function to make sure the output is valid
             base_val = numpy.random.randint(0, 5, size=10).astype(dtype_base)
             exp_val = numpy.random.randint(0, 3, size=10).astype(dtype_exp)

+            # Call the function to make sure the output is valid
+            out = f(base_val, exp_val)
+            expected_out = base_val ** exp_val
+            assert_allclose(out, expected_out)
```
theano/tensor/tests/test_elemwise.py

```diff
@@ -166,10 +166,12 @@ class test_Broadcast(unittest.TestCase):
     linkers = [gof.PerformLinker, gof.CLinker]

     def rand_val(self, shp):
-        return numpy.asarray(numpy.random.rand(*shp))
+        return numpy.asarray(numpy.random.rand(*shp),
+                             dtype=theano.config.floatX)

     def rand_cval(self, shp):
-        return numpy.asarray(numpy.random.rand(*shp))
+        return numpy.asarray(numpy.random.rand(*shp),
+                             dtype=theano.config.floatX)

     def setUp(self):
         unittest_tools.seed_rng()
```

```diff
@@ -189,8 +191,10 @@ class test_Broadcast(unittest.TestCase):
                          ((2, 3, 4, 5), (1, 3, 1, 5)),
                          ((2, 3, 4, 5), (1, 1, 1, 1)),
                          ((), ())]:
-                x = type('float64', [(entry == 1) for entry in xsh])('x')
-                y = type('float64', [(entry == 1) for entry in ysh])('y')
+                x = type(theano.config.floatX,
+                         [(entry == 1) for entry in xsh])('x')
+                y = type(theano.config.floatX,
+                         [(entry == 1) for entry in ysh])('y')
                 e = op(scalar.add)(x, y)
                 f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function()
                 xv = rand_val(xsh)
```

```diff
@@ -202,8 +206,10 @@ class test_Broadcast(unittest.TestCase):
             # test Elemwise.infer_shape
             # the Shape op don't implement c_code!
             if isinstance(linker, gof.PerformLinker):
-                x = type('float64', [(entry == 1) for entry in xsh])('x')
-                y = type('float64', [(entry == 1) for entry in ysh])('y')
+                x = type(theano.config.floatX,
+                         [(entry == 1) for entry in xsh])('x')
+                y = type(theano.config.floatX,
+                         [(entry == 1) for entry in ysh])('y')
                 e = op(scalar.add)(x, y)
                 f = copy(linker).accept(FunctionGraph(
                     [x, y], [e.shape])).make_function()
```

```diff
@@ -218,8 +224,10 @@ class test_Broadcast(unittest.TestCase):
                          ((2, 3, 4, 5), (1, 3, 1, 5)),
                          ((2, 3, 4, 5), (1, 1, 1, 1)),
                          ((), ())]:
-                x = type('float64', [(entry == 1) for entry in xsh])('x')
-                y = type('float64', [(entry == 1) for entry in ysh])('y')
+                x = type(theano.config.floatX,
+                         [(entry == 1) for entry in xsh])('x')
+                y = type(theano.config.floatX,
+                         [(entry == 1) for entry in ysh])('y')
                 e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
                 f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function()
                 xv = rand_val(xsh)
```

```diff
@@ -232,8 +240,10 @@ class test_Broadcast(unittest.TestCase):
             # test Elemwise.infer_shape
             # the Shape op don't implement c_code!
             if isinstance(linker, gof.PerformLinker):
-                x = type('float64', [(entry == 1) for entry in xsh])('x')
-                y = type('float64', [(entry == 1) for entry in ysh])('y')
+                x = type(theano.config.floatX,
+                         [(entry == 1) for entry in xsh])('x')
+                y = type(theano.config.floatX,
+                         [(entry == 1) for entry in ysh])('y')
                 e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
                 f = copy(linker).accept(FunctionGraph(
                     [x, y], [e.shape])).make_function()
```

```diff
@@ -267,13 +277,15 @@ class test_Broadcast(unittest.TestCase):
     def test_fill(self):
         if not theano.config.cxx:
             raise SkipTest("G++ not available, so we need to skip this test.")
-        x = self.ctype('float64', [0, 0])('x')
-        y = self.ctype('float64', [1, 1])('y')
-        for linker, op in zip(self.linkers, [self.op, self.cop]):
+        for linker, op, t, rval in zip(self.linkers, [self.op, self.cop],
+                                       [self.type, self.ctype],
+                                       [self.rand_val, self.rand_cval]):
+            x = t(theano.config.floatX, [0, 0])('x')
+            y = t(theano.config.floatX, [1, 1])('y')
             e = op(scalar.Second(scalar.transfer_type(0)), {0: 0})(x, y)
             f = linker().accept(FunctionGraph([x, y], [e])).make_function()
-            xv = self.rand_cval((5, 5))
-            yv = self.rand_cval((1, 1))
+            xv = rval((5, 5))
+            yv = rval((1, 1))
             f(xv, yv)
             assert (xv == yv).all()
```

```diff
@@ -292,24 +304,28 @@ class test_Broadcast(unittest.TestCase):
     def test_weird_strides(self):
         if not theano.config.cxx:
             raise SkipTest("G++ not available, so we need to skip this test.")
-        x = self.ctype('float64', [0, 0, 0, 0, 0])('x')
-        y = self.ctype('float64', [0, 0, 0, 0, 0])('y')
-        for linker, op in zip(self.linkers, [self.op, self.cop]):
+        for linker, op, t, rval in zip(self.linkers, [self.op, self.cop],
+                                       [self.type, self.ctype],
+                                       [self.rand_val, self.rand_cval]):
+            x = t(theano.config.floatX, [0, 0, 0, 0, 0])('x')
+            y = t(theano.config.floatX, [0, 0, 0, 0, 0])('y')
             e = op(scalar.add)(x, y)
             f = linker().accept(FunctionGraph([x, y], [e])).make_function()
-            xv = self.rand_cval((2, 2, 2, 2, 2))
-            yv = self.rand_cval((2, 2, 2, 2, 2)).transpose(4, 0, 3, 1, 2)
+            xv = rval((2, 2, 2, 2, 2))
+            yv = rval((2, 2, 2, 2, 2)).transpose(4, 0, 3, 1, 2)
             zv = xv + yv
             assert (f(xv, yv) == zv).all()

     def test_same_inputs(self):
         if not theano.config.cxx:
             raise SkipTest("G++ not available, so we need to skip this test.")
-        x = self.ctype('float64', [0, 0])('x')
-        for linker, op in zip(self.linkers, [self.op, self.cop]):
+        for linker, op, t, rval in zip(self.linkers, [self.op, self.cop],
+                                       [self.type, self.ctype],
+                                       [self.rand_val, self.rand_cval]):
+            x = t(theano.config.floatX, [0, 0])('x')
             e = op(scalar.add)(x, x)
             f = linker().accept(FunctionGraph([x], [e])).make_function()
-            xv = self.rand_cval((2, 2))
+            xv = rval((2, 2))
             zv = xv + xv
             assert (f(xv) == zv).all()
```