Merge pull request #1582 from abergeron/compyte2-rb

Compyte2 rebase

Merge pull request #1582 from abergeron/compyte2-rb
8bd900f8 · Frédéric Bastien · 382d2ed1 · 217b616b · 8bd900f8 · 8bd900f8
--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -932,7 +932,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
    adv_incsub1 = cuda.GpuAdvancedIncSubtensor1
    mode = mode_with_gpu
    dtype = 'float32'
-    ignore_topo = (B.HostFromGpu, B.GpuFromHost)
+    ignore_topo = (B.HostFromGpu, B.GpuFromHost, theano.compile.DeepCopyOp)
    fast_compile = False
    ops = (cuda.GpuSubtensor, cuda.GpuIncSubtensor,
           cuda.GpuAdvancedSubtensor1, cuda.GpuAdvancedIncSubtensor1)

--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
 import logging

 import theano
-from theano.configparser import config
+from theano.configparser import config, AddConfigVar, BoolParam
 from theano.compile import optdb

 _logger_name = 'theano.sandbox.gpuarray'
@@ -18,6 +18,13 @@ try:
 except ImportError:
    pygpu = None

+AddConfigVar('gpuarray.sync',
+             """If True, every op will make sure its work is done before
+                returning.  Setting this to True will slow down execution,
+                but give much more accurate results in profiling.""",
+             BoolParam(False),
+             in_c_key=True)
+
 # This is for documentation not to depend on the availability of pygpu
 from type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                  GpuArraySharedVariable, gpuarray_shared_constructor)

--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -6,8 +6,10 @@ import theano
 from theano import Op, Type, Apply, Variable, Constant
 from theano import tensor, scalar, config
 from theano.scalar import Scalar
+from theano.tensor.basic import Alloc

 from theano.gof.python25 import all, any
+from theano.gof.utils import MethodNotDefined

 try:
    import pygpu
@@ -17,6 +19,7 @@ except ImportError:

 from type import GpuArrayType

+
 def as_gpuarray_variable(x):
    if hasattr(x, '_as_GpuArrayVariable'):
        return x._as_GpuArrayVariable()
@@ -29,6 +32,32 @@ def as_gpuarray(x):
    return gpuarray.array(x, copy=False)


+class HideC(object):
+    """Mixin whose C-related hooks all raise MethodNotDefined.
+
+    Every ``c_*`` attribute assigned below is bound to the same function,
+    which raises ``MethodNotDefined`` whatever its arguments.  When this
+    class is listed before another base class, these attributes take
+    precedence over that class's attributes of the same name.
+    """
+
+    def __hide(*args):
+        raise MethodNotDefined()
+
+    # Code generation hooks.
+    c_code = c_code_cleanup = __hide
+
+    # Header and library lookup hooks.
+    c_headers = c_header_dirs = c_libraries = c_lib_dirs = __hide
+
+    # Support code hooks.
+    c_support_code = c_support_code_apply = __hide
+
+    # Compiler flag and module initialisation hooks.
+    c_compile_args = c_no_compile_args = c_init_code = __hide
+
+    def c_code_cache_version(self):
+        """Return an empty tuple as the cache version."""
+        # NOTE(review): presumably an empty version means "do not cache"
+        # -- confirm against the Op documentation.
+        return ()
+
+    def c_code_cache_version_apply(self, node):
+        """Return the op-level cache version; `node` is not consulted."""
+        return self.c_code_cache_version()
+
+
 class HostFromGpu(Op):
    def __eq__(self, other):
        return type(self) == type(other)
@@ -66,7 +95,7 @@ class HostFromGpu(Op):
        } else {
            %(name)s_ga = &%(inp)s->ga;
        }
-        %(name)s_dtype = typecode_to_dtype(%(inp)s->ga.typecode);
+        %(name)s_dtype = typecode_to_dtype(%(name)s_ga->typecode);
        Py_XDECREF(%(out)s);
        // PyArray_Empty below steals a reference to the dtype we pass it
        // so we need an extra one to spare.
@@ -89,7 +118,7 @@ class HostFromGpu(Op):
            %(fail)s
        }
        """ % {'name': name, 'fail': sub['fail'], 'inp': inputs[0],
-                'out': outputs[0]}
+               'out': outputs[0]}

    def c_code_cache_version(self):
        return (1,)
@@ -97,7 +126,7 @@ class HostFromGpu(Op):
    def grad(self, inputs, grads):
        gz, = grads
        return [gpu_from_host(gz)]
-                                                   
+
    def R_op(self, inputs, eval_points):
        ev, = eval_points
        if isinstance(ev, tensor.TensorType):
@@ -150,48 +179,22 @@ class GpuFromHost(Op):

    def c_code(self, node, name, inputs, outputs, sub):
        return """
-        PyArrayObject *%(name)s_tmp;
-        int %(name)serr;
-        %(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
-        if (%(name)s_tmp == NULL) {
-            // PyArray_GETCONTIGUOUS sets an error message if it fails
-            %(fail)s
-        }
        Py_XDECREF(%(out)s);
-        %(out)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
-        if (%(out)s == NULL) {
-            Py_DECREF(%(name)s_tmp);
-            // new_GpuArray calls __new__ which will set an error message
-            // if it returns NULL.
-            %(fail)s
-        }
-        %(name)serr = GpuArray_empty(&%(out)s->ga,
-                                     GpuArray_default_context()->ops,
-                                     GpuArray_default_context()->ctx,
-                                     get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
+        %(out)s = pygpu_fromhostdata(PyArray_DATA(%(inp)s),
+                                     get_typecode((PyObject *)PyArray_DESCR(%(inp)s)),
                                     PyArray_NDIM(%(inp)s),
                                     (size_t *)PyArray_DIMS(%(inp)s),
-                                     GA_C_ORDER);
-        if (%(name)serr != GA_NO_ERROR) {
-            Py_DECREF(%(name)s_tmp);
-            Py_DECREF(%(out)s);
-            %(out)s = NULL;
-            PyErr_SetString(PyExc_MemoryError, "Can't allocate device memory for result.");
-            %(fail)s
-        }
-        %(name)serr = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
-                                     PyArray_NBYTES(%(name)s_tmp));
-        Py_DECREF(%(name)s_tmp);
-        if (%(name)serr != GA_NO_ERROR) {
-            Py_DECREF(%(out)s);
-            PyErr_SetString(PyExc_RuntimeError, "Could not copy array data to device");
+                                     (ssize_t *)PyArray_STRIDES(%(inp)s),
+                                     pygpu_default_context(),
+                                     Py_None);
+        if (%(out)s == NULL) {
            %(fail)s
        }
        """ % {'name': name, 'inp': inputs[0],
               'out': outputs[0], 'fail': sub['fail']}

    def c_code_cache_version(self):
-        return (1,)
+        return (4,)

 gpu_from_host = GpuFromHost()

@@ -276,7 +279,7 @@ class GpuFromCuda(Op):
        ssize_t *%(name)sstr;

        cuCtxGetCurrent(&%(name)scur);
-        if (%(name)scur != cuda_get_ctx(GpuArray_default_context()->ctx)) {
+        if (%(name)scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
            PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
            %(fail)s
        }
@@ -297,15 +300,7 @@ class GpuFromCuda(Op):
            %(name)sstr[i] = (ssize_t)CudaNdarray_HOST_STRIDES(%(in)s)[i]*4;
        }

-        Py_XDECREF(%(out)s);
-        %(out)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
-        if (%(out)s == NULL) {
-            free(%(name)sdims);
-            free(%(name)sstr);
-            %(fail)s
-        }
-
-        %(name)sdata = cuda_make_buf(GpuArray_default_context()->ctx,
+        %(name)sdata = cuda_make_buf(pygpu_default_context()->ctx,
                                     (CUdeviceptr)%(in)s->devdata,
                                     ((size_t)%(in)s->data_allocated)*4);
        if (%(name)sdata == NULL) {
@@ -315,24 +310,23 @@ class GpuFromCuda(Op):
            PyErr_SetString(PyExc_MemoryError, "Could not allocate gpudata structure.");
            %(fail)s
        }
-        %(name)serr = GpuArray_fromdata(&%(out)s->ga,
-                                        GpuArray_default_context()->ops,
-                                        %(name)sdata, 0, GA_FLOAT, %(in)s->nd,
-                                        %(name)sdims, %(name)sstr, 1);
+        Py_XDECREF(%(out)s);
+        %(out)s = pygpu_fromgpudata(%(name)sdata, 0, GA_FLOAT, %(in)s->nd,
+                                    %(name)sdims, %(name)sstr,
+                                    pygpu_default_context(), 1,
+                                    (PyObject *)%(in)s,
+                                    (PyObject *)&PyGpuArrayType);
+        pygpu_default_context()->ops->buffer_release(%(name)sdata);
        free(%(name)sdims);
        free(%(name)sstr);
-        if (%(name)serr != GA_NO_ERROR) {
-            Py_DECREF(%(out)s);
-            PyErr_SetString(PyExc_MemoryError, "Could not allocate GpuArray structure.");
+        if (%(out)s == NULL) {
            %(fail)s
        }
-        Py_INCREF(%(in)s);
-        %(out)s->base = (PyObject *)%(in)s;
-        """ % {'name':name, 'in': inputs[0], 'out': outputs[0],
+        """ % {'name': name, 'in': inputs[0], 'out': outputs[0],
               'fail': sub['fail']}

    def c_code_cache_version(self):
-        return (1,)
+        return (5,)

 gpu_from_cuda = GpuFromCuda()

@@ -417,11 +411,15 @@ class CudaFromGpu(Op):
        CUcontext %(name)scur;

        cuCtxGetCurrent(&%(name)scur);
-        if (%(name)scur != cuda_get_ctx(GpuArray_default_context()->ctx)) {
+        if (%(name)scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
            PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
            %(fail)s
        }

+        if (GpuArray_sync(&%(inp)s->ga) != GA_NO_ERROR) {
+            PyErr_SetString(PyExc_RuntimeError, "Could not sync GpuArray");
+            %(fail)s
+        }
        Py_XDECREF(%(out)s);
        %(out)s = (CudaNdarray *)CudaNdarray_new_nd(%(inp)s->ga.nd);
        if (!%(out)s) {
@@ -441,61 +439,123 @@ class CudaFromGpu(Op):
               'fail': sub['fail']}

    def c_code_cache_version(self):
-        return (1,)
+        return (3,)


 cuda_from_gpu = CudaFromGpu()


-class GpuAlloc(Op):
+class GpuAlloc(HideC, Alloc):
    def __str__(self):
        return 'GpuAlloc'

-    def __hash__(self):
-        return hash(type(self))
-
-    def __eq__(self, other):
-        return type(self) == type(other)
-
    def make_node(self, value, *shape):
-        v = as_gpuarray_variable(value)
-        sh = [tensor.as_tensor_variable(s) for s in shape]
-        bcast = []
-        if v.ndim > len(shape):
-            raise TypeError(
-                'GpuAlloc value has more dimensions than arguments',
-                value.ndim, len(shape))
-        for i, s in enumerate(sh):
-            if s.type.dtype[:3] not in ('int', 'uint'):
-                raise TypeError('Shape arguments must be integers', s)
-            try:
-                const_shp = tensor.get_scalar_constant_value(s)
-            except tensor.NotScalarConstantError:
-                const_shp = None
-            bcast.append(numpy.all(1 == const_shp))
-        otype = GpuArrayType(dtype=v.dtype, broadcastable=bcast)
-        return Apply(self, [v] + sh, [otype()])
+        res = Alloc.make_node(self, value, *shape)
+        value = as_gpuarray_variable(value)
+        otype = GpuArrayType(dtype=res.outputs[0].dtype,
+                             broadcastable=res.outputs[0].broadcastable)
+        return Apply(self, [value] + res.inputs[1:], [otype()])

    def perform(self, node, inputs, outs):
        out, = outs
        v = inputs[0]
        sh = tuple(map(int, inputs[1:]))
        if out[0] is None or out[0].shape != sh:
-            out[0] = gpuarray.empty(sh, dtype=v.dtype)
-        out[0][...] = v
+            if v.size == 1 and numpy.asarray(v)[0].item() == 0:
+                out[0] = gpuarray.zeros(sh, dtype=v.dtype)
+            else:
+                out[0] = gpuarray.empty(sh, dtype=v.dtype)
+                out[0][...] = v
+        else:
+            out[0][...] = v
+        if config.gpuarray.sync:
+            out[0].sync()
+
+    def c_code(self, node, name, inp, out, sub):
+        vv = inp[0]
+        ndim = len(inp[1:])
+        zz, = out
+
+        code = """
+        int i;
+        size_t %(name)s_shape[%(ndim)s];
+        """ % dict(name=name, ndim=ndim)
+
+        for i, shp_i in enumerate(inp[1:]):
+            code += """
+        %(name)s_shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
+        """ % dict(name=name, i=i, shp_i=shp_i)
+
+        code += """
+        int need_new_out = (NULL == %(zz)s || %(zz)s->ga.nd != %(ndim)s);
+
+        if (!need_new_out)
+            for (i = 0; i < %(ndim)s; i++)
+                need_new_out |= %(zz)s->ga.dimensions[i] != %(name)s_shape[i];
+
+        if (need_new_out) {
+            Py_XDECREF(%(zz)s);
+            %(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape,
+                                 %(vv)s->ga.typecode, GA_C_ORDER,
+                                 pygpu_default_context(), Py_None);
+            if (!%(zz)s) {
+                %(fail)s
+            }
+        }

-    def infer_shape(self, node, input_shapes):
-        return [node.inputs[1:]]
+        if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) != GA_NO_ERROR) {
+            PyErr_SetString(PyExc_ValueError, "setarray failed");
+            %(fail)s
+        }
+        """ % dict(name=name, ndim=ndim, zz=zz, vv=vv, fail=sub['fail'])

-    def grad(self, input, grads):
-        return [None for i in inputs]
+        if config.gpuarray.sync:
+            code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)

-    def do_constant_folding(self, node):
-        if not getattr(node.ouputs[0], 'clients', []):
-            return False
-        for client in node.outputs[0].clients:
-            if client[0] == 'output':
-                return False
-        return True
+        return code
+
+    def c_code_cache_version(self):
+        return (1,)

 gpu_alloc = GpuAlloc()
+
+
+class GpuReshape(HideC, tensor.Reshape):
+    """
+    Implement Reshape on the gpu.
+
+    Only a Python implementation (`perform`) is provided in this class.
+    """
+    # __hash__, __eq__, __str__ come from tensor.Reshape
+    def make_node(self, x, shp):
+        """Build the Apply node reshaping `x` to `shp`.
+
+        The output dtype and broadcastable pattern are taken from the
+        result of reshaping the host version of `x`.
+        """
+        x = as_gpuarray_variable(x)
+        res = host_from_gpu(x).reshape(shp, ndim=self.ndim)
+        otype = GpuArrayType(dtype=res.dtype,
+                             broadcastable=res.broadcastable)
+        return Apply(self, [x, shp], [otype()])
+
+    def perform(self, node, inp, out_):
+        """Reshape `x` to `shp` and store the result in the output cell.
+
+        Raises ValueError when `shp` does not have `self.ndim` entries,
+        when it holds more than one -1, or when the requested shape does
+        not match the number of elements of `x`.
+        """
+        x, shp = inp
+        out, = out_
+        if (len(shp) != self.ndim):
+            raise ValueError('shape argument to GpuReshape.perform'
+                             ' has incorrect length %i'
+                             ', should be %i' % (len(shp), self.ndim), shp)
+
+        if shp.prod() != x.size:
+            # We need to do the check here to raise the same error as
+            # NumPy.  We should make pygpu do the same.
+            ss = 1
+            nb_m1 = 0
+            for i in shp:
+                if i == -1:
+                    nb_m1 += 1
+                else:
+                    ss *= i
+            if nb_m1 > 1:
+                raise ValueError("Only one -1 is accepted in the new shape")
+            elif nb_m1 == 1:
+                # ss is the product of the explicitly given dimensions.
+                if (x.size % ss) != 0:
+                    raise ValueError("When using -1 in new shape, the computed new shape must be an multiple of the original shape.")
+            else:
+                raise ValueError("total size of new array must be unchanged")
+        out[0] = x.reshape(tuple(shp))
--- a/theano/sandbox/gpuarray/blas.py
+++ b/theano/sandbox/gpuarray/blas.py
+from theano import Op, Apply, config
+
+from theano.tensor.blas import Gemv, Gemm
+from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable)
+
+try:
+    import pygpu
+    from pygpu import blas
+except ImportError, e:
+    # To make sure theano is importable
+    pass
+
+
+class BlasOp(HideC):
+    """Common C build settings for the BLAS ops defined in this module."""
+
+    def c_headers(self):
+        """Return the list of headers to include in the generated C."""
+        headers = ['<blas_api.h>']
+        return headers
+
+    def c_header_dirs(self):
+        """Return the include directory reported by pygpu."""
+        include_dir = pygpu.get_include()
+        return [include_dir]
+
+    def c_init_code(self):
+        """Return the statement run at module initialisation."""
+        init_statements = ['import_pygpu__blas();']
+        return init_statements
+
+
+class GpuGemv(BlasOp, Gemv):
+    """Gemv on GPU arrays.
+
+    The Python implementation calls ``blas.gemv``; the generated C calls
+    ``pygpu_blas_rgemv``.
+    """
+
+    def make_node(self, y, alpha, A, x, beta):
+        """Build the Apply node; `y`, `A` and `x` become GPU variables."""
+        # The node built by Gemv is not used; the call is kept so that
+        # whatever checks Gemv.make_node performs on the inputs still run.
+        Gemv.make_node(self, y, alpha, A, x, beta)
+        A = as_gpuarray_variable(A)
+        x = as_gpuarray_variable(x)
+        y = as_gpuarray_variable(y)
+        return Apply(self, [y, alpha, A, x, beta], [y.type()])
+
+    def perform(self, node, inputs, out_storage):
+        """Compute the result with pygpu and store it in the output cell."""
+        y, alpha, A, x, beta = inputs
+        out_storage[0][0] = blas.gemv(alpha, A, x, beta, y, trans=False,
+                                      overwrite_y=self.inplace)
+
+    def c_code(self, node, name, inp, out, sub):
+        """Return the C source for this node.
+
+        The output is either `y` itself (inplace) or a copy of it, which
+        is then handed to ``pygpu_blas_rgemv`` as the destination.
+        """
+        subs = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3],
+                    beta=inp[4], fail=sub['fail'], name=name)
+        if self.inplace:
+            code = """
+                   Py_XDECREF(%(out)s);
+                   %(out)s = %(y)s;
+                   Py_INCREF(%(out)s);
+                   """ % subs
+        else:
+            code = """
+                   Py_XDECREF(%(out)s);
+                   %(out)s = pygpu_copy(%(y)s, GA_ANY_ORDER);
+                   if (%(out)s == NULL) {
+                       %(fail)s
+                   }
+                   """ % subs
+        code += """
+        if (pygpu_blas_rgemv(cb_no_trans,
+                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
+                             %(A)s, %(x)s,
+                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
+                             %(out)s) == NULL) {
+            %(fail)s
+        }
+        """ % subs
+        if config.gpuarray.sync:
+            # This snippet must be interpolated like the ones above;
+            # otherwise the literal text "%(out)s" ends up in the C source.
+            code += """
+            GpuArray_sync(&%(out)s->ga);
+            """ % subs
+        return code
+
+    def c_code_cache_version(self):
+        return (0,)
+
+gpugemv_no_inplace = GpuGemv(inplace=False)
+gpugemv_inplace = GpuGemv(inplace=True)
+
+
+class GpuGemm(BlasOp, Gemm):
+    """Gemm on GPU arrays.
+
+    The Python implementation calls ``blas.gemm``; the generated C calls
+    ``pygpu_blas_rgemm``.
+    """
+
+    def make_node(self, C, alpha, A, B, beta):
+        """Build the Apply node; `C`, `A` and `B` become GPU variables."""
+        # The node built by Gemm is not used; the call is kept so that
+        # whatever checks Gemm.make_node performs on the inputs still run.
+        Gemm.make_node(self, C, alpha, A, B, beta)
+        A = as_gpuarray_variable(A)
+        B = as_gpuarray_variable(B)
+        C = as_gpuarray_variable(C)
+        return Apply(self, [C, alpha, A, B, beta], [C.type()])
+
+    def perform(self, node, inputs, outputs):
+        """Compute the result with pygpu and store it in the output cell."""
+        C, alpha, A, B, beta = inputs
+        outputs[0][0] = blas.gemm(alpha, A, B, beta, C,
+                                  overwrite_c=self.inplace)
+
+    def c_code(self, node, name, inp, out, sub):
+        """Return the C source for this node.
+
+        The output is either `C` itself (inplace) or a copy of it, which
+        is then handed to ``pygpu_blas_rgemm`` as the destination.
+        """
+        subs = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
+                    beta=inp[4], fail=sub['fail'], name=name)
+        if self.inplace:
+            code = """
+                   Py_XDECREF(%(out)s);
+                   %(out)s = %(C)s;
+                   Py_INCREF(%(out)s);
+                   """ % subs
+        else:
+            code = """
+                   Py_XDECREF(%(out)s);
+                   %(out)s = pygpu_copy(%(C)s, GA_ANY_ORDER);
+                   if (%(out)s == NULL) {
+                       %(fail)s
+                   }
+                   """ % subs
+        code += """
+        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
+                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
+                             %(A)s, %(B)s,
+                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
+                             %(out)s) == NULL) {
+            %(fail)s
+        }
+        """ % subs
+        if config.gpuarray.sync:
+            # This snippet must be interpolated like the ones above;
+            # otherwise the literal text "%(out)s" ends up in the C source.
+            code += """
+            GpuArray_sync(&%(out)s->ga);
+            """ % subs
+        return code
+
+    def c_code_cache_version(self):
+        return (0,)
+
+
+gpugemm_no_inplace = GpuGemm(inplace=False)
+gpugemm_inplace = GpuGemm(inplace=True)
+
+from theano.compile import optdb
+from theano.gof import local_optimizer, LocalOptGroup
+from theano.tensor.opt import in2out
+
+
+@local_optimizer([gpugemv_no_inplace])
+def local_inplace_gpuagemv(node):
+    """Replace a non-inplace GpuGemv node by its inplace counterpart.
+
+    Returns None when `node` does not use `gpugemv_no_inplace`.
+    """
+    if not (node.op == gpugemv_no_inplace):
+        return None
+    return [gpugemv_inplace(*node.inputs)]
+
+
+@local_optimizer([gpugemm_no_inplace])
+def local_inplace_gpuagemm(node):
+    """Replace a non-inplace GpuGemm node by its inplace counterpart.
+
+    Returns None when `node` does not use `gpugemm_no_inplace`.
+    """
+    if not (node.op == gpugemm_no_inplace):
+        return None
+    return [gpugemm_inplace(*node.inputs)]
+
+# Group both inplace substitutions into one optimizer and register it in
+# optdb under the 'fast_run', 'inplace' and 'gpuarray' tags.
+gpuablas_opt_inplace = in2out(
+    LocalOptGroup(local_inplace_gpuagemv, local_inplace_gpuagemm),
+    name='gpuablas_opt_inplace')
+optdb.register('InplaceGpuaBlasOpt', gpuablas_opt_inplace,
+               70.0, 'fast_run', 'inplace', 'gpuarray')
--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
+import copy
+from itertools import izip
+
 import numpy
-from theano import Op, Apply, scalar
+from theano import Op, Apply, scalar, config
+from theano.tensor.elemwise import Elemwise, DimShuffle, CAReduceDtype

 try:
+    import pygpu
    from pygpu.tools import ScalarArg, ArrayArg
    from pygpu.elemwise import ElemwiseKernel
+    from pygpu.reduction import ReductionKernel
 except ImportError:
    pass

-from basic_ops import as_gpuarray_variable
-from type import GpuArrayType
+from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
+from theano.sandbox.gpuarray.type import GpuArrayType

 from theano.gof.utils import MethodNotDefined

+
 def _is_scalar(v):
    False

+
 def make_argument(v, name):
    if _is_scalar(v):
        return ScalarArg(numpy.dtype(v.type.dtype), name)
    else:
        return ArrayArg(numpy.dtype(v.type.dtype), name)

-def ensure_out(o, ref):
-    if o is None:
-        return ref._empty_like_me()
-    else:
-        return o
-
-class GpuElemwise(Op):
-    nin = property(lambda self: self.scalar_op.nin)
-    nout = property(lambda self: self.scalar_op.nout)

-    def __init__(self, scalar_op):
-        self.scalar_op = scalar_op
-        self.destroy_map = {}
+def ensure_allocated(storage, shape, dtype):
+    """Return a buffer of the given `shape` held in `storage[0]`.
+
+    The object already in `storage[0]` is kept when its shape matches;
+    otherwise a new one is created with ``pygpu.empty``.  In both cases
+    the returned object is also written back to `storage[0]`.
+    """
+    current = storage[0]
+    # It is unsafe to try to resize an existing buffer, so a shape
+    # mismatch is handled like a missing buffer: allocate a new one.
+    if current is None or current.shape != shape:
+        current = pygpu.empty(shape, dtype=dtype)
+    storage[0] = current
+    return current

-    def __getstate__(self):
-        d = copy.copy(self.__dict__)
-        d.pop('__epydoc_asRoutine', None)
-        d.pop('_hashval')
-        return d

-    def __setstate__(self, d):
-        self.__dict__.update(d)
-        self._rehash()
+def as_C_string_const(s):
+    """Return `s` spelled as a sequence of C string literals.
+
+    Each line of `s` becomes one double-quoted literal ending in an
+    escaped newline; the literals are separated by real newlines.
+    Backslashes and double quotes in `s` are escaped so the literal
+    reproduces the text exactly.
+    """
+    def escape(line):
+        # Backslashes first, so the ones added for quotes are not doubled.
+        return line.replace('\\', '\\\\').replace('"', '\\"')
+
+    return '\n'.join('"%s\\n"' % (escape(l),)
+                     for l in s.split('\n'))

-    def __eq__(self, other):
-        return (type(self) == type(other) and
-                self.scalar_op == other.scalar_op)

-    def __hash__(self):
-        return hash(type(self)) ^ hash(self.scalar_op)
+class GpuElemwise(HideC, Elemwise):
+    nin = property(lambda self: self.scalar_op.nin)
+    nout = property(lambda self: self.scalar_op.nout)

    def __str__(self):
-        return "GpuElemwise{%s}(gpuarray)" % (self.scalar_op,)
+        if self.name is not None:
+            return self.name
+        items = str(sorted(self.inplace_pattern.items()))
+        return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)

    def make_node(self, *inputs):
-        _inputs = [as_gpuarray_variable(i) for i in inputs]
-        if self.nin > 0 and len(_inputs) != self.nin:
-            raise TypeError("Wrong argument count", (self.nin, len(_inputs)))
-        for i in _inputs[1:]:
-            if i.type.ndim != inputs[0].type.ndim:
-                raise TypeError('mismatched rank amongst inputs')
-
-        broadcastable = []
-        for d in xrange(_inputs[0].type.ndim):
-            bcast_d = True
-            for i in _inputs:
-                if not i.type.broadcastable[d]:
-                    bcast_d = False
-                    break
-            broadcastable.append(bcast_d)
-        assert len(broadcastable) == _inputs[0].type.ndim
-
-        assert self.nout > 0
+        res = Elemwise.make_node(self, *inputs)
+        outputs = [GpuArrayType(broadcastable=o.type.broadcastable,
+                                dtype=o.type.dtype)() for o in res.outputs]
+        inputs = [as_gpuarray_variable(i) for i in inputs]
+        res = Apply(self, inputs, outputs)
+        # Try to generate the kernel to catch SupportCodeErrors
+        k = self.generate_kernel(res, 'test')
+        return res
+
+    def generate_kernel(self, node, nodename):
        inps = [make_argument(i, 'i%d' % (n,)) for n, i in
-                enumerate(inputs)]
-        scal_ins = [scalar.Scalar(i.dtype) for i in inputs]
-                          
-        res = Apply(self, _inputs, 
-                    [GpuArrayType(o.dtype, broadcastable)()
-                     for o in self.scalar_op.output_types(scal_ins)])
+                enumerate(node.inputs)]
+        scal_ins = [scalar.Scalar(i.dtype) for i in node.inputs]

        outs = [make_argument(o, 'o%d' % (n,)) for n, o in
-                enumerate(res.outputs)]
-        scal_out = [scalar.Scalar(o.dtype) for o in res.outputs]
+                enumerate(node.outputs) if not n in self.inplace_pattern]
+        scal_out = [scalar.Scalar(o.dtype) for o in node.outputs]

        fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
                          [o() for o in scal_out])

-        kcode = self.scalar_op.c_code(fake_node, 'kcode',
-                                      [i.expr() for i in inps],
-                                      [o.expr() for o in outs],
-                                      sub=dict(fail='return;'))
-        res.tag.kcode = kcode
-
        try:
-            code = self.scalar_op.c_support_code_apply(fake_node, 'kcode')
+            code = self.scalar_op.c_support_code_apply(fake_node, nodename)
            if code:
-                raise SupportCodeError()
+                raise SupportCodeError(code)
        except MethodNotDefined:
            pass

        support_code = ""
        try:
-            support_code += self.scalar_op.c_support_code()
+            support_code = self.scalar_op.c_support_code()
        except MethodNotDefined:
            pass

-        if support_code != "#define THEANO_MACRO_MOD(x,y) (x % y)":
-            # Avoid the C++ complex struct
-            raise SupportCodeError()
+        if (support_code.strip() != "#define THEANO_MACRO_MOD(x,y) (x % y)" and
+            support_code.strip() != ""):
+            # The macro is fine, the C++ struct is not.
+            raise SupportCodeError(support_code)

-        k = ElemwiseKernel(None, inps+outs, kcode, preamble=support_code)
-        res.tag.kernel = k
+        scal_out = []
+        oi = 0
+        for n in range(len(fake_node.outputs)):
+            if n in self.inplace_pattern:
+                scal_out.append(inps[self.inplace_pattern[n]].name+'[i]')
+            else:
+                scal_out.append(outs[oi].name+'[i]')
+                oi += 1

-        return res
+        kop = self.scalar_op.c_code(fake_node, nodename+'_scalar',
+                                    [i.name+'[i]' for i in inps],
+                                    scal_out,
+                                    dict(fail='return;'))
+
+        # Translate types for scalar composite ops (except complex).
+        support_code += """
+#define npy_float64 ga_double
+#define npy_float32 ga_float
+#define npy_uint8 ga_ubyte
+#define npy_int8 ga_byte
+#define npy_uint16 ga_ushort
+#define npy_int16 ga_short
+#define npy_uint32 ga_uint
+#define npy_int32 ga_int
+#define npy_uint64 ga_ulong
+#define npy_int64 ga_long
+"""
+        return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
+
+    def c_support_code_apply(self, node, nodename):
+        # This is useless by itself, but will serve an eventual c_code
+        # implementation
+        k = self.generate_kernel(node, nodename)
+
+        nd = node.inputs[0].type.ndim
+        res = []
+        for i in range(1, nd):
+            var = "static const char %s_%s[] = " % (nodename, str(i))
+            res.append(var + as_C_string_const(k.render_basic(i)) + ';')
+            res.append("static const gpukernel *%s_%s_k = NULL;" % (nodename,
+                                                                    str(i)))
+        var = "static const char %s_c[] = " % (nodename,)
+        res.append(var + as_C_string_const(k.contig_src) + ';')
+        res.append("static const gpukernel *%s_c_k = NULL;" % (nodename,))
+        return '\n'.join(res)
+
+    def perform(self, node, inputs, output_storage):
+        # Try to reuse the kernel from a previous call to hopefully
+        # avoid recompiling
+        if not hasattr(node, '_cache_elemwise_k'):
+            node._cache_elemwise_k = self.generate_kernel(node, "kcode")
+
+        out_shape = []
+        for values in izip(*[input.shape for input in inputs]):
+            if any(v == 0 for v in values):
+                # All non-broadcasted dimensions should be zero
+                assert max(values) <= 1
+                out_shape.append(0)
+            else:
+                out_shape.append(max(values))
+        out_shape = tuple(out_shape)

-    def perform(self, node, inps, out):
-        k = node.tag.kernel
-        outs = [ensure_out(o[0], inps[0]) for o in out]
+        args = copy.copy(inputs)
+        for n, (stor, out) in enumerate(izip(output_storage, node.outputs)):
+            if n in self.inplace_pattern:
+                stor[0] = inputs[self.inplace_pattern[n]]
+            else:
+                args.append(ensure_allocated(stor, out_shape, out.type.dtype))

-        # the dict call is there to avoid syntax error in python <= 2.5
-        k(*(inps+outs), **dict(broadcast=True))
+        # the dict call is there to avoid a syntax error in python < 2.6
+        node._cache_elemwise_k(*args, **dict(broadcast=True))
+        if config.gpuarray.sync:
+            output_storage[0][0].sync()

-        for o, og in zip(out, outs):
-            o[0] = og

 class SupportCodeError(Exception):
    """
    We do not support certain things (such as the C++ complex struct)
    """
+
+
+class GpuDimShuffle(HideC, DimShuffle):
+    def make_node(self, input):
+        res = DimShuffle.make_node(self, input)
+        otype = GpuArrayType(dtype=res.outputs[0].type.dtype,
+                             broadcastable=res.outputs[0].type.broadcastable)
+        input = as_gpuarray_variable(input)
+        return Apply(self, [input], [otype()])
+
+    def __str__(self):
+        if self.inplace:
+            s = "InplaceGpuDimShuffle{%s}"
+        else:
+            s = "GpuDimShuffle{%s}"
+        return s % (','.join(str(x) for x in self.new_order))
+
+    def perform(self, node, inp, out):
+        input, = inp
+        storage, = out
+
+        res = input
+
+        res = res.transpose(self.shuffle+self.drop)
+
+        shape = list(res.shape[:len(self.shuffle)])
+        for augm in self.augment:
+            shape.insert(augm, 1)
+        res = res.reshape(shape)
+
+        if not self.inplace:
+            res = res.copy()
+
+        storage[0] = res
+
+    def c_support_code_apply(self, node, name):
+        """Return C support code defining ``<name>_ax`` and ``<name>_f``.
+
+        ``<name>_ax`` holds the transpose axes (``self.shuffle`` followed by
+        ``self.drop``).  ``<name>_f`` transposes its argument with those
+        axes and reshapes the result to ``len(self.new_order)`` dimensions.
+        """
+        nd_out = len(self.new_order)
+
+        # One C assignment per output dimension: augmented dimensions get
+        # length 1, the others take the next dimension of the transposed
+        # array in order.
+        shape_stmts = []
+        src_dim = 0
+        for d in range(nd_out):
+            if d in self.augment:
+                shape_stmts.append("sh[%s] = 1;" % (d,))
+            else:
+                shape_stmts.append("sh[%s] = tmp->ga.dimensions[%s];" %
+                                   (d, src_dim))
+                src_dim += 1
+
+        axes = ', '.join(str(a) for a in (self.shuffle+self.drop))
+
+        return """
+        static const unsigned int %(name)s_ax[] = {%(shuffle)s};
+
+        static PyGpuArrayObject *%(name)s_f(PyGpuArrayObject *a) {
+            PyGpuArrayObject *res, *tmp;
+            size_t sh[%(nd_out)s];
+
+            tmp = pygpu_transpose(a, %(name)s_ax);
+            if (!tmp) return NULL;
+            %(copy_shape)s
+            res = pygpu_reshape(tmp, %(nd_out)s, sh, GA_ANY_ORDER, 1, -1);
+            Py_DECREF(tmp);
+            return res;
+        }
+        """ % dict(shuffle=axes, name=name, nd_out=nd_out,
+                   copy_shape='\n            '.join(shape_stmts))
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        """Return the C code that computes the output of this node.
+
+        The code checks that the input has ``len(self.input_broadcastable)``
+        dimensions, then calls the ``<name>_f`` helper.  When the op is not
+        inplace the helper's result is replaced by a ``pygpu_copy`` of it.
+        """
+        subs = dict(name=name, fail=sub['fail'], inp=inputs[0],
+                    out=outputs[0], nd=len(self.input_broadcastable))
+        code = """
+        PyGpuArrayObject *tmp = NULL;
+        if (%(inp)s->ga.nd != %(nd)s) {
+            PyErr_SetString(PyExc_TypeError, "input nd");
+            %(fail)s
+        }
+
+        Py_XDECREF(%(out)s);
+        %(out)s = %(name)s_f(%(inp)s);
+        if (%(out)s == NULL) {%(fail)s}
+        """ % subs
+
+        if self.inplace:
+            return code
+
+        # Not inplace: copy the helper's result and drop the reference to it.
+        return code + """
+            tmp = pygpu_copy(%(out)s, GA_ANY_ORDER);
+            Py_DECREF(%(out)s);
+            if (!tmp) {
+                %(out)s = NULL;
+                %(fail)s
+            }
+            %(out)s = tmp;
+            """ % subs
+
+    def c_code_cache_version(self):
+        # Version tag for the C code generated by this op.
+        return (3,)
+
+
+class GpuCAReduce(HideC, CAReduceDtype):
+    """Reduction of a GPU array along some or all of its axes.
+
+    The scalar op must have an ``identity`` attribute.  A reduction kernel
+    is only generated for ``Add`` and ``Mul`` scalar ops; any other op makes
+    ``generate_kernel`` raise ``NotImplementedError``.
+    """
+    def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
+        if not hasattr(scalar_op, 'identity'):
+            raise ValueError("No identity on scalar op")
+        CAReduceDtype.__init__(self, scalar_op, axis=axis, dtype=dtype,
+                               acc_dtype=acc_dtype)
+
+    def __str__(self):
+        if self.axis is None:
+            ax = ''
+        else:
+            ax = '{%s}' % (', '.join(str(x) for x in self.axis),)
+        return "GpuReduce{%s}%s" % (self.scalar_op, ax)
+
+    def make_node(self, input):
+        """Build the Apply node, using the op returned by
+        ``CAReduceDtype.make_node`` and a GPU output type."""
+        res = CAReduceDtype.make_node(self, input)
+        input = as_gpuarray_variable(input)
+        otype = GpuArrayType(dtype=res.outputs[0].dtype,
+                             broadcastable=res.outputs[0].broadcastable)
+
+        if res.op.axis is not None:
+            # since redux is just another way to describe what is in axis
+            # it doesn't need to be compared in __eq__ or __hash__
+            res.op.redux = [i in res.op.axis
+                            for i in range(len(input.type.broadcastable))]
+
+        return Apply(res.op, [input], [otype()])
+
+    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+        """Generate and cache the reduction kernel on the node (when at
+        least one axis is reduced) before building the thunk."""
+        if self.axis is not None:
+            redux = self.redux
+        else:
+            redux = [True] * node.inputs[0].ndim
+
+        acc_dtype = getattr(self, 'acc_dtype', None)
+        if acc_dtype is None:
+            # fall back to the output dtype for the accumulator
+            acc_dtype = node.outputs[0].type.dtype
+
+        if any(redux):
+            node._cache_reduction_k = self.generate_kernel(node, acc_dtype,
+                                                           redux)
+        return super(GpuCAReduce, self).make_thunk(node, storage_map,
+                                                   compute_map, no_recycling)
+
+    def generate_kernel(self, node, odtype, redux):
+        """Return a ``ReductionKernel`` for ``node`` with output dtype
+        ``odtype`` and reduction mask ``redux``."""
+        for op_cls, expr in ((scalar.basic.Add, "a + b"),
+                             (scalar.basic.Mul, "a * b")):
+            if isinstance(self.scalar_op, op_cls):
+                reduce_expr = expr
+                break
+        else:
+            raise NotImplementedError()
+        return ReductionKernel(pygpu.get_default_context(), odtype,
+                               self.scalar_op.identity, reduce_expr, redux,
+                               arguments=[make_argument(node.inputs[0], 'a')],
+                               init_nd=node.inputs[0].ndim
+        )
+
+    def perform(self, node, inp, out):
+        """Run the cached kernel, or copy the input with the output dtype
+        when no axis is reduced."""
+        input, = inp
+        output, = out
+        out_dtype = node.outputs[0].type.dtype
+
+        if self.axis is not None:
+            redux = self.redux
+        else:
+            redux = [True] * input.ndim
+
+        if not any(redux):
+            output[0] = pygpu.gpuarray.array(input, copy=True,
+                                             dtype=out_dtype)
+        else:
+            output[0] = node._cache_reduction_k(input).astype(copy=False,
+                                                              dtype=out_dtype)
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
-import theano, numpy
-from theano import tensor
+import copy
+import theano
+import numpy
+from theano import tensor, scalar
 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
                        Optimizer, toolbox, DestroyHandler,
@@ -8,8 +10,12 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
 from theano.gof.python25 import all, any
 from theano.sandbox.gpuarray.type import GpuArrayType

-from basic_ops import host_from_gpu, gpu_from_host, gpu_alloc
-from elemwise import GpuElemwise, _is_scalar
+from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
+                                               gpu_alloc, GpuReshape)
+from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
+                                              GpuDimShuffle, GpuCAReduce)
+from theano.sandbox.gpuarray.subtensor import GpuSubtensor
+from theano.sandbox.gpuarray.blas import GpuGemv, GpuGemm

 gpu_optimizer = EquilibriumDB()
 gpu_cut_copies = EquilibriumDB()
@@ -26,6 +32,7 @@ optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')

+
 def register_opt(*tags, **kwargs):
    def f(local_opt):
        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
@@ -35,6 +42,36 @@ def register_opt(*tags, **kwargs):

 register_opt()(theano.tensor.opt.local_track_shape_i)

+
+def op_lifter(OP):
+    """
+    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
+    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)
+
+    Decorator factory: the decorated ``maker(node)`` returns either an Op
+    (which is then applied to the node's inputs) or an already built
+    variable, or a false value to decline.  The result is wrapped in
+    ``host_from_gpu`` and turned into a local optimizer tracking ``OP``.
+    """
+    def f(maker):
+        def local_opt(node):
+            if type(node.op) is not OP:
+                return False
+            # This does not support nodes that have more than one output.
+            assert len(node.outputs) == 1
+            # either one of our inputs is on the gpu or
+            # all of our client are on the gpu
+            gpu_input = any([i.owner and i.owner.op == host_from_gpu
+                             for i in node.inputs])
+            if not (gpu_input or
+                    all([c != 'output' and c.op == gpu_from_host
+                         for c, idx in node.outputs[0].clients])):
+                return False
+            new_op = maker(node)
+            # This is needed as sometimes new_op inherit from OP.
+            if new_op and new_op != node.op:
+                if isinstance(new_op, theano.Op):
+                    return [host_from_gpu(new_op(*node.inputs))]
+                # suppose it is a variable on the GPU
+                return [host_from_gpu(new_op)]
+            return False
+        local_opt.__name__ = maker.__name__
+        return local_optimizer([OP])(local_opt)
+    return f
+
+
 class InputToGpuOptimizer(Optimizer):
    "Transfer the input to the gpu to start the rolling wave."

@@ -63,6 +100,7 @@ class InputToGpuOptimizer(Optimizer):
 gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
                    0, 'fast_run', 'fast_compile', 'merge')

+
 @local_optimizer([])
 def local_cut_gpu_host_gpu(node):
    if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu):
@@ -78,67 +116,117 @@ gpu_cut_copies.register('cut_gpua_constant_transfers',
 optdb['canonicalize'].register('local_cut_gpua_host_gpua',
                               local_cut_gpu_host_gpu, 'fast_run', 'gpuarray')

+
 @register_opt()
-@local_optimizer([tensor.Alloc])
+@op_lifter(tensor.Alloc)
 def local_gpualloc(node):
-    replace = False
-    if node.op == tensor.alloc:
-        if node.inputs[0].owner and node.inputs[0].owner.op == host_from_gpu:
-            replace = True
-        elif all([c != 'output' and c.op == gpu_from_host
-                  for c, idx in node.outputs[0].clients]):
-            replace = True
-        elif all([c != 'output' and c.op == tensor.join and
-                  all([i.owner and i.owner.op in [host_from_gpu, tensor.alloc]
-                       for i in c.inputs[1:]])
-                  for c, idx in node.outputs[0].clients]):
-            replace = True
-    if replace:
-        val = node.inputs[0]
-        shp = node.inputs[1:]
-        old_out = node.outputs[0]
-        val2 = tensor.shape_padleft(val, len(shp) - val.ndim)
-        new_out = host_from_gpu(gpu_alloc(val, *shp))
-        if new_out.type != old_out.type:
-            assert new_out.type.ndim == old_out.type.ndim
-            assert new_out.type.dtype == old_out.type.dtype
-            for b_old, b_new in zip(old_out.type.broadcastable,
-                                    new_out.type.broadcastable):
-                assert b_new or (not b_old)
-            new_out = tensor.patternbroadcast(new_out. old_out.broadcastable)
-
-        return [new_out]
+    # op_lifter applies the returned Op to the node's inputs and wraps the
+    # result in host_from_gpu, so returning the Op itself is enough here.
+    return gpu_alloc
+

 @register_opt()
-@local_optimizer([])
+@op_lifter(tensor.Reshape)
+def local_gpureshape(node):
+    """Lift a host Reshape to a GpuReshape with the same ``ndim``.
+
+    When the host op has a name, the GPU op gets that name prefixed with
+    ``'Gpu'``; otherwise the (false) name is passed through unchanged.
+    """
+    op = node.op
+    name = op.name
+    if name:
+        name = 'Gpu' + name
+    # Pass the prefixed name (not op.name), as local_gpu_elemwise does;
+    # otherwise the prefix computed above is silently discarded.
+    res = GpuReshape(op.ndim, name)
+    return res
+
+
+@register_opt()
+@op_lifter(tensor.Flatten)
+def local_gpuflatten(node):
+    """Lift a host Flatten with ``outdim == 1`` to a GpuReshape to ``[-1]``.
+
+    Any other ``outdim`` is declined by returning None.
+    """
+    outdim = node.op.outdim
+    if outdim != 1:
+        return None
+    reshape_op = GpuReshape(outdim, None)
+    return reshape_op(node.inputs[0], theano.tensor.constant([-1]))
+
+
+@register_opt()
+@op_lifter(tensor.Elemwise)
 def local_gpu_elemwise(node):
-    do_replace = False
-    gpu_out = False
-    # check for gpu_from_host(Elemwise)) and extract the Elemwise node
-    if node.op == gpu_from_host:
-        host_i, = node.inputs
-        if (host_i.owner and
-            isinstance(host_i.owner.op, tensor.Elemwise) and
-            len(host_i.clients) == 1):
-            node = host_i.owner
-            do_replace = True
-            gpu_out = True
-    # check for elemwise(..., host_from_gpu, ...)
-    if isinstance(node.op, tensor.Elemwise):
-        if numpy.any([i.owner and
-                      i.owner.op == host_from_gpu
-                      for i in node.inputs]):
-                do_replace = True
-    if numpy.all([_is_scalar(i)
-                  for i in node.inputs]):
-            do_replace = False
-
-    if do_replace:
-        new_op = GpuElemwise(node.op.scalar_op)
-        gpu_elemwise = new_op(*(gpu_from_host(i) for i in node.inputs))
-        if gpu_out:
-            return [gpu_elemwise]
-        else:
-            return [host_from_gpu(gpu_elemwise)]
-    else:
-        return False
+    # Rebuild the host Elemwise as a GpuElemwise with the same scalar op,
+    # nfunc_spec and a copy of the inplace pattern; a named op keeps its
+    # name with a 'Gpu' prefix.
+    op = node.op
+    name = op.name
+    if name:
+        name = 'Gpu'+name
+    res = GpuElemwise(op.scalar_op, name=name,
+                      inplace_pattern=copy.copy(op.inplace_pattern),
+                      nfunc_spec=op.nfunc_spec)
+    return res
+
+
+def max_inputs_to_GpuElemwise(node):
+    """Return how many inputs fit in the kernel argument budget.
+
+    The budget is ``argument_limit`` bytes.  The fixed part (element count,
+    shape and every output) is subtracted and the remainder is divided by
+    the per-input cost, which depends on the number of dimensions of the
+    node's first input.
+    """
+    # we take the limit from CUDA for now
+    argument_limit = 232
+    ptr_size = 8
+    int_size = 4
+    ndim = node.inputs[0].type.ndim
+
+    # cost of one array argument: ptr_size plus int_size per dimension
+    bytes_per_array = ptr_size + int_size * ndim
+    # number of elements and shape, plus one array argument per output
+    fixed_bytes = int_size * (ndim + 1) + bytes_per_array * len(node.outputs)
+
+    return (argument_limit - fixed_bytes) // bytes_per_array
+
+# Elemwise fusion for GpuElemwise, with the number of fused inputs bounded by
+# max_inputs_to_GpuElemwise; registered in optdb at position 71.00.
+gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
+    GpuElemwise,
+    max_inputs_to_GpuElemwise)
+optdb.register('gpua_elemwise_fusion',
+               tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00,
+               'fast_run', 'fusion', 'local_elemwise_fusion', 'gpuarray')
+
+# Inplace optimizer for GpuElemwise; registered in optdb at position 75.
+inplace_gpu_elemwise_opt = tensor.opt.inplace_elemwise_optimizer_op(
+    GpuElemwise)
+optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
+               'inplace_elemwise_optimizer', 'fast_run', 'inplace', 'gpuarray')
+
+
+@register_opt()
+@op_lifter(tensor.DimShuffle)
+def local_gpua_dimshuffle(node):
+    """Lift a host DimShuffle to a GpuDimShuffle with the same input
+    broadcastable pattern and new order."""
+    op = node.op
+    return GpuDimShuffle(op.input_broadcastable, op.new_order)
+
+
+@register_opt()
+@op_lifter(tensor.SpecifyShape)
+def local_gpua_specifyShape(node):
+    # Returns the host specify_shape op itself; no GPU-specific op is built
+    # here.  NOTE(review): presumably specify_shape accepts GPU variables
+    # directly -- confirm.
+    return tensor.specify_shape
+
+
+@register_opt()
+@op_lifter(tensor.Subtensor)
+def local_gpua_subtensor(node):
+    """Lift a host Subtensor to a GpuSubtensor with the same ``idx_list``."""
+    idx_list = node.op.idx_list
+    return GpuSubtensor(idx_list)
+
+
+@register_opt()
+@op_lifter(tensor.CAReduce)
+def local_gpua_careduce(node):
+    """Lift a host CAReduce to a GpuCAReduce for Add and Mul scalar ops.
+
+    Any other scalar op is declined by returning None.  ``dtype`` and
+    ``acc_dtype`` are forwarded when the host op has them, else None.
+    """
+    op = node.op
+    if not isinstance(op.scalar_op, (scalar.basic.Add, scalar.basic.Mul)):
+        return None
+    return GpuCAReduce(op.scalar_op, axis=op.axis,
+                       dtype=getattr(op, 'dtype', None),
+                       acc_dtype=getattr(op, 'acc_dtype', None))
+
+@register_opt()
+@op_lifter(tensor.blas.Gemv)
+def local_gpua_gemv(node):
+    """Lift a host Gemv to a GpuGemv with the same ``inplace`` flag."""
+    inplace = node.op.inplace
+    return GpuGemv(inplace=inplace)
+
+@register_opt()
+@op_lifter(tensor.blas_c.CGemv)
+def local_gpua_gemv2(node):
+    """Lift a host CGemv to a GpuGemv with the same ``inplace`` flag."""
+    inplace = node.op.inplace
+    return GpuGemv(inplace=inplace)
+
+@register_opt()
+@op_lifter(tensor.blas.Gemm)
+def local_gpua_gemm(node):
+    """Lift a host Gemm to a GpuGemm with the same ``inplace`` flag."""
+    inplace = node.op.inplace
+    return GpuGemm(inplace=inplace)
--- a/theano/sandbox/gpuarray/subtensor.py
+++ b/theano/sandbox/gpuarray/subtensor.py
+import StringIO
+
+import numpy
+
+import theano
+from theano import tensor, gof
+from theano.tensor.subtensor import Subtensor, get_idx_list
+
+from theano.gof.python25 import all, any
+
+try:
+    import pygpu
+    from pygpu import gpuarray
+except ImportError:
+    pass
+
+from theano.sandbox.gpuarray.type import GpuArrayType
+from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
+
+
+class GpuSubtensor(HideC, Subtensor):
+    """Slice/integer indexing of a GPU array.
+
+    Node construction is delegated to ``Subtensor.make_node``.  The Python
+    implementation indexes the array with ``__getitem__``; the C
+    implementation normalizes every slice at run time with ``fix_indices``
+    and then calls ``pygpu_index``.
+    """
+
+    def make_node(self, x, *inputs):
+        """Build the Apply node with a GPU input and a GPU output type whose
+        dtype and broadcastable pattern come from ``Subtensor.make_node``."""
+        rval = tensor.Subtensor.make_node(self, x, *inputs)
+        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
+                             broadcastable=rval.outputs[0].type.broadcastable)
+        x = as_gpuarray_variable(x)
+        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])
+
+    def perform(self, node, inputs, out_):
+        """Index ``inputs[0]`` and store the result in ``out_[0][0]``."""
+        out, = out_
+        x = inputs[0]
+        if self.perform_cache_cdata is not None:
+            out[0] = x.__getitem__(self.perform_cache_cdata)
+            return
+
+        cdata = get_idx_list(inputs, self.idx_list)
+        if len(cdata) == 1:
+            cdata = cdata[0]
+        # With no run-time index inputs the index is constant: cache it.
+        if len(inputs) == 1:
+            self.perform_cache_cdata = cdata
+
+        out[0] = x.__getitem__(cdata)
+
+    def c_support_code(self):
+        """Return the C helper ``fix_indices`` that normalizes one slice.
+
+        ``start_n``/``stop_n``/``step_n`` flag the parts of the slice that
+        were omitted (None); ``len`` is the length of the indexed dimension.
+        """
+        return """
+        static int fix_indices(ssize_t *start, ssize_t *stop, ssize_t *step,
+                               int start_n, int stop_n, int step_n,
+                               size_t len) {
+            if (step_n) *step = 1;
+            if (*step == 0) {
+                PyErr_SetString(PyExc_ValueError, "slice step cannot be zero");
+                return -1;
+            }
+            if (start_n) *start = (*step < 0) ? len-1 : 0;
+            else {
+                if (*start < 0) *start += len;
+                if (*start < 0) *start = (*step < 0) ? -1 : 0;
+                /* Only compare non-negative values against the unsigned
+                   length: -1 would convert to SIZE_MAX and be re-clamped. */
+                if (*start >= 0 && (size_t)*start >= len)
+                    *start = (*step < 0) ? len-1 : len;
+            }
+
+            if (stop_n) *stop = (*step < 0) ? -1 : len;
+            else {
+                if (*stop < 0) *stop += len;
+                if (*stop < 0) *stop = (*step < 0) ? -1 : 0;
+                if (*stop >= 0 && (size_t)*stop >= len)
+                    *stop = (*step < 0) ? len-1 : len;
+            }
+            if (*stop < *start && *step > 0)
+                *stop = *start;
+            return 0;
+        }
+        """
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        """Return the C code that fills ``starts``/``stops``/``steps`` for
+        every input dimension and calls ``pygpu_index``."""
+        inp_ndim = node.inputs[0].ndim
+        inp = inputs[0]
+        # C names of the run-time index inputs, consumed in order below.
+        indices = inputs[1:]
+
+        # pad out the index list to the same dimension as the input
+        idx_list = self.idx_list + \
+            ((slice(None),) * (inp_ndim - len(self.idx_list)))
+
+        # This case fails when we use pygpu_index(), so here is some
+        # special code
+        if len(idx_list) == 0:
+            return """
+        Py_XDECREF(%(out)s);
+        %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
+        if (!%(out)s) { %(fail)s }
+""" % dict(out=outputs[0], inp=inp, fail=sub['fail'])
+
+        sio = StringIO.StringIO()
+        print >> sio, """
+        ssize_t starts[%(sz)s];
+        ssize_t stops[%(sz)s];
+        ssize_t steps[%(sz)s];
+        ssize_t cur;
+        int err;
+
+        if (%(inp)s->ga.nd != %(sz)s) {
+            PyErr_SetString(PyExc_IndexError, "invalid index");
+            %(fail)s
+        }
+        """ % dict(sz=len(idx_list), inp=inp, fail=sub['fail'])
+
+        def fix_idx(idx):
+            # Returns (C expression, is_none flag) for one slice component.
+            if idx is None:
+                return "0", 1
+            elif isinstance(idx, (numpy.integer, int)):
+                return str(idx), 0
+            elif isinstance(idx, gof.Type):
+                return indices.pop(0), 0
+            else:
+                assert 0, idx
+
+        for i, idx in enumerate(idx_list):
+            if isinstance(idx, slice):
+                start, start_n = fix_idx(idx.start)
+                stop, stop_n = fix_idx(idx.stop)
+                step, step_n = fix_idx(idx.step)
+                print >>sio, """
+                starts[%(i)s] = %(start)s;
+                stops[%(i)s] = %(stop)s;
+                steps[%(i)s] = %(step)s;
+                if (fix_indices(&starts[%(i)s], &stops[%(i)s], &steps[%(i)s],
+                                %(start_n)s, %(stop_n)s, %(step_n)s,
+                                %(inp)s->ga.dimensions[%(i)s]) == -1) {
+                    %(fail)s
+                }
+                """ % dict(i=i, start=start, stop=stop, step=step,
+                           start_n=start_n, stop_n=stop_n, step_n=step_n,
+                           fail=sub['fail'], inp=inp)
+            else:
+                if isinstance(idx, gof.Type):
+                    start = indices.pop(0)
+                elif isinstance(idx, (numpy.integer, int)):
+                    start = idx
+                else:
+                    assert 0, idx
+                # Integer index: a step of 0 marks the dimension as indexed
+                # by a single element rather than sliced.
+                print >>sio, """
+                cur = %(start)s;
+                if (cur < 0)
+                    cur += %(inp)s->ga.dimensions[%(i)s];
+                starts[%(i)s] = cur;
+                steps[%(i)s] = 0;
+                """ % dict(i=i, start=start, fail=sub['fail'], inp=inp)
+
+        print >>sio, """
+        Py_XDECREF(%(out)s);
+        %(out)s = pygpu_index(%(inp)s, starts, stops, steps);
+        if (!%(out)s) { %(fail)s }
+""" % dict(name=name, fail=sub['fail'], inp=inp, out=outputs[0])
+
+        return sio.getvalue()
+
+    def c_code_cache_version(self):
+        # Bumped because fix_indices changed, so that modules compiled from
+        # the previous C code are not reused.
+        return (6,)
--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
@@ -5,8 +5,9 @@ from copy import copy, deepcopy
 import numpy
 import theano
 import theano.tensor as T
-from theano.compile import DeepCopyOp
-from theano.tensor.tests.test_basic import safe_make_node
+from theano.tensor import TensorType
+from theano.tensor.basic import alloc
+from theano.tensor.tests.test_basic import rand, safe_make_node, T_reshape
 from theano.tests.unittest_tools import SkipTest
 from numpy.testing.noseclasses import KnownFailureTest

@@ -33,7 +34,8 @@ from theano.sandbox.gpuarray.type import (GpuArrayType,
                                          gpuarray_shared_constructor)
 from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
                                               gpu_alloc, gpu_from_cuda,
-                                               cuda_from_gpu)
+                                               cuda_from_gpu, HostFromGpu,
+                                               GpuFromHost, GpuReshape)

 from theano.tests import unittest_tools as utt
 utt.seed_rng()
@@ -42,11 +44,10 @@ rng = numpy.random.RandomState(seed=utt.fetch_seed())
 from pygpu import gpuarray

 if theano.config.mode == 'FAST_COMPILE':
-    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray')
-    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray'\
-)
+    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
+    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
 else:
-    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray')
+    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')


@@ -88,139 +89,110 @@ def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs):
 def rand_gpuarray(*shape, **kwargs):
+    # Random GPU array of the given shape with values in [-1, 1).
+    # Accepts the keyword arguments `dtype` (default: floatX) and `cls`;
+    # any other keyword raises TypeError.
    r = rng.rand(*shape) * 2 - 1
    dtype = kwargs.pop('dtype', theano.config.floatX)
+    cls = kwargs.pop('cls', None)
    if len(kwargs) != 0:
-        raise TypeError('Unexpected argument %s', kwargs.keys()[0])
+        raise TypeError('Unexpected argument %s' % kwargs.keys()[0])
-    return gpuarray.array(r, dtype=dtype)
+    return gpuarray.array(r, dtype=dtype, cls=cls)


-def makeTester(name, op, expected, good=None, bad_build=None, checks=None,
-               bad_runtime=None, mode=None, skip=False, eps=1e-10):
-    if good is None:
-        good = {}
-    if bad_build is None:
-        bad_build = {}
-    if bad_runtime is None:
-        bad_runtime = {}
+def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
+               mode_nogpu=mode_without_gpu, skip=False, eps=1e-10):
+    """Build a TestCase class comparing `op` with and without the GPU.
+
+    For every ``testname -> inputs`` entry of `cases`, `op` is built twice
+    on shared variables and compiled with `mode_nogpu` (reference) and
+    `mode_gpu` (test).  The test function must contain `gpu_op`, and its
+    outputs must match the reference in dtype, shape and approximate value.
+    If the reference call raises, the test call must raise an exception of
+    the same type.  `checks` maps a description to a predicate called with
+    ``(inputs, outputs)``.  `skip`, when true, is the reason passed to
+    SkipTest.  `eps` is accepted but not used in this function.
+    """
    if checks is None:
        checks = {}

    _op = op
-    _expected = expected
-    _good = good
-    _bad_build = bad_build
-    _bad_runtime = bad_runtime
+    _gpu_op = gpu_op
+    _cases = cases
    _skip = skip
    _checks = checks

-    class Checker(unittest.TestCase):
+    class Checker(unittest.TestCase, utt.TestOptimizationMixin):
        op = staticmethod(_op)
-        expected = staticmethod(_expected)
-        good = _good
-        bad_build = _bad_build
-        bad_runtime = _bad_runtime
+        gpu_op = staticmethod(_gpu_op)
+        cases = _cases
        skip = _skip
        checks = _checks

        def setUp(self):
            eval(self.__class__.__module__ + '.' + self.__class__.__name__)

-        def test_good(self):
+        def test_all(self):
            if skip:
                raise SkipTest(skip)

-            for testname, inputs in good.items():
-                inputs = [copy(input) for input in inputs]
-                inputrs = [fake_shared(input) for input in inputs]
+            for testname, inputs in cases.items():
+                self.run_case(testname, inputs)

-                try:
-                    node = safe_make_node(self.op, *inputrs)
-                except Exception, exc:
-                    err_msg = ("Test %s::%s: Error occured while making "
-                               "a node with inputs %s") % (self.op, testname,
-                                                           inputs)
-                    exc.args += (err_msg,)
-                    raise
+        def run_case(self, testname, inputs):
+            inputs_ref = [theano.shared(inp) for inp in inputs]
+            inputs_tst = [theano.shared(inp) for inp in inputs]

-                try:
-                    f = inplace_func([], node.outputs, mode=mode,
-                                     name='test_good')
-                except Exception, exc:
-                    err_msg = ("Test %s::%s: Error occured while trying to "
-                               "make a Function") % (self.op, testname)
-                    exc.args += (err_msg,)
-                    raise
+            try:
+                node_ref = safe_make_node(self.op, *inputs_ref)
+                node_tst = safe_make_node(self.op, *inputs_tst)
+            except Exception, exc:
+                err_msg = ("Test %s::%s: Error occured while making "
+                           "a node with inputs %s") % (self.gpu_op, testname,
+                                                       inputs)
+                exc.args += (err_msg,)
+                raise

-                if isinstance(self.expected, dict) and \
-                        testname in self.expected:
-                    expecteds = self.expected[testname]
-                else:
-                    expecteds = self.expected(*inputs)
-
-                if not isinstance(expecteds, (list, tuple)):
-                    expecteds = (expecteds,)
-
-                try:
-                    variables = f()
-                except Exception, exc:
-                    err_msg = ("Test %s::%s: Error occured while calling "
-                               "the Function on the inputs %s") % (self.op,
-                                                                   testname,
-                                                                   inputs)
-                    exc.args += (err_msg,)
-                    raise
+            try:
+                f_ref = inplace_func([], node_ref.outputs, mode=mode_nogpu)
+                f_tst = inplace_func([], node_tst.outputs, mode=mode_gpu)
+            except Exception, exc:
+                err_msg = ("Test %s::%s: Error occured while trying to "
+                           "make a Function") % (self.gpu_op, testname)
+                exc.args += (err_msg,)
+                raise

-                for i, (variable, expected) in \
-                        enumerate(izip(variables, expecteds)):
-                    if variable.dtype != expected.dtype or \
-                            variable.shape != expected.shape or \
-                            not GpuArrayType.values_eq_approx(variable,
-                                                             expected):
-                        self.fail(("Test %s::%s: Output %s gave the wrong "
-                                   "value. With inputs %s, expected %s "
-                                   "(dtype %s), got %s (dtype %s).") % (
-                                self.op, testname, i, inputs, expected,
-                                expected.dtype, variable, variable.dtype))
-
-                for description, check in self.checks.items():
-                    if not check(inputs, variables):
-                        self.fail(("Test %s::%s: Failed check: %s "
-                                   "(inputs were %s, ouputs were %s)") %
-                                  (self.op, testname, description,
-                                   inputs, variables))
-
-        def test_bad_build(self):
-            if skip:
-                raise SkipTest(skip)
-            for testname, inputs in self.bad_build.items():
-                inputs = [copy(input) for input in inputs]
-                inputrs = [fake_shared(input) for input in inputs]
-                self.assertRaises(Exception, safe_make_node, self.op, *inputrs)
+            self.assertFunctionContains1(f_tst, self.gpu_op)

-        def test_bad_runtime(self):
-            if skip:
-                raise SkipTest(skip)
-            for testname, inputs in self.bad_runtime.items():
-                inputrs = [fake_shared(input) for input in inputs]
-                try:
-                    node = safe_make_node(self.op, *inputrs)
-                except Exception, exc:
-                    err_msg = ("Test %s::%s: Error occured while trying to "
-                               "make a node with inputs %s") % (self.op,
-                                                                testname,
-                                                                inputs)
-                    exc.args += (err_msg,)
-                    raise
+            ref_e = None
+            try:
+                expecteds = f_ref()
+            except Exception, exc:
+                ref_e = exc

-                try:
-                    f = inplace_func([], node.outputs, mode=mode,
-                                     name="test_bad_runtime")
-                except Exception, exc:
-                    err_msg = ("Test %s::%s: Error occured while trying to "
-                               "make a Function") % (self.op, testname)
+            try:
+                variables = f_tst()
+            except Exception, exc:
+                if ref_e is None:
+                    err_msg = ("Test %s::%s: exception when calling the "
+                               "Function") % (self.gpu_op, testname)
                    exc.args += (err_msg,)
                    raise
-
-                self.assertRaises(Exception, f, [])
+                else:
+                    # if we raised an exception of the same type we're good.
+                    if isinstance(exc, type(ref_e)):
+                        return
+                    else:
+                        err_msg = ("Test %s::%s: exception raised during test "
+                                   "call was not the same as the reference "
+                                   "call (got: %s, expected %s)") % \
+                                   (self.gpu_op, testname, type(exc),
+                                    type(ref_e))
+                        exc.args += (err_msg,)
+                        raise
+
+            if ref_e is not None:
+                # The reference call raised but the test call did not:
+                # `expecteds` is unbound, so fail explicitly instead of
+                # hitting a NameError in the comparison loop below.
+                self.fail(("Test %s::%s: reference call raised %s but the "
+                           "test call did not raise") %
+                          (self.gpu_op, testname, type(ref_e)))
+
+            for i, (variable, expected) in \
+                    enumerate(izip(variables, expecteds)):
+                if variable.dtype != expected.dtype or \
+                        variable.shape != expected.shape or \
+                        not TensorType.values_eq_approx(variable,
+                                                        expected):
+                    self.fail(("Test %s::%s: Output %s gave the wrong "
+                               "value. With inputs %s, expected %s "
+                               "(dtype %s), got %s (dtype %s).") % (
+                            self.op, testname, i, inputs, expected,
+                            expected.dtype, variable, variable.dtype))
+
+            for description, check in self.checks.items():
+                if not check(inputs, variables):
+                    self.fail(("Test %s::%s: Failed check: %s "
+                               "(inputs were %s, ouputs were %s)") %
+                              (self.op, testname, description,
+                               inputs, variables))

    Checker.__name__ = name
    return Checker
@@ -300,31 +272,37 @@ def gpu_alloc_expected(x, *shp):

 GpuAllocTester = makeTester(
    name="GpuAllocTester",
-    op=gpu_alloc,
-    expected=gpu_alloc_expected,
-    good=dict(
-        correct01=(rand_gpuarray(), numpy.int32(7)),
-        correct01_bcast=(rand_gpuarray(1), numpy.int32(7)),
-        correct02=(rand_gpuarray(), numpy.int32(4), numpy.int32(7)),
-        correct12=(rand_gpuarray(7), numpy.int32(4), numpy.int32(7)),
-        correct13=(rand_gpuarray(7), numpy.int32(2), numpy.int32(4),
+    op=alloc,
+    gpu_op=gpu_alloc,
+    cases=dict(
+        correct01=(rand(), numpy.int32(7)),
+# just gives a DeepCopyOp with possibly wrong results on the CPU
+#        correct01_bcast=(rand(1), numpy.int32(7)),
+        correct02=(rand(), numpy.int32(4), numpy.int32(7)),
+        correct12=(rand(7), numpy.int32(4), numpy.int32(7)),
+        correct13=(rand(7), numpy.int32(2), numpy.int32(4),
+                   numpy.int32(7)),
+        correct23=(rand(4, 7), numpy.int32(2), numpy.int32(4),
                   numpy.int32(7)),
-        correct23=(rand_gpuarray(4, 7), numpy.int32(2), numpy.int32(4),
-                   numpy.int32(7))
-        ),
-    bad_runtime=dict(
-        bad_shape12=(rand_gpuarray(7), numpy.int32(7), numpy.int32(5)),
+        bad_shape12=(rand(7), numpy.int32(7), numpy.int32(5)),
        )
 )

-def test_deep_copy():
-    a = rand_gpuarray(20, dtype='float32')
-    g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
-
-    f = theano.function([g], g)
-
-    assert isinstance(f.maker.fgraph.toposort()[0].op, DeepCopyOp)
-
-    res = f(a)

-    assert GpuArrayType.values_eq(res, a)
+class G_reshape(T_reshape):
+    def shortDescription(self):
+        return None
+
+    def __init__(self, name):
+        T_reshape.__init__(self, name,
+                           shared=gpuarray_shared_constructor,
+                           op=GpuReshape,
+                           mode=mode_with_gpu,
+                           # avoid errors with limited devices
+#                             dtype='float32',
+                             ignore_topo=(HostFromGpu, GpuFromHost,
+                                          theano.compile.DeepCopyOp,
+                                          theano.sandbox.gpuarray.elemwise.GpuElemwise,
+                                          theano.tensor.opt.Shape_i,
+                                          theano.tensor.opt.MakeVector))
+        assert self.op == GpuReshape
--- a/theano/sandbox/gpuarray/tests/test_blas.py
+++ b/theano/sandbox/gpuarray/tests/test_blas.py
+from unittest import TestCase
+
+from theano.tensor.blas import gemv_inplace, gemm_inplace
+
+from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand
+
+from theano.sandbox.gpuarray.blas import (gpugemv_inplace,
+                                          gpugemm_inplace)
+
+# Tester class generated by makeTester: each case is run through the
+# reference op (gemv_inplace) and the GPU op (gpugemv_inplace).
+# NOTE(review): each case presumably lists the gemv arguments as
+# [y, alpha, A, x, beta] -- confirm against theano.tensor.blas.
+GpuGemvTester = makeTester('GpuGemvTester',
+                           op=gemv_inplace, gpu_op=gpugemv_inplace,
+                           cases=dict(
+        dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
+        dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
+        # The three cases below use inputs with a zero-sized dimension and
+        # are currently disabled.
+#        test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
+#        test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
+#        test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
+        # Same shapes as dot_vm, but every array is reversed with [::-1].
+        test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
+        )
+)
+
+# Tester class generated by makeTester: each case is run through the
+# reference op (gemm_inplace) and the GPU op (gpugemm_inplace).
+# All cases use the same array shapes; only the two scalar arguments vary
+# (second argument in {1.0, 0.0, -1.0}, last argument in {0.0, 1.0, 0.6,
+# -1.0}).
+# NOTE(review): the argument order is presumably [z, alpha, x, y, beta] --
+# confirm against theano.tensor.blas.
+GpuGemmTester = makeTester('GpuGemmTester',
+                           op=gemm_inplace, gpu_op=gpugemm_inplace,
+                           cases=dict(
+        test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
+        test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
+        test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
+        test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
+        test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
+        test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
+        test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
+        test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.0],
+        test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.0],
+        )
+)
--- a/theano/sandbox/gpuarray/tests/test_elemwise.py
+++ b/theano/sandbox/gpuarray/tests/test_elemwise.py
+import unittest
+
+from theano import scalar, gof
+from theano.gof import FunctionGraph
+from theano.gof.python25 import all, any
+from theano.tests.unittest_tools import SkipTest
+
+from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
+                                               test_CAReduce)
+
+from theano.sandbox.gpuarray.tests.test_basic_ops import rand_gpuarray
+from theano.sandbox.gpuarray.elemwise import (GpuElemwise, GpuDimShuffle,
+                                              GpuCAReduce)
+from theano.sandbox.gpuarray.type import GpuArrayType
+
+from pygpu.array import gpuarray
+
+# This is actually a test for GpuElemwise
+class test_gpu_Broadcast(test_Broadcast):
+    # Re-runs the inherited test_Broadcast cases with the GPU op and type
+    # substituted; `op`, `type` and `rand_val` are what the perform-linker
+    # tests of the base class pass to with_linker()/with_linker_inplace().
+    op = GpuElemwise
+    type = GpuArrayType
+    
+    def rand_val(self, shp):
+        # Random test value of shape `shp`, built by rand_gpuarray with
+        # cls=gpuarray.
+        return rand_gpuarray(*shp, **dict(cls=gpuarray))
+
+    # no c_code() yet
+    # (so cop, ctype and rand_cval, used by the C-linker tests, keep the
+    # values inherited from test_Broadcast)
+    #cop = GpuElemwise
+    #ctype = GpuArrayType
+
+    #def rand_cval(self, shp):
+    #    return rand_gpuarray(*shp, **dict(cls=gpuarray))
+
+
+class test_GpuDimShuffle(test_DimShuffle):
+    # Re-runs the inherited test_DimShuffle cases, which build the op
+    # through self.op, with GpuDimShuffle as the op under test.
+    op = GpuDimShuffle
+
+class test_GpuCAReduce(test_CAReduce):
+    # Runs the inherited test_CAReduce.with_linker() checks against
+    # GpuCAReduce, restricted to the dtypes and scalar reductions below.
+    dtypes = ["float32"]
+    bin_dtypes = ["uint8", "int8"]
+    op = GpuCAReduce
+    reds = [scalar.add, scalar.mul]
+
+    def test_perform(self):
+        # Every reduction in `reds` for every dtype in dtypes + bin_dtypes,
+        # using the Python (perform) linker.
+        for dtype in self.dtypes + self.bin_dtypes:
+            for op in self.reds:
+                self.with_linker(gof.PerformLinker(), op, dtype=dtype)
+
+    def test_perform_nan(self):
+        # Same as test_perform but with test_nan=True, and only for
+        # `dtypes` (bin_dtypes are not included).
+        for dtype in self.dtypes:
+            for op in self.reds:
+                self.with_linker(gof.PerformLinker(), op, dtype=dtype,
+                                 test_nan=True)
+
+    def test_c(self):
+        # Overridden to skip unconditionally.
+        raise SkipTest("no C code")
+
+    def test_c_nan(self):
+        # Overridden to skip unconditionally.
+        raise SkipTest("no C code")
--- a/theano/sandbox/gpuarray/tests/test_opt.py
+++ b/theano/sandbox/gpuarray/tests/test_opt.py
+import numpy
+
+import theano
+from theano.tests import unittest_tools as utt
+# SkipTest is raised below when pygpu is missing or disabled; without this
+# import those branches would fail with a NameError instead of skipping.
+from theano.tests.unittest_tools import SkipTest
+from theano.sandbox.gpuarray.basic_ops import GpuReshape
+import theano.sandbox.gpuarray
+
+# Skip the whole module when pygpu could not be imported.
+if theano.sandbox.gpuarray.pygpu is None:
+    raise SkipTest("pygpu not installed")
+
+# If the old CUDA backend is available and gpuarray has not been activated
+# yet, select a GPU through it (unless use.device_number is already set) and
+# initialise gpuarray on the 'cuda' device.
+import theano.sandbox.cuda as cuda_ndarray
+if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
+    if not cuda_ndarray.use.device_number:
+        cuda_ndarray.use('gpu')
+    theano.sandbox.gpuarray.init_dev('cuda')
+
+# Skip the whole module when gpuarray is still not activated at this point.
+if not theano.sandbox.gpuarray.pygpu_activated:
+    raise SkipTest("pygpu disabled")
+
+# Compilation modes with and without the 'gpuarray' optimizations ('gpu' is
+# excluded from the GPU mode).  Under FAST_COMPILE the modes are based on
+# FAST_RUN instead of the default mode.
+if theano.config.mode == 'FAST_COMPILE':
+    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
+    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
+else:
+    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
+    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
+
+
+def test_flatten():
+    # Flattening a float32 matrix under mode_with_gpu must give the same
+    # values and shape as numpy's flatten(), and the compiled graph must
+    # contain a GpuReshape node.
+    m = theano.tensor.fmatrix()
+    f = theano.function([m], m.flatten(), mode=mode_with_gpu)
+    val = numpy.random.rand(10,11).astype("float32")
+    res = f(val)
+    utt.assert_allclose(res, val.flatten())
+    assert res.shape == val.flatten().shape
+    assert GpuReshape in [type(node.op)
+                          for node in f.maker.fgraph.toposort()]
\ No newline at end of file
--- a/theano/sandbox/gpuarray/tests/test_subtensor.py
+++ b/theano/sandbox/gpuarray/tests/test_subtensor.py
+from theano.tensor.tests.test_subtensor import T_subtensor
+
+from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost)
+from theano.sandbox.gpuarray.subtensor import GpuSubtensor
+
+from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
+
+from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu
+
+from theano.compile import DeepCopyOp
+
+from theano import tensor
+
+class G_subtensor(T_subtensor):
+    # Runs the inherited T_subtensor tests with the gpuarray shared
+    # constructor, GpuSubtensor and mode_with_gpu.
+    def shortDescription(self):
+        # Always report no short description.
+        # NOTE(review): presumably so that test runners display the test id
+        # rather than a docstring inherited from T_subtensor -- confirm.
+        return None
+
+    def __init__(self, name):
+        # HostFromGpu, GpuFromHost and DeepCopyOp are passed as ignore_topo.
+        T_subtensor.__init__(self, name,
+                             shared=gpuarray_shared_constructor,
+                             sub=GpuSubtensor,
+                             mode=mode_with_gpu,
+                             # avoid errors with limited devices
+                             dtype='float32',
+                             ignore_topo=(HostFromGpu,GpuFromHost,DeepCopyOp))
+        # Sanity check that the base class kept the GPU op passed as `sub`.
+        assert self.sub == GpuSubtensor
--- a/theano/sandbox/gpuarray/tests/test_type.py
+++ b/theano/sandbox/gpuarray/tests/test_type.py
+import operator
+
+import theano
+from theano.compile import DeepCopyOp
+
+from theano.sandbox.gpuarray.tests.test_basic_ops import rand_gpuarray
+
+from theano.sandbox.gpuarray.type import GpuArrayType
+
+def test_deep_copy():
+    # Compiling the identity function on a GpuArrayType input must produce a
+    # graph whose first node is a DeepCopyOp, and calling it must return a
+    # value equal to the input.
+    a = rand_gpuarray(20, dtype='float32')
+    g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
+
+    f = theano.function([g], g)
+
+    assert isinstance(f.maker.fgraph.toposort()[0].op, DeepCopyOp)
+
+    res = f(a)
+
+    assert GpuArrayType.values_eq(res, a)
--- a/theano/sandbox/gpuarray/type.py
+++ b/theano/sandbox/gpuarray/type.py
 import numpy

 import theano
+from theano.tensor.var import _tensor_py_operators
 from theano import Type, Variable, Constant, tensor, config, scalar
 from theano.compile import SharedVariable

@@ -26,7 +27,10 @@ class GpuArrayType(Type):
        except gpuarray.GpuArrayException:
            raise TypeError("Unsupported dtype for %s: %s" %
                            (self.__class__.__name__, self.dtype))
-    
+
+    def __str__(self):
+        return "GpuArrayType(%s, %s)" % (self.dtype, self.broadcastable)
+
    def filter(self, data, strict=False, allow_downcast=None):
        if strict:
            if not isinstance(data, gpuarray.GpuArray):
@@ -103,8 +107,8 @@ class GpuArrayType(Type):
            return GpuArrayType.values_eq(a, b)
        else:
            res = elemwise2(a, '', b, a, odtype=numpy.dtype('bool'),
-                            op_tmpl="res[i] = ((%(a)s - %(b)s) <" \
-                                "(1e-8 + 1e-5 * fabs(%(b)s)))")
+                            op_tmpl="res[i] = ((%(a)s - %(b)s) <"
+                            "(1e-8 + 1e-5 * fabs(%(b)s)))")
            return numpy.asarray(res).all()

    def value_zeros(self, shape):
@@ -134,7 +138,7 @@ class GpuArrayType(Type):
            return numpy.dtype(self.dtype).itemsize

    def c_declare(self, name, sub):
-        return "GpuArrayObject *%s;" % (name,)
+        return "PyGpuArrayObject *%s;" % (name,)

    def c_init(self, name, sub):
        return "%s = NULL;" % (name,)
@@ -149,17 +153,17 @@ class GpuArrayType(Type):
        }
        /* First check if we are the base type exactly (the most common case),
           then do the full subclass check if needed. */
-        if (py_%(name)s->ob_type != &GpuArrayType &&
-            !PyObject_TypeCheck(py_%(name)s, &GpuArrayType)) {
+        if (py_%(name)s->ob_type != &PyGpuArrayType &&
+            !PyObject_TypeCheck(py_%(name)s, &PyGpuArrayType)) {
            PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
            %(fail)s
        }
-        %(name)s = (GpuArrayObject *)py_%(name)s;
+        %(name)s = (PyGpuArrayObject *)py_%(name)s;
        Py_INCREF(%(name)s);
        """ % {'name': name, 'fail': sub['fail']}

    def c_cleanup(self, name, sub):
-        return "Py_XDECREF(%(name)s); %(name)s = NULL;" % {'name': name }
+        return "Py_XDECREF(%(name)s); %(name)s = NULL;" % {'name': name}

    def c_sync(self, name, sub):
        return """
@@ -184,7 +188,8 @@ class GpuArrayType(Type):
        # We need arrayobject for the PyArrayDescr struct def
        # (even if we just use a pointer to it in a function def)
        return ['<compyte/array.h>', '<compyte/kernel.h>', '<compyte/error.h>',
-                '<numpy/arrayobject.h>', '<gpuarray_api.h>']
+                '<compyte/buffer_blas.h>', '<numpy/arrayobject.h>',
+                '<gpuarray_api.h>']

    def c_header_dirs(self):
        return [pygpu.get_include(), numpy.get_include()]
@@ -193,10 +198,13 @@ class GpuArrayType(Type):
        return ['compyte']

    def c_code_cache_version(self):
-        return (1,)
+        ver = pygpu.gpuarray.api_version()
+        # We only use the major version since minor revisions are
+        # API-compatible.
+        return (1, ver[0])


-class _operators(tensor.basic._tensor_py_operators):
+class _operators(_tensor_py_operators):
    def _as_TensorVariable(self):
        from basic_ops import host_from_gpu
        return host_from_gpu(self)
@@ -204,10 +212,6 @@ class _operators(tensor.basic._tensor_py_operators):
    def _as_GpuArrayVariable(self):
        return self

-    dtype = property(lambda s: s.type.dtype)
-    broadcastable = property(lambda s: s.type.broadcastable)
-    ndim = property(lambda s: s.type.ndim)
-

 class GpuArrayVariable(_operators, Variable):
    pass
@@ -276,12 +280,6 @@ theano.compile.register_view_op_c_code(GpuArrayType, """

 theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
    Py_XDECREF(%(oname)s);
-    %(oname)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
+    %(oname)s = pygpu_copy(%(iname)s, GA_ANY_ORDER);
    if (!%(oname)s) { %(fail)s }
-    int err;
-    err = GpuArray_copy(&%(oname)s->ga, &%(iname)s->ga, GA_ANY_ORDER);
-    if (err != GA_NO_ERROR) {
-        PyErr_SetString(PyExc_RuntimeError, "Error during copy");
-        %(fail)s
-    }
-""", version=(1,))
+""", version=(5,))
--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -3018,7 +3018,7 @@ class Composite(ScalarOp):
        rval = []
        for subnode in self.fgraph.toposort():
            try:
-                rval.append(subnode.op.c_support_code())
+                rval.append(subnode.op.c_support_code().strip())
            except gof.utils.MethodNotDefined:
                pass
        # remove duplicate code blocks

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -143,10 +143,6 @@ class DimShuffle(Op):

        # list of dimensions of the input to drop
        self.drop = []
-        # this maps i before dropping dimensions to j after dropping dimensions
-        # so self.shuffle can be set properly later on
-        i2j = {}
-        j = 0
        for i, b in enumerate(input_broadcastable):
            if i not in new_order:
                # we want to drop this dimension because it's not a value in
@@ -158,14 +154,9 @@ class DimShuffle(Op):
                    raise ValueError(
                            "You cannot drop a non-broadcastable dimension.",
                            (input_broadcastable, new_order))
-            else:
-                i2j[i] = j
-                j += 1

-        # transposition of non-broadcastable dimensions
-        # This is how the dimensions will be permuted, without accounting for
-        # the extra 'x' broadcastable dimensions to insert.
-        self.shuffle = [i2j[x] for x in new_order if x != 'x']
+        # this is the list of the original dimensions that we keep
+        self.shuffle = [x for x in new_order if x != 'x']

        # list of dimensions of the output that are broadcastable and were not
        # in the original input
@@ -237,16 +228,12 @@ class DimShuffle(Op):
        res = input
        if type(res) != numpy.ndarray and type(res) != numpy.memmap:
            raise TypeError(res)
-        shape = list(res.shape)
-        for drop in reversed(self.drop):
-            shape.pop(drop)
-        res = res.reshape(shape)

        # transpose
-        res = res.transpose(self.shuffle)
+        res = res.transpose(self.shuffle+self.drop)

        # augment
-        shape = list(res.shape)
+        shape = list(res.shape[:len(self.shuffle)])
        for augm in self.augment:
            shape.insert(augm, 1)
        res = res.reshape(shape)
@@ -259,9 +246,6 @@ class DimShuffle(Op):

    def infer_shape(self, node, shapes):
        ishp, = shapes
-        ishp = list(ishp)
-        for drop in reversed(self.drop):
-            del ishp[drop]
        # transpose
        rval = [ishp[i] for i in self.shuffle]


--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -410,9 +410,9 @@ def local_dimshuffle_lift(node):
    inode = input.owner
    if inode and isinstance(inode.op, Elemwise) and (len(input.clients) == 1):
        # Don't use make_node to have tag.test_value set.
-        ret = inode.op(*[DimShuffle(inp.type.broadcastable,
-                                    op.new_order,
-                                    op.inplace)(inp) for inp in
+        ret = inode.op(*[op.__class__(inp.type.broadcastable,
+                                      op.new_order,
+                                      op.inplace)(inp) for inp in
                         inode.inputs], **dict(return_list=True))
        return ret
    if inode and isinstance(inode.op, DimShuffle):
@@ -424,8 +424,8 @@ def local_dimshuffle_lift(node):
                                                   iinput.type.ndim):
            return [iinput]
        else:
-            ret = DimShuffle(iinput.type.broadcastable, new_order,
-                             inplace)(iinput, **dict(return_list=True))
+            ret = op.__class__(iinput.type.broadcastable, new_order,
+                               inplace)(iinput, **dict(return_list=True))
            return ret


@@ -460,7 +460,7 @@ def dimshuffle_as_view(node):
    op = node.op
    if not isinstance(op, DimShuffle) or op.inplace:
        return False
-    new_op = DimShuffle(op.input_broadcastable, op.new_order, inplace=True)
+    new_op = op.__class__(op.input_broadcastable, op.new_order, inplace=True)
    return [new_op(*node.inputs)]

 #Step 60 is the inplace optimization stage.
@@ -4609,7 +4609,7 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 1024):
        # worthwhile if the summation axis doesn't line up with a
        # contiguous dimension)

-        if not isinstance(node.op, OP):
+        if type(node.op) is not OP:
            return False
        inputs = []  # inputs of the new Elemwise op.
        s_inputs = []  # inputs of the new scalar op used by the Composite.

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -44,7 +44,7 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
        dtensor3, SpecifyShape, Mean,
        itensor3, Tile, switch, Diagonal, Diag,
        nonzero, flatnonzero, nonzero_values,
-        stacklists)
+        stacklists, DimShuffle)

 from theano.tests import unittest_tools as utt

@@ -4204,9 +4204,30 @@ class T_op_cache(unittest.TestCase):
        self.assertTrue(numpy.all(fn_py(a) == fn_c_or_py(a)))


-class T_reshape(unittest.TestCase):
-    def setUp(self):
-        utt.seed_rng()
+class T_reshape(utt.InferShapeTester, utt.TestOptimizationMixin):
+    def __init__(self, name, shared=tensor._shared, op=Reshape, mode=None,
+                 ignore_topo=(DeepCopyOp, opt.MakeVector,
+                              opt.Shape_i, DimShuffle, theano.tensor.Elemwise)):
+        self.shared = shared
+        self.op = op
+        #The tag canonicalize is needed for the shape test in FAST_COMPILE
+        self.mode = mode
+        self.ignore_topo = ignore_topo
+        return super(T_reshape, self).__init__(name)
+
+    def function(self, inputs, outputs):
+        f = function(inputs, outputs, mode=self.mode)
+        if self.mode is not None or theano.config.mode != "FAST_COMPILE":
+            topo = f.maker.fgraph.toposort()
+            topo_ = [node for node in topo if not isinstance(node.op,
+                                                             self.ignore_topo)]
+            assert len(topo_) == 1, topo_
+        return f
+
+    def eval_output_and_check(self, t):
+        f = self.function([], t)
+        tval = f()
+        return tval

    def test_reshape(self):
        a = dvector()
@@ -4215,7 +4236,7 @@ class T_reshape(unittest.TestCase):

        #basic to 1 dim(without list)
        c = reshape(b, as_tensor_variable(6), ndim=1)
-        f = inplace_func([b], c)
+        f = self.function([b], c)

        b_val1 = numpy.asarray([[0, 1, 2], [3, 4, 5]])
        c_val1 = numpy.asarray([0, 1, 2, 3, 4, 5])
@@ -4231,7 +4252,7 @@ class T_reshape(unittest.TestCase):

        #basic to 1 dim(with list)
        c = reshape(b, (as_tensor_variable(6),), ndim=1)
-        f = inplace_func([b], c)
+        f = self.function([b], c)
        assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]])) ==
                         numpy.asarray([0, 1, 2, 3, 4, 5]))
        #print f.maker.fgraph.toposort()
@@ -4239,14 +4260,14 @@ class T_reshape(unittest.TestCase):

        #basic to shape object of same ndim
        c = reshape(b, d.shape)
-        f = inplace_func([b, d], c)
+        f = self.function([b, d], c)
        assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]]),
                           [[0, 1], [2, 3], [4, 5]]) ==
                         numpy.asarray([[0, 1], [2, 3], [4, 5]]))

        #basic to 2 dims
        c = reshape(a, [2, 3])
-        f = inplace_func([a], c)
+        f = self.function([a], c)
        assert numpy.all(f(numpy.asarray([0, 1, 2, 3, 4, 5])) ==
                         numpy.asarray([[0, 1, 2], [3, 4, 5]]))

@@ -4255,7 +4276,7 @@ class T_reshape(unittest.TestCase):
        a_val_copy = numpy.asarray([0, 1, 2, 3, 4, 5])
        b_val = numpy.asarray([[0, 1, 2], [3, 4, 5]])

-        f_sub = inplace_func([a, b], c - b)
+        f_sub = self.function([a, b], c - b)
        assert numpy.all(f_sub(a_val, b_val) == 0.0)
        assert numpy.all(a_val == a_val_copy)

@@ -4264,35 +4285,33 @@ class T_reshape(unittest.TestCase):
        a_val_copy = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float64')
        b_val = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float64')

-        f_sub = inplace_func([a, b], c - b)
+        f_sub = self.function([a, b], c - b)
        assert numpy.all(f_sub(a_val, b_val) == 0.0)
        assert numpy.all(a_val == a_val_copy)

        # verify gradient
        def just_vals(v):
            return Reshape(2)(v, theano._asarray([2, 3], dtype='int32'))
-        utt.verify_grad(just_vals, [a_val])
+        utt.verify_grad(just_vals, [a_val], mode=self.mode)

        #test infer_shape
-        f_sub = function([a, b], (c - b).shape)
-        if config.mode == "FAST_COMPILE":
-            assert len(f_sub.maker.fgraph.toposort()) == 3
-        else:
-            topo = f_sub.maker.fgraph.toposort()
-            assert len(topo) == 1
-            topo[0].op == theano.compile.function_module.deep_copy_op
-            #assert numpy.all(f_sub(a_val,numpy.asarray([[0,1],[2,3],[4,5]]))==[2,3])#work in FAST_RUN, but fail on other!
-            #assert numpy.all(f_sub(a_val,numpy.asarray([[0,1],[2,3],[4,5],[6,7]]))==[2,3])#work in FAST_RUN, but fail on other!
+        self._compile_and_check([a], [c], (a_val,), self.op)

        # test broadcast flag for constant value of 1
        c = reshape(b, (b.shape[0], b.shape[1], 1))
-        f = inplace_func([b], c)
+        f = self.function([b], c)
        assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]])) ==
                         numpy.asarray([[[0], [1], [2]], [[3], [4], [5]]]))
        assert (f.maker.fgraph.toposort()[-2].outputs[0].type.broadcastable ==
                (False, False, True))

-        assert numpy.all(f_sub(a_val, b_val) == [2, 3])
+    def test_m1(self):
+        t = tensor3()
+        rng = numpy.random.RandomState(seed=utt.fetch_seed())
+        val = rng.uniform(size=(3, 4, 5)).astype(config.floatX)
+        for out in [t.reshape([-1]), t.reshape([-1, 5]),
+                    t.reshape([5, -1]), t.reshape([5, -1, 3])]:
+            self._compile_and_check([t], [out], [val], self.op)

    def test_reshape_long_in_shape(self):
        v = dvector('v')
@@ -4311,14 +4330,14 @@ class T_reshape(unittest.TestCase):
        r = a.reshape(shapes, ndim=1)
        z = zeros_like(r)

-        f = function([a, shapes], z.shape)
+        f = self.function([a, shapes], z.shape)
        self.assertRaises(ValueError, f, a_val, [13])

        #Test reshape to 2 dim
        r = a.reshape(shapes, ndim=2)
        z = zeros_like(r)

-        f = function([a, shapes], z.shape)
+        f = self.function([a, shapes], z.shape)

        self.assertRaises(ValueError, f, a_val, [-1, 5])
        self.assertRaises(ValueError, f, a_val, [7, -1])

--- a/theano/tensor/tests/test_elemwise.py
+++ b/theano/tensor/tests/test_elemwise.py
@@ -11,7 +11,7 @@ from theano.gof.python25 import all, any
 from theano import gof, scalar, config

 from theano import tensor
-from theano.tensor import TensorType
+from theano.tensor import TensorType, as_tensor_variable
 from theano.compile.mode import get_default_mode
 from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle,
                                    Prod, ProdWithoutZeros)
@@ -24,6 +24,7 @@ def FunctionGraph(i, o):


 class test_DimShuffle(unittest_tools.InferShapeTester):
+    op = DimShuffle

    def with_linker(self, linker):
        for xsh, shuffle, zsh in [((2, 3), (1, 'x', 0), (3, 1, 2)),
@@ -38,12 +39,12 @@ class test_DimShuffle(unittest_tools.InferShapeTester):
                                  ((1,), ('x', 'x'), (1, 1))]:
            ib = [(entry == 1) for entry in xsh]
            x = TensorType('float64', ib)('x')
-            e = DimShuffle(ib, shuffle)(x)
+            e = self.op(ib, shuffle)(x)
            f = copy(linker).accept(FunctionGraph([x], [e])).make_function()
            assert f(numpy.ones(xsh)).shape == zsh
            #test that DimShuffle.infer_shape work correctly
            x = TensorType('float64', ib)('x')
-            e = DimShuffle(ib, shuffle)(x)
+            e = self.op(ib, shuffle)(x)
            f = copy(linker).accept(FunctionGraph([x], [e.
                shape])).make_function()
            assert all(f(numpy.ones(xsh))) == all(zsh)
@@ -51,12 +52,12 @@ class test_DimShuffle(unittest_tools.InferShapeTester):
        # Test when we drop a axis that is not broadcastable
        ib = [False, True, False]
        x = TensorType('float64', ib)('x')
-        self.assertRaises(ValueError, DimShuffle, ib, shuffle)
+        self.assertRaises(ValueError, self.op, ib, shuffle)

        # Test when we drop a axis that don't have shape 1
        ib = [True, True, False]
        x = TensorType('float64', ib)('x')
-        e = DimShuffle(ib, (1, 2))(x)
+        e = self.op(ib, (1, 2))(x)
        f = copy(linker).accept(FunctionGraph([x], [e.shape])).make_function()
        self.assertRaises(TypeError, f, numpy.ones((2, 1, 4)))

@@ -89,8 +90,8 @@ class test_DimShuffle(unittest_tools.InferShapeTester):
            adtens = TensorType('float64', ib)('x')
            adtens_val = numpy.ones(xsh)
            self._compile_and_check([adtens],
-                                    [DimShuffle(ib, shuffle)(adtens)],
-                                    [adtens_val], DimShuffle,
+                                    [self.op(ib, shuffle)(adtens)],
+                                    [adtens_val], self.op,
                                    warn=False)

    def test_too_big_rank(self):
@@ -137,10 +138,23 @@ class test_reduce_axes(unittest.TestCase):
            m = x.var(a)

 class test_Broadcast(unittest.TestCase):
+    # this is to allow other types to reuse this class to test their ops
+    type = TensorType
+    op = Elemwise
+
+    ctype = TensorType
+    cop = Elemwise
+
+    def rand_val(self, shp):
+        return numpy.asarray(numpy.random.rand(*shp))
+
+    def rand_cval(self, shp):
+        return numpy.asarray(numpy.random.rand(*shp))
+
    def setUp(self):
        unittest_tools.seed_rng()

-    def with_linker(self, linker):
+    def with_linker(self, linker, op, type, rand_val):
        for xsh, ysh in [((3, 5), (3, 5)),
                         ((3, 5), (1, 5)),
                         ((3, 5), (3, 1)),
@@ -150,12 +164,12 @@ class test_Broadcast(unittest.TestCase):
                         ((2, 3, 4, 5), (1, 3, 1, 5)),
                         ((2, 3, 4, 5), (1, 1, 1, 1)),
                         ((), ())]:
-            x = TensorType('float64', [(entry == 1) for entry in xsh])('x')
-            y = TensorType('float64', [(entry == 1) for entry in ysh])('y')
-            e = Elemwise(scalar.add)(x, y)
+            x = type('float64', [(entry == 1) for entry in xsh])('x')
+            y = type('float64', [(entry == 1) for entry in ysh])('y')
+            e = op(scalar.add)(x, y)
            f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function()
-            xv = numpy.asarray(numpy.random.rand(*xsh))
-            yv = numpy.asarray(numpy.random.rand(*ysh))
+            xv = rand_val(xsh)
+            yv = rand_val(ysh)
            zv = xv + yv

            self.assertTrue((f(xv, yv) == zv).all())
@@ -163,14 +177,14 @@ class test_Broadcast(unittest.TestCase):
            #test Elemwise.infer_shape
            #the Shape op don't implement c_code!
            if isinstance(linker, gof.PerformLinker):
-                x = TensorType('float64', [(entry == 1) for entry in xsh])('x')
-                y = TensorType('float64', [(entry == 1) for entry in ysh])('y')
-                e = Elemwise(scalar.add)(x, y)
+                x = type('float64', [(entry == 1) for entry in xsh])('x')
+                y = type('float64', [(entry == 1) for entry in ysh])('y')
+                e = op(scalar.add)(x, y)
                f = copy(linker).accept(FunctionGraph([x,
                     y], [e.shape])).make_function()
                assert tuple(f(xv, yv)) == tuple(zv.shape)

-    def with_linker_inplace(self, linker):
+    def with_linker_inplace(self, linker, op, type, rand_val):
        for xsh, ysh in [((5, 5), (5, 5)),
                         ((5, 5), (1, 5)),
                         ((5, 5), (5, 1)),
@@ -179,12 +193,12 @@ class test_Broadcast(unittest.TestCase):
                         ((2, 3, 4, 5), (1, 3, 1, 5)),
                         ((2, 3, 4, 5), (1, 1, 1, 1)),
                         ((), ())]:
-            x = TensorType('float64', [(entry == 1) for entry in xsh])('x')
-            y = TensorType('float64', [(entry == 1) for entry in ysh])('y')
-            e = Elemwise(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
+            x = type('float64', [(entry == 1) for entry in xsh])('x')
+            y = type('float64', [(entry == 1) for entry in ysh])('y')
+            e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
            f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function()
-            xv = numpy.asarray(numpy.random.rand(*xsh))
-            yv = numpy.asarray(numpy.random.rand(*ysh))
+            xv = rand_val(xsh)
+            yv = rand_val(ysh)
            zv = xv + yv

            f(xv, yv)
@@ -193,13 +207,13 @@ class test_Broadcast(unittest.TestCase):
            #test Elemwise.infer_shape
            #the Shape op don't implement c_code!
            if isinstance(linker, gof.PerformLinker):
-                x = TensorType('float64', [(entry == 1) for entry in xsh])('x')
-                y = TensorType('float64', [(entry == 1) for entry in ysh])('y')
-                e = Elemwise(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
+                x = type('float64', [(entry == 1) for entry in xsh])('x')
+                y = type('float64', [(entry == 1) for entry in ysh])('y')
+                e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
                f = copy(linker).accept(FunctionGraph([x,
                     y], [e.shape])).make_function()
-                xv = numpy.asarray(numpy.random.rand(*xsh))
-                yv = numpy.asarray(numpy.random.rand(*ysh))
+                xv = rand_val(xsh)
+                yv = rand_val(ysh)
                zv = xv + yv

                f(xv, yv)
@@ -207,30 +221,33 @@ class test_Broadcast(unittest.TestCase):
                assert xv.shape == zv.shape

    def test_perform(self):
-        self.with_linker(gof.PerformLinker())
+        self.with_linker(gof.PerformLinker(), self.op, self.type,
+                         self.rand_val)

    def test_c(self):
        if not theano.config.cxx:
            raise SkipTest("G++ not available, so we need to skip this test.")
-        self.with_linker(gof.CLinker())
+        self.with_linker(gof.CLinker(), self.cop, self.ctype, self.rand_cval)

    def test_perform_inplace(self):
-        self.with_linker_inplace(gof.PerformLinker())
+        self.with_linker_inplace(gof.PerformLinker(), self.op, self.type,
+                                 self.rand_val)

    def test_c_inplace(self):
        if not theano.config.cxx:
            raise SkipTest("G++ not available, so we need to skip this test.")
-        self.with_linker_inplace(gof.CLinker())
+        self.with_linker_inplace(gof.CLinker(), self.cop, self.ctype,
+                                 self.rand_cval)

    def test_fill(self):
        if not theano.config.cxx:
            raise SkipTest("G++ not available, so we need to skip this test.")
-        x = TensorType('float64', [0, 0])('x')
-        y = TensorType('float64', [1, 1])('y')
-        e = Elemwise(scalar.Second(scalar.transfer_type(0)), {0: 0})(x, y)
+        x = self.ctype('float64', [0, 0])('x')
+        y = self.ctype('float64', [1, 1])('y')
+        e = self.cop(scalar.Second(scalar.transfer_type(0)), {0: 0})(x, y)
        f = gof.CLinker().accept(FunctionGraph([x, y], [e])).make_function()
-        xv = numpy.ones((5, 5))
-        yv = numpy.random.rand(1, 1)
+        xv = self.rand_cval((5, 5))
+        yv = self.rand_cval((1, 1))
        f(xv, yv)
        assert (xv == yv).all()

@@ -245,27 +262,28 @@ class test_Broadcast(unittest.TestCase):
    def test_weird_strides(self):
        if not theano.config.cxx:
            raise SkipTest("G++ not available, so we need to skip this test.")
-        x = TensorType('float64', [0, 0, 0, 0, 0])('x')
-        y = TensorType('float64', [0, 0, 0, 0, 0])('y')
-        e = Elemwise(scalar.add)(x, y)
+        x = self.ctype('float64', [0, 0, 0, 0, 0])('x')
+        y = self.ctype('float64', [0, 0, 0, 0, 0])('y')
+        e = self.cop(scalar.add)(x, y)
        f = gof.CLinker().accept(FunctionGraph([x, y], [e])).make_function()
-        xv = numpy.random.rand(2, 2, 2, 2, 2)
-        yv = numpy.random.rand(2, 2, 2, 2, 2).transpose(4, 0, 3, 1, 2)
+        xv = self.rand_cval((2, 2, 2, 2, 2))
+        yv = self.rand_cval((2, 2, 2, 2, 2)).transpose(4, 0, 3, 1, 2)
        zv = xv + yv
        assert (f(xv, yv) == zv).all()

    def test_same_inputs(self):
        if not theano.config.cxx:
            raise SkipTest("G++ not available, so we need to skip this test.")
-        x = TensorType('float64', [0, 0])('x')
-        e = Elemwise(scalar.add)(x, x)
+        x = self.ctype('float64', [0, 0])('x')
+        e = self.cop(scalar.add)(x, x)
        f = gof.CLinker().accept(FunctionGraph([x], [e])).make_function()
-        xv = numpy.random.rand(2, 2)
+        xv = self.rand_cval((2, 2))
        zv = xv + xv
        assert (f(xv) == zv).all()


 class test_CAReduce(unittest_tools.InferShapeTester):
+    op = CAReduce

    def with_linker(self, linker, scalar_op=scalar.add, dtype="floatX",
                    test_nan=False, tensor_op=None):
@@ -288,9 +306,9 @@ class test_CAReduce(unittest_tools.InferShapeTester):
                dtype = theano.config.floatX
            x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
            if tensor_op is None:
-                e = CAReduce(scalar_op, axis=tosum)(x)
+                e = as_tensor_variable(self.op(scalar_op, axis=tosum)(x))
            else:
-                e = tensor_op(x, axis=tosum)
+                e = as_tensor_variable(tensor_op(x, axis=tosum))

            if tosum is None:
                tosum = range(len(xsh))
@@ -395,7 +413,7 @@ class test_CAReduce(unittest_tools.InferShapeTester):
            if isinstance(linker, gof.PerformLinker):
                x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
                if tensor_op is None:
-                    e = CAReduce(scalar_op, axis=tosum)(x)
+                    e = self.op(scalar_op, axis=tosum)(x)
                else:
                    e = tensor_op(x, axis=tosum)
                if tosum is None:
@@ -491,8 +509,8 @@ class test_CAReduce(unittest_tools.InferShapeTester):
                tosum = range(len(xsh))
            xv = numpy.asarray(numpy.random.rand(*xsh), dtype=dtype)
            self._compile_and_check([x],
-                            [CAReduce(scalar.add, axis=tosum)(x)],
-                            [xv], CAReduce, ["local_cut_useless_reduce"])
+                            [self.op(scalar.add, axis=tosum)(x)],
+                            [xv], self.op, ["local_cut_useless_reduce"])


 class test_Prod(unittest.TestCase):

--- a/theano/tensor/tests/test_subtensor.py
+++ b/theano/tensor/tests/test_subtensor.py
@@ -122,10 +122,9 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
        try:
            try:
                self.eval_output_and_check(t)
-                assert 0
-            except Exception, e:
-                if 'out of bounds' not in exc_message(e):
-                    raise
+            except IndexError, e:
+                return
+            self.fail()
        finally:
            _logger.setLevel(oldlevel)

@@ -161,7 +160,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):

    def test1_0_dims(self):
        n = self.shared(numpy.ones((), dtype=self.dtype))
-        t = theano.tensor.Subtensor([])(n)
+        t = self.sub([])(n)
        self.assertTrue(isinstance(t.owner.op, Subtensor))
        mode = self.mode
        self.mode = mode.excluding("local_useless_subtensor")
@@ -188,7 +187,6 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
        self.assertTrue(tval == 5.0)

    def test1_ok_range_infinite(self):
-        #Subtensor.debug = True
        n = self.shared(numpy.arange(3, dtype=self.dtype))
        t = n[1:]
        self.assertTrue(isinstance(t.owner.op, Subtensor))

--- a/theano/tensor/var.py
+++ b/theano/tensor/var.py
@@ -543,8 +543,8 @@ class _tensor_py_operators:
    def get_scalar_constant_value(self):
        return theano.tensor.basic.get_scalar_constant_value(self)

-    def zeros_like(self, dtype=None):
-        return theano.tensor.basic.zeros_like(self, dtype=dtype)
+    def zeros_like(model, dtype=None):
+        return theano.tensor.basic.zeros_like(model, dtype=dtype)


 class TensorVariable(_tensor_py_operators, Variable):

--- a/theano/tests/unittest_tools.py
+++ b/theano/tests/unittest_tools.py
@@ -182,7 +182,10 @@ class InferShapeTester(unittest.TestCase):
    def setUp(self):
        seed_rng()
        # Take into account any mode that may be defined in a child class
-        mode = getattr(self, 'mode', theano.compile.get_default_mode())
+        # and it can be None
+        mode = getattr(self, 'mode', None)
+        if mode is None:
+            mode = theano.compile.get_default_mode()
        # This mode seems to be the minimal one including the shape_i
        # optimizations, if we don't want to enumerate them explicitly.
        self.mode = mode.including("canonicalize")