Merge pull request #1582 from abergeron/compyte2-rb

Compyte2 rebase

Merge pull request #1582 from abergeron/compyte2-rb
8bd900f8 · Frédéric Bastien · 382d2ed1 · 217b616b · 8bd900f8 · 8bd900f8
--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -932,7 +932,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
    adv_incsub1 = cuda.GpuAdvancedIncSubtensor1
    mode = mode_with_gpu
    dtype = 'float32'
-    ignore_topo = (B.HostFromGpu, B.GpuFromHost)
+    ignore_topo = (B.HostFromGpu, B.GpuFromHost, theano.compile.DeepCopyOp)
    fast_compile = False
    ops = (cuda.GpuSubtensor, cuda.GpuIncSubtensor,
           cuda.GpuAdvancedSubtensor1, cuda.GpuAdvancedIncSubtensor1)

--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
 import logging
 import theano
-from theano.configparser import config
+from theano.configparser import config, AddConfigVar, BoolParam
 from theano.compile import optdb
 _logger_name = 'theano.sandbox.gpuarray'
@@ -18,6 +18,13 @@ try:
 except ImportError:
    pygpu = None
+AddConfigVar('gpuarray.sync',
+             """If True, every op will make sure its work is done before
+                returning.  Setting this to True will slow down execution,
+                but give much more accurate results in profiling.""",
+             BoolParam(False),
+             in_c_key=True)
 # This is for documentation not to depend on the availability of pygpu
 from type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                  GpuArraySharedVariable, gpuarray_shared_constructor)

--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
--- a/theano/sandbox/gpuarray/blas.py
+++ b/theano/sandbox/gpuarray/blas.py
+from theano import Op, Apply, config
+from theano.tensor.blas import Gemv, Gemm
+from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable)
+try:
+    import pygpu
+    from pygpu import blas
+except ImportError, e:
+    # To make sure theano is importable
+    pass
+class BlasOp(HideC):
+    """Mixin supplying the C build settings shared by the BLAS ops below.
+
+    It names the header to include, the include directory to search and the
+    C statement to run at module init.
+    """
+    def c_headers(self):
+        return ['<blas_api.h>']
+    def c_header_dirs(self):
+        # NOTE(review): `pygpu` is unbound if the guarded import at the top
+        # of this module failed -- presumably this is only reached when
+        # pygpu is installed; confirm.
+        return [pygpu.get_include()]
+    def c_init_code(self):
+        return ['import_pygpu__blas();']
+class GpuGemv(BlasOp, Gemv):
+    """Gemv op working on gpuarray variables.
+
+    ``perform`` delegates to ``pygpu.blas.gemv`` and ``c_code`` emits a call
+    to ``pygpu_blas_rgemv``.  ``self.inplace`` selects whether the result is
+    written into ``y`` itself or into a fresh copy of it.
+    """
+    def make_node(self, y, alpha, A, x, beta):
+        # The node built by the host op is discarded; the call is kept so
+        # that whatever argument checking it performs still runs
+        # (presumably type validation -- confirm against Gemv.make_node).
+        Gemv.make_node(self, y, alpha, A, x, beta)
+        A = as_gpuarray_variable(A)
+        x = as_gpuarray_variable(x)
+        y = as_gpuarray_variable(y)
+        return Apply(self, [y, alpha, A, x, beta], [y.type()])
+    def perform(self, node, inputs, out_storage):
+        y, alpha, A, x, beta = inputs
+        out_storage[0][0] = blas.gemv(alpha, A, x, beta, y, trans=False,
+                                      overwrite_y=self.inplace)
+    def c_code(self, node, name, inp, out, sub):
+        """Return the C source computing the output from the five inputs."""
+        fmt = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3],
+                   beta=inp[4], fail=sub['fail'], name=name)
+        if self.inplace:
+            # Alias the output to y (taking a new reference).
+            code = """
+                   Py_XDECREF(%(out)s);
+                   %(out)s = %(y)s;
+                   Py_INCREF(%(out)s);
+                   """ % fmt
+        else:
+            # Work on a copy so that y is left untouched.
+            code = """
+                   Py_XDECREF(%(out)s);
+                   %(out)s = pygpu_copy(%(y)s, GA_ANY_ORDER);
+                   if (%(out)s == NULL) {
+                       %(fail)s
+                   }
+                   """ % fmt
+        code += """
+        if (pygpu_blas_rgemv(cb_no_trans,
+                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
+                             %(A)s, %(x)s,
+                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
+                             %(out)s) == NULL) {
+            %(fail)s
+        }
+        """ % fmt
+        if config.gpuarray.sync:
+            # The template must be interpolated like the others, otherwise
+            # the literal "%(out)s" ends up in the generated C source.
+            code += """
+            GpuArray_sync(&%(out)s->ga);
+            """ % fmt
+        return code
+    def c_code_cache_version(self):
+        return (0,)
+gpugemv_no_inplace = GpuGemv(inplace=False)
+gpugemv_inplace = GpuGemv(inplace=True)
+class GpuGemm(BlasOp, Gemm):
+    """Gemm op working on gpuarray variables.
+
+    ``perform`` delegates to ``pygpu.blas.gemm`` and ``c_code`` emits a call
+    to ``pygpu_blas_rgemm``.  ``self.inplace`` selects whether the result is
+    written into ``C`` itself or into a fresh copy of it.
+    """
+    def make_node(self, C, alpha, A, B, beta):
+        # The node built by the host op is discarded; the call is kept so
+        # that whatever argument checking it performs still runs
+        # (presumably type validation -- confirm against Gemm.make_node).
+        Gemm.make_node(self, C, alpha, A, B, beta)
+        A = as_gpuarray_variable(A)
+        B = as_gpuarray_variable(B)
+        C = as_gpuarray_variable(C)
+        return Apply(self, [C, alpha, A, B, beta], [C.type()])
+    def perform(self, node, inputs, outputs):
+        C, alpha, A, B, beta = inputs
+        outputs[0][0] = blas.gemm(alpha, A, B, beta, C,
+                                  overwrite_c=self.inplace)
+    def c_code(self, node, name, inp, out, sub):
+        """Return the C source computing the output from the five inputs."""
+        fmt = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
+                   beta=inp[4], fail=sub['fail'], name=name)
+        if self.inplace:
+            # Alias the output to C (taking a new reference).
+            code = """
+                   Py_XDECREF(%(out)s);
+                   %(out)s = %(C)s;
+                   Py_INCREF(%(out)s);
+                   """ % fmt
+        else:
+            # Work on a copy so that C is left untouched.
+            code = """
+                   Py_XDECREF(%(out)s);
+                   %(out)s = pygpu_copy(%(C)s, GA_ANY_ORDER);
+                   if (%(out)s == NULL) {
+                       %(fail)s
+                   }
+                   """ % fmt
+        code += """
+        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
+                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
+                             %(A)s, %(B)s,
+                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
+                             %(out)s) == NULL) {
+            %(fail)s
+        }
+        """ % fmt
+        if config.gpuarray.sync:
+            # The template must be interpolated like the others, otherwise
+            # the literal "%(out)s" ends up in the generated C source.
+            code += """
+            GpuArray_sync(&%(out)s->ga);
+            """ % fmt
+        return code
+    def c_code_cache_version(self):
+        return (0,)
+gpugemm_no_inplace = GpuGemm(inplace=False)
+gpugemm_inplace = GpuGemm(inplace=True)
+from theano.compile import optdb
+from theano.gof import local_optimizer, LocalOptGroup
+from theano.tensor.opt import in2out
+@local_optimizer([gpugemv_no_inplace])
+def local_inplace_gpuagemv(node):
+    if node.op == gpugemv_no_inplace:
+        return [gpugemv_inplace(*node.inputs)]
+@local_optimizer([gpugemm_no_inplace])
+def local_inplace_gpuagemm(node):
+    if node.op == gpugemm_no_inplace:
+        return [gpugemm_inplace(*node.inputs)]
+gpuablas_opt_inplace = in2out(LocalOptGroup(
+        local_inplace_gpuagemv, local_inplace_gpuagemm),
+                              name='gpuablas_opt_inplace')
+optdb.register('InplaceGpuaBlasOpt',
+               gpuablas_opt_inplace,
+               70.0, 'fast_run', 'inplace', 'gpuarray')
--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
-import theano, numpy
+import copy
-from theano import tensor
+import theano
+import numpy
+from theano import tensor, scalar
 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
                        Optimizer, toolbox, DestroyHandler,
@@ -8,8 +10,12 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
 from theano.gof.python25 import all, any
 from theano.sandbox.gpuarray.type import GpuArrayType
-from basic_ops import host_from_gpu, gpu_from_host, gpu_alloc
+from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
-from elemwise import GpuElemwise, _is_scalar
+                                               gpu_alloc, GpuReshape)
+from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
+                                              GpuDimShuffle, GpuCAReduce)
+from theano.sandbox.gpuarray.subtensor import GpuSubtensor
+from theano.sandbox.gpuarray.blas import GpuGemv, GpuGemm
 gpu_optimizer = EquilibriumDB()
 gpu_cut_copies = EquilibriumDB()
@@ -26,6 +32,7 @@ optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')
 def register_opt(*tags, **kwargs):
    def f(local_opt):
        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
@@ -35,6 +42,36 @@ def register_opt(*tags, **kwargs):
 register_opt()(theano.tensor.opt.local_track_shape_i)
+def op_lifter(OP):
+    """
+    Decorator factory building a local optimizer that moves OP to the GPU.
+
+    The rewrites performed are:
+
+    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
+    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)
+
+    The decorated function (``maker``) receives the matched node and
+    returns either an Op instance, which is then applied to the node's
+    inputs, or an already built variable.  A false value (or the node's
+    own op) means "no replacement".
+    """
+    def f(maker):
+        def local_opt(node):
+            # Exact type match: nodes whose op is a subclass of OP are
+            # not handled here.
+            if type(node.op) is OP:
+                # This does not support nodes that have more than one output.
+                assert len(node.outputs) == 1
+                # either one of our inputs is on the gpu or
+                # all of our client are on the gpu
+                if (any([i.owner and i.owner.op == host_from_gpu
+                         for i in node.inputs]) or
+                    all([c != 'output' and c.op == gpu_from_host
+                         for c, idx in node.outputs[0].clients])):
+                    new_op = maker(node)
+                    # This is needed as sometimes new_op inherit from OP.
+                    if new_op and new_op != node.op:
+                        if isinstance(new_op, theano.Op):
+                            return [host_from_gpu(new_op(*node.inputs))]
+                        else:  # suppose it is a variable on the GPU
+                            return [host_from_gpu(new_op)]
+            return False
+        # Give the optimizer the name of the function it was built from.
+        local_opt.__name__ = maker.__name__
+        return local_optimizer([OP])(local_opt)
+    return f
 class InputToGpuOptimizer(Optimizer):
    "Transfer the input to the gpu to start the rolling wave."
@@ -63,6 +100,7 @@ class InputToGpuOptimizer(Optimizer):
 gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
                    0, 'fast_run', 'fast_compile', 'merge')
 @local_optimizer([])
 def local_cut_gpu_host_gpu(node):
    if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu):
@@ -78,67 +116,117 @@ gpu_cut_copies.register('cut_gpua_constant_transfers',
 optdb['canonicalize'].register('local_cut_gpua_host_gpua',
                               local_cut_gpu_host_gpu, 'fast_run', 'gpuarray')
 @register_opt()
-@local_optimizer([tensor.Alloc])
+@op_lifter(tensor.Alloc)
 def local_gpualloc(node):
-    replace = False
+    return gpu_alloc
-    if node.op == tensor.alloc:
-        if node.inputs[0].owner and node.inputs[0].owner.op == host_from_gpu:
-            replace = True
-        elif all([c != 'output' and c.op == gpu_from_host
-                  for c, idx in node.outputs[0].clients]):
-            replace = True
-        elif all([c != 'output' and c.op == tensor.join and
-                  all([i.owner and i.owner.op in [host_from_gpu, tensor.alloc]
-                       for i in c.inputs[1:]])
-                  for c, idx in node.outputs[0].clients]):
-            replace = True
-    if replace:
-        val = node.inputs[0]
-        shp = node.inputs[1:]
-        old_out = node.outputs[0]
-        val2 = tensor.shape_padleft(val, len(shp) - val.ndim)
-        new_out = host_from_gpu(gpu_alloc(val, *shp))
-        if new_out.type != old_out.type:
-            assert new_out.type.ndim == old_out.type.ndim
-            assert new_out.type.dtype == old_out.type.dtype
-            for b_old, b_new in zip(old_out.type.broadcastable,
-                                    new_out.type.broadcastable):
-                assert b_new or (not b_old)
-            new_out = tensor.patternbroadcast(new_out. old_out.broadcastable)
-        return [new_out]
 @register_opt()
-@local_optimizer([])
+@op_lifter(tensor.Reshape)
+def local_gpureshape(node):
+    """Return the GpuReshape op replacing the Reshape op of ``node``.
+
+    The new op keeps the number of output dimensions and, when the host op
+    is named, gets that name prefixed with 'Gpu'.
+    """
+    op = node.op
+    name = op.name
+    if name:
+        name = 'Gpu' + name
+    # Pass the prefixed name: the original passed op.name, which left the
+    # prefix computed above unused.
+    return GpuReshape(op.ndim, name)
+@register_opt()
+@op_lifter(tensor.Flatten)
+def local_gpuflatten(node):
+    op = node.op
+    if op.outdim != 1:
+        return None
+    res = GpuReshape(op.outdim, None)
+    o = res(node.inputs[0], theano.tensor.constant([-1]))
+    return o
+@register_opt()
+@op_lifter(tensor.Elemwise)
 def local_gpu_elemwise(node):
-    do_replace = False
+    op = node.op
-    gpu_out = False
+    name = op.name
-    # check for gpu_from_host(Elemwise)) and extract the Elemwise node
+    if name:
-    if node.op == gpu_from_host:
+        name = 'Gpu'+name
-        host_i, = node.inputs
+    res = GpuElemwise(op.scalar_op, name=name,
-        if (host_i.owner and
+                      inplace_pattern=copy.copy(op.inplace_pattern),
-            isinstance(host_i.owner.op, tensor.Elemwise) and
+                      nfunc_spec=op.nfunc_spec)
-            len(host_i.clients) == 1):
+    return res
-            node = host_i.owner
-            do_replace = True
-            gpu_out = True
+def max_inputs_to_GpuElemwise(node):
-    # check for elemwise(..., host_from_gpu, ...)
+    ptr_size = 8
-    if isinstance(node.op, tensor.Elemwise):
+    int_size = 4
-        if numpy.any([i.owner and
-                      i.owner.op == host_from_gpu
+    # we take the limit from CUDA for now
-                      for i in node.inputs]):
+    argument_limit = 232
-                do_replace = True
+    ndim = node.inputs[0].type.ndim
-    if numpy.all([_is_scalar(i)
+    # number of elements and shape
-                  for i in node.inputs]):
+    size_param_mandatory = (int_size * (ndim + 1)) + \
-            do_replace = False
+        (ptr_size + int_size * ndim) * len(node.outputs)
-    if do_replace:
+    nb_bytes_avail = argument_limit - size_param_mandatory
-        new_op = GpuElemwise(node.op.scalar_op)
+    nb_bytes_per_input = ptr_size + ndim * int_size
-        gpu_elemwise = new_op(*(gpu_from_host(i) for i in node.inputs))
+    max_nb_inputs = nb_bytes_avail // nb_bytes_per_input
-        if gpu_out:
-            return [gpu_elemwise]
+    return max_nb_inputs
-        else:
-            return [host_from_gpu(gpu_elemwise)]
+gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
-    else:
+    GpuElemwise,
-        return False
+    max_inputs_to_GpuElemwise)
+optdb.register('gpua_elemwise_fusion',
+               tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00,
+               'fast_run', 'fusion', 'local_elemwise_fusion', 'gpuarray')
+inplace_gpu_elemwise_opt = tensor.opt.inplace_elemwise_optimizer_op(
+    GpuElemwise)
+optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
+               'inplace_elemwise_optimizer', 'fast_run', 'inplace', 'gpuarray')
+@register_opt()
+@op_lifter(tensor.DimShuffle)
+def local_gpua_dimshuffle(node):
+    return GpuDimShuffle(node.op.input_broadcastable,
+                         node.op.new_order)
+@register_opt()
+@op_lifter(tensor.SpecifyShape)
+def local_gpua_specifyShape(node):
+    return tensor.specify_shape
+@register_opt()
+@op_lifter(tensor.Subtensor)
+def local_gpua_subtensor(node):
+    return GpuSubtensor(node.op.idx_list)
+@register_opt()
+@op_lifter(tensor.CAReduce)
+def local_gpua_careduce(node):
+    if (isinstance(node.op.scalar_op, scalar.basic.Add) or
+        isinstance(node.op.scalar_op, scalar.basic.Mul)):
+        return GpuCAReduce(node.op.scalar_op, axis=node.op.axis,
+                           dtype=getattr(node.op, 'dtype', None),
+                           acc_dtype=getattr(node.op, 'acc_dtype', None))
+@register_opt()
+@op_lifter(tensor.blas.Gemv)
+def local_gpua_gemv(node):
+    return GpuGemv(inplace=node.op.inplace)
+@register_opt()
+@op_lifter(tensor.blas_c.CGemv)
+def local_gpua_gemv2(node):
+    return GpuGemv(inplace=node.op.inplace)
+@register_opt()
+@op_lifter(tensor.blas.Gemm)
+def local_gpua_gemm(node):
+    return GpuGemm(inplace=node.op.inplace)
--- a/theano/sandbox/gpuarray/subtensor.py
+++ b/theano/sandbox/gpuarray/subtensor.py
+import StringIO
+import numpy
+import theano
+from theano import tensor, gof
+from theano.tensor.subtensor import Subtensor, get_idx_list
+from theano.gof.python25 import all, any
+try:
+    import pygpu
+    from pygpu import gpuarray
+except ImportError:
+    pass
+from theano.sandbox.gpuarray.type import GpuArrayType
+from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
+class GpuSubtensor(HideC, Subtensor):
+    def make_node(self, x, *inputs):
+        rval = tensor.Subtensor.make_node(self, x, *inputs)
+        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
+                             broadcastable=rval.outputs[0].type.broadcastable)
+        x = as_gpuarray_variable(x)
+        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])
+    def perform(self, node, inputs, out_):
+        out, = out_
+        x = inputs[0]
+        if self.perform_cache_cdata is not None:
+            out[0] = x.__getitem__(self.perform_cache_cdata)
+            return
+        cdata = get_idx_list(inputs, self.idx_list)
+        if len(cdata) == 1:
+            cdata = cdata[0]
+        if len(inputs) == 1:
+            self.perform_cache_cdata = cdata
+        out[0] = x.__getitem__(cdata)
+    def c_support_code(self):
+        """Return the C helper ``fix_indices`` used by the code from c_code.
+
+        ``fix_indices`` normalises one (start, stop, step) triple against a
+        dimension of length ``len``.  The ``*_n`` flags mark components
+        that were omitted in the slice.  It returns -1 with a Python
+        exception set when the step is zero, 0 otherwise.
+        """
+        # NOTE(review): the helper text changed, so c_code_cache_version
+        # should be bumped when this lands so cached modules are rebuilt.
+        return """
+        static int fix_indices(ssize_t *start, ssize_t *stop, ssize_t *step,
+                               int start_n, int stop_n, int step_n,
+                               size_t len) {
+            if (step_n) *step = 1;
+            if (*step == 0) {
+                PyErr_SetString(PyExc_ValueError, "slice step cannot be zero");
+                return -1;
+            }
+            if (start_n) *start = (*step < 0) ? len-1 : 0;
+            else {
+                if (*start < 0) *start += len;
+                if (*start < 0) *start = (*step < 0) ? -1 : 0;
+                /* Only compare non-negative values with the unsigned len:
+                   a -1 sentinel would otherwise convert to SIZE_MAX and
+                   be clamped to len-1. */
+                if (*start >= 0 && (size_t)*start >= len)
+                    *start = (*step < 0) ? len-1 : len;
+            }
+            if (stop_n) *stop = (*step < 0) ? -1 : len;
+            else {
+                if (*stop < 0) *stop += len;
+                if (*stop < 0) *stop = (*step < 0) ? -1 : 0;
+                /* Same signed/unsigned guard as for start. */
+                if (*stop >= 0 && (size_t)*stop >= len)
+                    *stop = (*step < 0) ? len-1 : len;
+            }
+            if (*stop < *start && *step > 0)
+                *stop = *start;
+            return 0;
+        }
+        """
+    def c_code(self, node, name, inputs, outputs, sub):
+        inp_ndim = node.inputs[0].ndim
+        inp = inputs[0]
+        indices = inputs[1:]
+        # pad out the index list to the same dimension as the input
+        idx_list = self.idx_list + \
+            ((slice(None),) * (inp_ndim - len(self.idx_list)))
+        # This case fails when we use pygpu_index(), so here is some
+        # special code
+        if len(idx_list) == 0:
+            return """
+        Py_XDECREF(%(out)s);
+        %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
+        if (!%(out)s) { %(fail)s }
+""" % dict(out=outputs[0], inp=inp, fail=sub['fail'])
+        sio = StringIO.StringIO()
+        print >> sio, """
+        ssize_t starts[%(sz)s];
+        ssize_t stops[%(sz)s];
+        ssize_t steps[%(sz)s];
+        ssize_t cur;
+        int err;
+        if (%(inp)s->ga.nd != %(sz)s) {
+            PyErr_SetString(PyExc_IndexError, "invalid index");
+            %(fail)s
+        }
+        """ % dict(sz=len(idx_list), inp=inp, fail=sub['fail'])
+        def fix_idx(idx):
+            if idx is None:
+                return "0", 1
+            elif isinstance(idx, (numpy.integer, int)):
+                return str(idx), 0
+            elif isinstance(idx, gof.Type):
+                return indices.pop(0), 0
+            else:
+                assert 0, idx
+        for i, idx in enumerate(idx_list):
+            if isinstance(idx, slice):
+                start, start_n = fix_idx(idx.start)
+                stop, stop_n = fix_idx(idx.stop)
+                step, step_n = fix_idx(idx.step)
+                print >>sio, """
+                starts[%(i)s] = %(start)s;
+                stops[%(i)s] = %(stop)s;
+                steps[%(i)s] = %(step)s;
+                if (fix_indices(&starts[%(i)s], &stops[%(i)s], &steps[%(i)s],
+                                %(start_n)s, %(stop_n)s, %(step_n)s,
+                                %(inp)s->ga.dimensions[%(i)s]) == -1) {
+                    %(fail)s
+                }
+                """ % dict(i=i, start=start, stop=stop, step=step,
+                           start_n=start_n, stop_n=stop_n, step_n=step_n,
+                           fail=sub['fail'], inp=inp)
+            else:
+                if isinstance(idx, gof.Type):
+                    start = indices.pop(0)
+                elif isinstance(idx, (numpy.integer, int)):
+                    start = idx
+                else:
+                    assert 0, idx
+                print >>sio, """
+                cur = %(start)s;
+                if (cur < 0)
+                    cur += %(inp)s->ga.dimensions[%(i)s];
+                starts[%(i)s] = cur;
+                steps[%(i)s] = 0;
+                """ % dict(i=i, start=start, fail=sub['fail'], inp=inp)
+        print >>sio, """
+        Py_XDECREF(%(out)s);
+        %(out)s = pygpu_index(%(inp)s, starts, stops, steps);
+        if (!%(out)s) { %(fail)s }
+""" % dict(name=name, fail=sub['fail'], inp=inp, out=outputs[0])
+        return sio.getvalue()
+    def c_code_cache_version(self):
+        return (5,)
--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
--- a/theano/sandbox/gpuarray/tests/test_blas.py
+++ b/theano/sandbox/gpuarray/tests/test_blas.py
+from unittest import TestCase
+from theano.tensor.blas import gemv_inplace, gemm_inplace
+from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand
+from theano.sandbox.gpuarray.blas import (gpugemv_inplace,
+                                          gpugemm_inplace)
+GpuGemvTester = makeTester('GpuGemvTester',
+                           op=gemv_inplace, gpu_op=gpugemv_inplace,
+                           cases=dict(
+        dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
+        dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
+#        test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
+#        test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
+#        test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
+        test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
+        )
+)
+GpuGemmTester = makeTester('GpuGemmTester',
+                           op=gemm_inplace, gpu_op=gpugemm_inplace,
+                           cases=dict(
+        test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
+        test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
+        test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
+        test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
+        test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
+        test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
+        test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
+        test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.0],
+        test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.0],
+        )
+)
--- a/theano/sandbox/gpuarray/tests/test_elemwise.py
+++ b/theano/sandbox/gpuarray/tests/test_elemwise.py
+import unittest
+from theano import scalar, gof
+from theano.gof import FunctionGraph
+from theano.gof.python25 import all, any
+from theano.tests.unittest_tools import SkipTest
+from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
+                                               test_CAReduce)
+from theano.sandbox.gpuarray.tests.test_basic_ops import rand_gpuarray
+from theano.sandbox.gpuarray.elemwise import (GpuElemwise, GpuDimShuffle,
+                                              GpuCAReduce)
+from theano.sandbox.gpuarray.type import GpuArrayType
+from pygpu.array import gpuarray
+# This is actually a test for GpuElemwise
+class test_gpu_Broadcast(test_Broadcast):
+    op = GpuElemwise
+    type = GpuArrayType
+    def rand_val(self, shp):
+        return rand_gpuarray(*shp, **dict(cls=gpuarray))
+    # no c_code() yet
+    #cop = GpuElemwise
+    #ctype = GpuArrayType
+    #def rand_cval(self, shp):
+    #    return rand_gpuarray(*shp, **dict(cls=gpuarray))
+class test_GpuDimShuffle(test_DimShuffle):
+    op = GpuDimShuffle
+class test_GpuCAReduce(test_CAReduce):
+    dtypes = ["float32"]
+    bin_dtypes = ["uint8", "int8"]
+    op = GpuCAReduce
+    reds = [scalar.add, scalar.mul]
+    def test_perform(self):
+        for dtype in self.dtypes + self.bin_dtypes:
+            for op in self.reds:
+                self.with_linker(gof.PerformLinker(), op, dtype=dtype)
+    def test_perform_nan(self):
+        for dtype in self.dtypes:
+            for op in self.reds:
+                self.with_linker(gof.PerformLinker(), op, dtype=dtype,
+                                 test_nan=True)
+    def test_c(self):
+        raise SkipTest("no C code")
+    def test_c_nan(self):
+        raise SkipTest("no C code")
--- a/theano/sandbox/gpuarray/tests/test_opt.py
+++ b/theano/sandbox/gpuarray/tests/test_opt.py
+import numpy
+import theano
+from theano.tests import unittest_tools as utt
+from theano.sandbox.gpuarray.basic_ops import GpuReshape
+import theano.sandbox.gpuarray
+# Skip the whole module when pygpu is missing.  SkipTest is not imported
+# by name in this file, so it is reached through the unittest_tools
+# module already imported as `utt`.
+if theano.sandbox.gpuarray.pygpu is None:
+    raise utt.SkipTest("pygpu not installed")
+import theano.sandbox.cuda as cuda_ndarray
+# Try to bring up a device through the cuda backend if none is active yet.
+if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
+    if not cuda_ndarray.use.device_number:
+        cuda_ndarray.use('gpu')
+    theano.sandbox.gpuarray.init_dev('cuda')
+if not theano.sandbox.gpuarray.pygpu_activated:
+    raise utt.SkipTest("pygpu disabled")
+if theano.config.mode == 'FAST_COMPILE':
+    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
+    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
+else:
+    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
+    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
+def test_flatten():
+    m = theano.tensor.fmatrix()
+    f = theano.function([m], m.flatten(), mode=mode_with_gpu)
+    val = numpy.random.rand(10,11).astype("float32")
+    res = f(val)
+    utt.assert_allclose(res, val.flatten())
+    assert res.shape == val.flatten().shape
+    assert GpuReshape in [type(node.op)
+                          for node in f.maker.fgraph.toposort()]
\ No newline at end of file
--- a/theano/sandbox/gpuarray/tests/test_subtensor.py
+++ b/theano/sandbox/gpuarray/tests/test_subtensor.py
+from theano.tensor.tests.test_subtensor import T_subtensor
+from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost)
+from theano.sandbox.gpuarray.subtensor import GpuSubtensor
+from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
+from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu
+from theano.compile import DeepCopyOp
+from theano import tensor
+class G_subtensor(T_subtensor):
+    def shortDescription(self):
+        return None
+    def __init__(self, name):
+        T_subtensor.__init__(self, name,
+                             shared=gpuarray_shared_constructor,
+                             sub=GpuSubtensor,
+                             mode=mode_with_gpu,
+                             # avoid errors with limited devices
+                             dtype='float32',
+                             ignore_topo=(HostFromGpu,GpuFromHost,DeepCopyOp))
+        assert self.sub == GpuSubtensor
--- a/theano/sandbox/gpuarray/tests/test_type.py
+++ b/theano/sandbox/gpuarray/tests/test_type.py
+import operator
+import theano
+from theano.compile import DeepCopyOp
+from theano.sandbox.gpuarray.tests.test_basic_ops import rand_gpuarray
+from theano.sandbox.gpuarray.type import GpuArrayType
+def test_deep_copy():
+    a = rand_gpuarray(20, dtype='float32')
+    g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
+    f = theano.function([g], g)
+    assert isinstance(f.maker.fgraph.toposort()[0].op, DeepCopyOp)
+    res = f(a)
+    assert GpuArrayType.values_eq(res, a)
--- a/theano/sandbox/gpuarray/type.py
+++ b/theano/sandbox/gpuarray/type.py
 import numpy
 import theano
+from theano.tensor.var import _tensor_py_operators
 from theano import Type, Variable, Constant, tensor, config, scalar
 from theano.compile import SharedVariable
@@ -26,7 +27,10 @@ class GpuArrayType(Type):
        except gpuarray.GpuArrayException:
            raise TypeError("Unsupported dtype for %s: %s" %
                            (self.__class__.__name__, self.dtype))
+    def __str__(self):
+        return "GpuArrayType(%s, %s)" % (self.dtype, self.broadcastable)
    def filter(self, data, strict=False, allow_downcast=None):
        if strict:
            if not isinstance(data, gpuarray.GpuArray):
@@ -103,8 +107,8 @@ class GpuArrayType(Type):
            return GpuArrayType.values_eq(a, b)
        else:
            res = elemwise2(a, '', b, a, odtype=numpy.dtype('bool'),
-                            op_tmpl="res[i] = ((%(a)s - %(b)s) <" \
+                            op_tmpl="res[i] = ((%(a)s - %(b)s) <"
-                                "(1e-8 + 1e-5 * fabs(%(b)s)))")
+                            "(1e-8 + 1e-5 * fabs(%(b)s)))")
            return numpy.asarray(res).all()
    def value_zeros(self, shape):
@@ -134,7 +138,7 @@ class GpuArrayType(Type):
            return numpy.dtype(self.dtype).itemsize
    def c_declare(self, name, sub):
-        return "GpuArrayObject *%s;" % (name,)
+        return "PyGpuArrayObject *%s;" % (name,)
    def c_init(self, name, sub):
        return "%s = NULL;" % (name,)
@@ -149,17 +153,17 @@ class GpuArrayType(Type):
        }
        /* First check if we are the base type exactly (the most common case),
           then do the full subclass check if needed. */
-        if (py_%(name)s->ob_type != &GpuArrayType &&
+        if (py_%(name)s->ob_type != &PyGpuArrayType &&
-            !PyObject_TypeCheck(py_%(name)s, &GpuArrayType)) {
+            !PyObject_TypeCheck(py_%(name)s, &PyGpuArrayType)) {
            PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
            %(fail)s
        }
-        %(name)s = (GpuArrayObject *)py_%(name)s;
+        %(name)s = (PyGpuArrayObject *)py_%(name)s;
        Py_INCREF(%(name)s);
        """ % {'name': name, 'fail': sub['fail']}
    def c_cleanup(self, name, sub):
-        return "Py_XDECREF(%(name)s); %(name)s = NULL;" % {'name': name }
+        return "Py_XDECREF(%(name)s); %(name)s = NULL;" % {'name': name}
    def c_sync(self, name, sub):
        return """
@@ -184,7 +188,8 @@ class GpuArrayType(Type):
        # We need arrayobject for the PyArrayDescr struct def
        # (even if we just use a pointer to it in a function def)
        return ['<compyte/array.h>', '<compyte/kernel.h>', '<compyte/error.h>',
-                '<numpy/arrayobject.h>', '<gpuarray_api.h>']
+                '<compyte/buffer_blas.h>', '<numpy/arrayobject.h>',
+                '<gpuarray_api.h>']
    def c_header_dirs(self):
        return [pygpu.get_include(), numpy.get_include()]
@@ -193,10 +198,13 @@ class GpuArrayType(Type):
        return ['compyte']
    def c_code_cache_version(self):
-        return (1,)
+        ver = pygpu.gpuarray.api_version()
+        # we only use the major version since the minor revision are
+        # API-compatible.
+        return (1, ver[0])
-class _operators(tensor.basic._tensor_py_operators):
+class _operators(_tensor_py_operators):
    def _as_TensorVariable(self):
        from basic_ops import host_from_gpu
        return host_from_gpu(self)
@@ -204,10 +212,6 @@ class _operators(tensor.basic._tensor_py_operators):
    def _as_GpuArrayVariable(self):
        return self
-    dtype = property(lambda s: s.type.dtype)
-    broadcastable = property(lambda s: s.type.broadcastable)
-    ndim = property(lambda s: s.type.ndim)
 class GpuArrayVariable(_operators, Variable):
    pass
@@ -276,12 +280,6 @@ theano.compile.register_view_op_c_code(GpuArrayType, """
 theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
    Py_XDECREF(%(oname)s);
-    %(oname)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
+    %(oname)s = pygpu_copy(%(iname)s, GA_ANY_ORDER);
    if (!%(oname)s) { %(fail)s }
-    int err;
+""", version=(5,))
-    err = GpuArray_copy(&%(oname)s->ga, &%(iname)s->ga, GA_ANY_ORDER);
-    if (err != GA_NO_ERROR) {
-        PyErr_SetString(PyExc_RuntimeError, "Error during copy");
-        %(fail)s
-    }
-""", version=(1,))
--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -3018,7 +3018,7 @@ class Composite(ScalarOp):
        rval = []
        for subnode in self.fgraph.toposort():
            try:
-                rval.append(subnode.op.c_support_code())
+                rval.append(subnode.op.c_support_code().strip())
            except gof.utils.MethodNotDefined:
                pass
        # remove duplicate code blocks

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -143,10 +143,6 @@ class DimShuffle(Op):
        # list of dimensions of the input to drop
        self.drop = []
-        # this maps i before dropping dimensions to j after dropping dimensions
-        # so self.shuffle can be set properly later on
-        i2j = {}
-        j = 0
        for i, b in enumerate(input_broadcastable):
            if i not in new_order:
                # we want to drop this dimension because it's not a value in
@@ -158,14 +154,9 @@ class DimShuffle(Op):
                    raise ValueError(
                            "You cannot drop a non-broadcastable dimension.",
                            (input_broadcastable, new_order))
-            else:
-                i2j[i] = j
-                j += 1
-        # transposition of non-broadcastable dimensions
+        # this is the list of the original dimensions that we keep
-        # This is how the dimensions will be permuted, without accounting for
+        self.shuffle = [x for x in new_order if x != 'x']
-        # the extra 'x' broadcastable dimensions to insert.
-        self.shuffle = [i2j[x] for x in new_order if x != 'x']
        # list of dimensions of the output that are broadcastable and were not
        # in the original input
@@ -237,16 +228,12 @@ class DimShuffle(Op):
        res = input
        if type(res) != numpy.ndarray and type(res) != numpy.memmap:
            raise TypeError(res)
-        shape = list(res.shape)
-        for drop in reversed(self.drop):
-            shape.pop(drop)
-        res = res.reshape(shape)
        # transpose
-        res = res.transpose(self.shuffle)
+        res = res.transpose(self.shuffle+self.drop)
        # augment
-        shape = list(res.shape)
+        shape = list(res.shape[:len(self.shuffle)])
        for augm in self.augment:
            shape.insert(augm, 1)
        res = res.reshape(shape)
@@ -259,9 +246,6 @@ class DimShuffle(Op):
    def infer_shape(self, node, shapes):
        ishp, = shapes
-        ishp = list(ishp)
-        for drop in reversed(self.drop):
-            del ishp[drop]
        # transpose
        rval = [ishp[i] for i in self.shuffle]

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -410,9 +410,9 @@ def local_dimshuffle_lift(node):
    inode = input.owner
    if inode and isinstance(inode.op, Elemwise) and (len(input.clients) == 1):
        # Don't use make_node to have tag.test_value set.
-        ret = inode.op(*[DimShuffle(inp.type.broadcastable,
+        ret = inode.op(*[op.__class__(inp.type.broadcastable,
-                                    op.new_order,
+                                      op.new_order,
-                                    op.inplace)(inp) for inp in
+                                      op.inplace)(inp) for inp in
                         inode.inputs], **dict(return_list=True))
        return ret
    if inode and isinstance(inode.op, DimShuffle):
@@ -424,8 +424,8 @@ def local_dimshuffle_lift(node):
                                                   iinput.type.ndim):
            return [iinput]
        else:
-            ret = DimShuffle(iinput.type.broadcastable, new_order,
+            ret = op.__class__(iinput.type.broadcastable, new_order,
-                             inplace)(iinput, **dict(return_list=True))
+                               inplace)(iinput, **dict(return_list=True))
            return ret
@@ -460,7 +460,7 @@ def dimshuffle_as_view(node):
    op = node.op
    if not isinstance(op, DimShuffle) or op.inplace:
        return False
-    new_op = DimShuffle(op.input_broadcastable, op.new_order, inplace=True)
+    new_op = op.__class__(op.input_broadcastable, op.new_order, inplace=True)
    return [new_op(*node.inputs)]
 #Step 60 is the inplace optimization stage.
@@ -4609,7 +4609,7 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 1024):
        # worthwhile if the summation axis doesn't line up with a
        # contiguous dimension)
-        if not isinstance(node.op, OP):
+        if type(node.op) is not OP:
            return False
        inputs = []  # inputs of the new Elemwise op.
        s_inputs = []  # inputs of the new scalar op used by the Composite.

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -44,7 +44,7 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
        dtensor3, SpecifyShape, Mean,
        itensor3, Tile, switch, Diagonal, Diag,
        nonzero, flatnonzero, nonzero_values,
-        stacklists)
+        stacklists, DimShuffle)
 from theano.tests import unittest_tools as utt
@@ -4204,9 +4204,30 @@ class T_op_cache(unittest.TestCase):
        self.assertTrue(numpy.all(fn_py(a) == fn_c_or_py(a)))
-class T_reshape(unittest.TestCase):
+class T_reshape(utt.InferShapeTester, utt.TestOptimizationMixin):
-    def setUp(self):
+    def __init__(self, name, shared=tensor._shared, op=Reshape, mode=None,
-        utt.seed_rng()
+                 ignore_topo=(DeepCopyOp, opt.MakeVector,
+                              opt.Shape_i, DimShuffle, theano.tensor.Elemwise)):
+        self.shared = shared
+        self.op = op
+        # The 'canonicalize' tag is needed for the shape test in FAST_COMPILE
+        self.mode = mode
+        self.ignore_topo = ignore_topo
+        return super(T_reshape, self).__init__(name)
+    def function(self, inputs, outputs):
+        f = function(inputs, outputs, mode=self.mode)
+        if self.mode is not None or theano.config.mode != "FAST_COMPILE":
+            topo = f.maker.fgraph.toposort()
+            topo_ = [node for node in topo if not isinstance(node.op,
+                                                             self.ignore_topo)]
+            assert len(topo_) == 1, topo_
+        return f
+    def eval_output_and_check(self, t):
+        f = self.function([], t)
+        tval = f()
+        return tval
    def test_reshape(self):
        a = dvector()
@@ -4215,7 +4236,7 @@ class T_reshape(unittest.TestCase):
        #basic to 1 dim(without list)
        c = reshape(b, as_tensor_variable(6), ndim=1)
-        f = inplace_func([b], c)
+        f = self.function([b], c)
        b_val1 = numpy.asarray([[0, 1, 2], [3, 4, 5]])
        c_val1 = numpy.asarray([0, 1, 2, 3, 4, 5])
@@ -4231,7 +4252,7 @@ class T_reshape(unittest.TestCase):
        #basic to 1 dim(with list)
        c = reshape(b, (as_tensor_variable(6),), ndim=1)
-        f = inplace_func([b], c)
+        f = self.function([b], c)
        assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]])) ==
                         numpy.asarray([0, 1, 2, 3, 4, 5]))
        #print f.maker.fgraph.toposort()
@@ -4239,14 +4260,14 @@ class T_reshape(unittest.TestCase):
        #basic to shape object of same ndim
        c = reshape(b, d.shape)
-        f = inplace_func([b, d], c)
+        f = self.function([b, d], c)
        assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]]),
                           [[0, 1], [2, 3], [4, 5]]) ==
                         numpy.asarray([[0, 1], [2, 3], [4, 5]]))
        #basic to 2 dims
        c = reshape(a, [2, 3])
-        f = inplace_func([a], c)
+        f = self.function([a], c)
        assert numpy.all(f(numpy.asarray([0, 1, 2, 3, 4, 5])) ==
                         numpy.asarray([[0, 1, 2], [3, 4, 5]]))
@@ -4255,7 +4276,7 @@ class T_reshape(unittest.TestCase):
        a_val_copy = numpy.asarray([0, 1, 2, 3, 4, 5])
        b_val = numpy.asarray([[0, 1, 2], [3, 4, 5]])
-        f_sub = inplace_func([a, b], c - b)
+        f_sub = self.function([a, b], c - b)
        assert numpy.all(f_sub(a_val, b_val) == 0.0)
        assert numpy.all(a_val == a_val_copy)
@@ -4264,35 +4285,33 @@ class T_reshape(unittest.TestCase):
        a_val_copy = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float64')
        b_val = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float64')
-        f_sub = inplace_func([a, b], c - b)
+        f_sub = self.function([a, b], c - b)
        assert numpy.all(f_sub(a_val, b_val) == 0.0)
        assert numpy.all(a_val == a_val_copy)
        # verify gradient
        def just_vals(v):
            return Reshape(2)(v, theano._asarray([2, 3], dtype='int32'))
-        utt.verify_grad(just_vals, [a_val])
+        utt.verify_grad(just_vals, [a_val], mode=self.mode)
        #test infer_shape
-        f_sub = function([a, b], (c - b).shape)
+        self._compile_and_check([a], [c], (a_val,), self.op)
-        if config.mode == "FAST_COMPILE":
-            assert len(f_sub.maker.fgraph.toposort()) == 3
-        else:
-            topo = f_sub.maker.fgraph.toposort()
-            assert len(topo) == 1
-            topo[0].op == theano.compile.function_module.deep_copy_op
-            #assert numpy.all(f_sub(a_val,numpy.asarray([[0,1],[2,3],[4,5]]))==[2,3])#work in FAST_RUN, but fail on other!
-            #assert numpy.all(f_sub(a_val,numpy.asarray([[0,1],[2,3],[4,5],[6,7]]))==[2,3])#work in FAST_RUN, but fail on other!
        # test broadcast flag for constant value of 1
        c = reshape(b, (b.shape[0], b.shape[1], 1))
-        f = inplace_func([b], c)
+        f = self.function([b], c)
        assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]])) ==
                         numpy.asarray([[[0], [1], [2]], [[3], [4], [5]]]))
        assert (f.maker.fgraph.toposort()[-2].outputs[0].type.broadcastable ==
                (False, False, True))
-        assert numpy.all(f_sub(a_val, b_val) == [2, 3])
+    def test_m1(self):
+        t = tensor3()
+        rng = numpy.random.RandomState(seed=utt.fetch_seed())
+        val = rng.uniform(size=(3, 4, 5)).astype(config.floatX)
+        for out in [t.reshape([-1]), t.reshape([-1, 5]),
+                    t.reshape([5, -1]), t.reshape([5, -1, 3])]:
+            self._compile_and_check([t], [out], [val], self.op)
    def test_reshape_long_in_shape(self):
        v = dvector('v')
@@ -4311,14 +4330,14 @@ class T_reshape(unittest.TestCase):
        r = a.reshape(shapes, ndim=1)
        z = zeros_like(r)
-        f = function([a, shapes], z.shape)
+        f = self.function([a, shapes], z.shape)
        self.assertRaises(ValueError, f, a_val, [13])
        #Test reshape to 2 dim
        r = a.reshape(shapes, ndim=2)
        z = zeros_like(r)
-        f = function([a, shapes], z.shape)
+        f = self.function([a, shapes], z.shape)
        self.assertRaises(ValueError, f, a_val, [-1, 5])
        self.assertRaises(ValueError, f, a_val, [7, -1])

--- a/theano/tensor/tests/test_elemwise.py
+++ b/theano/tensor/tests/test_elemwise.py
--- a/theano/tensor/tests/test_subtensor.py
+++ b/theano/tensor/tests/test_subtensor.py
@@ -122,10 +122,9 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
        try:
            try:
                self.eval_output_and_check(t)
-                assert 0
+            except IndexError, e:
-            except Exception, e:
+                return
-                if 'out of bounds' not in exc_message(e):
+            self.fail()
-                    raise
        finally:
            _logger.setLevel(oldlevel)
@@ -161,7 +160,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
    def test1_0_dims(self):
        n = self.shared(numpy.ones((), dtype=self.dtype))
-        t = theano.tensor.Subtensor([])(n)
+        t = self.sub([])(n)
        self.assertTrue(isinstance(t.owner.op, Subtensor))
        mode = self.mode
        self.mode = mode.excluding("local_useless_subtensor")
@@ -188,7 +187,6 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
        self.assertTrue(tval == 5.0)
    def test1_ok_range_infinite(self):
-        #Subtensor.debug = True
        n = self.shared(numpy.arange(3, dtype=self.dtype))
        t = n[1:]
        self.assertTrue(isinstance(t.owner.op, Subtensor))

--- a/theano/tensor/var.py
+++ b/theano/tensor/var.py
@@ -543,8 +543,8 @@ class _tensor_py_operators:
    def get_scalar_constant_value(self):
        return theano.tensor.basic.get_scalar_constant_value(self)
-    def zeros_like(self, dtype=None):
+    def zeros_like(model, dtype=None):
-        return theano.tensor.basic.zeros_like(self, dtype=dtype)
+        return theano.tensor.basic.zeros_like(model, dtype=dtype)
 class TensorVariable(_tensor_py_operators, Variable):

--- a/theano/tests/unittest_tools.py
+++ b/theano/tests/unittest_tools.py
@@ -182,7 +182,10 @@ class InferShapeTester(unittest.TestCase):
    def setUp(self):
        seed_rng()
        # Take into account any mode that may be defined in a child class
-        mode = getattr(self, 'mode', theano.compile.get_default_mode())
+        # (the attribute may be present but set to None).
+        mode = getattr(self, 'mode', None)
+        if mode is None:
+            mode = theano.compile.get_default_mode()
        # This mode seems to be the minimal one including the shape_i
        # optimizations, if we don't want to enumerate them explicitly.
        self.mode = mode.including("canonicalize")