提交 cfd5c827 authored 作者: abergeron's avatar abergeron

Merge pull request #1626 from nouiz/gpu

Assorted gpuarray work.
from theano.compile.ops import ( from theano.compile.ops import (
DeepCopyOp, deep_copy_op, register_deep_copy_op_c_code, DeepCopyOp, deep_copy_op, register_deep_copy_op_c_code,
Shape_i, register_shape_i_c_code,
ViewOp, view_op, register_view_op_c_code) ViewOp, view_op, register_view_op_c_code)
from theano.compile.function_module import * from theano.compile.function_module import *
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
import copy import copy
import warnings import warnings
#import theano import theano
from theano import gof from theano import gof
...@@ -155,7 +155,7 @@ class DeepCopyOp(gof.Op): ...@@ -155,7 +155,7 @@ class DeepCopyOp(gof.Op):
# Else, we will return a list of (type name, version) pairs. # Else, we will return a list of (type name, version) pairs.
for t, (c, v) in sorted(self.c_code_and_version.items(), key=lambda pair: str(pair[0])): for t, (c, v) in sorted(self.c_code_and_version.items(), key=lambda pair: str(pair[0])):
if not v: if not v:
warnings.warn("Type %s has C code for OutputGuard, but it has " warnings.warn("Type %s has C code for DeepCopyOp, but it has "
"no version. You should add a 'version' keyword arg " "no version. You should add a 'version' keyword arg "
"when calling register_OutputGuard_c_code." % t, "when calling register_OutputGuard_c_code." % t,
stacklevel=2) stacklevel=2)
...@@ -180,6 +180,99 @@ class DeepCopyOp(gof.Op): ...@@ -180,6 +180,99 @@ class DeepCopyOp(gof.Op):
deep_copy_op = DeepCopyOp() deep_copy_op = DeepCopyOp()
class Shape_i(gof.Op):
    """
    L{Op} to return the shape of a matrix.

    The Op takes a single variable input and returns, as an int64 scalar,
    the size of dimension ``i`` of that variable's value.

    @note: Non-differentiable.
    """
    # Mapping from Type to C code (and version) to use.
    # In the C code, the name of the input variable is %(iname)s,
    # the output variable is %(oname)s.
    c_code_and_version = {}

    def __init__(self, i):
        # i: index of the dimension whose size this Op extracts.
        self.i = i

    def __hash__(self):
        return hash(type(self)) ^ self.i

    def __eq__(self, other):
        return type(self) == type(other) and self.i == other.i

    def __str__(self):
        return '%s{%i}' % (self.__class__.__name__, self.i)

    def make_node(self, x):
        # x could be one of a number of types
        # the only thing we require is that the variable have a .ndim,
        # and that the value have a .shape
        if not isinstance(x, theano.Variable):
            raise TypeError('x must be Variable with ndim attribute', x)
        if x.ndim <= self.i:
            raise TypeError('x has too few dimensions for Shape_i',
                            (x, self.i))
        return theano.Apply(self, [x], [theano.tensor.lscalar()])

    def perform(self, node, inp, out_):
        x, = inp
        out, = out_
        if out[0] is None:
            out[0] = theano._asarray(x.shape[self.i], dtype='int64')
        else:
            # Reuse the previously allocated 0-d int64 output array.
            out[0][...] = x.shape[self.i]

    def c_code_cache_version(self):
        version = []
        # If any of the c code is unversionned, we have to return ()
        # Else, we will return a list of (type name, version) pairs.
        for t, (c, v) in sorted(self.c_code_and_version.items(),
                                key=lambda pair: str(pair[0])):
            if not v:
                # BUGFIX: the message previously pointed users at
                # register_OutputGuard_c_code (copy-paste from another Op);
                # the correct registration function is
                # register_shape_i_c_code.
                warnings.warn("Type %s has C code for Shape_i, but it has "
                              "no version. You should add a 'version' keyword arg "
                              "when calling register_shape_i_c_code." % t,
                              stacklevel=2)
                return ()
            version.append((str(t), v))
        return tuple(version)

    def c_code(self, node, name, inames, onames, sub):
        iname, = inames
        oname, = onames
        fail = sub['fail']
        i = self.i
        # Dispatch on the class of the input's Type: each registered Type
        # supplies its own C snippet via register_shape_i_c_code.
        itype = node.inputs[0].type.__class__
        if itype in self.c_code_and_version:
            code, version = self.c_code_and_version[itype]
            return code % locals()

        # Else, no C code: fall back to the (python-only) default.
        return super(Shape_i, self).c_code(node, name, inames, onames, sub)

    def infer_shape(self, node, input_shapes):
        # Output is a 0-d scalar, so its shape is the empty tuple.
        return [()]

    def grad(self, inp, grads):
        # Non-differentiable: shapes are integer-valued.
        return [None]
def register_shape_i_c_code(typ, code, version=()):
    """ Tell Shape_i how to generate C code for a Theano Type

    :param typ: A Theano type. It must be the Theano class itself and not an
                instance of the class.

    :param code: C code that returns, as an npy_int64, the size of
                 dimension %(i)s of a variable of type 'typ'.
                 Use %(iname)s and %(oname)s for the input and output C
                 variable names respectively.

    :param version: A number indicating the version of the code, for cache.
    """
    Shape_i.c_code_and_version[typ] = (code, version)
# List of Theano Types that one can add an extra dimension and for which # List of Theano Types that one can add an extra dimension and for which
# Scan can deal with. # Scan can deal with.
expandable_types = () expandable_types = ()
...@@ -438,6 +438,13 @@ theano.compile.register_view_op_c_code( ...@@ -438,6 +438,13 @@ theano.compile.register_view_op_c_code(
""", """,
version=1) version=1)
theano.compile.register_shape_i_c_code(CudaNdarrayType, """
if(!%(oname)s)
%(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
((npy_int64*)PyArray_DATA(%(oname)s))[0] =
CudaNdarray_HOST_DIMS(%(iname)s)[%(i)s];
""", version=(0,))
# Register CudaNdarrayType to the DeepCopyOp list of types with c code. # Register CudaNdarrayType to the DeepCopyOp list of types with c code.
theano.compile.register_deep_copy_op_c_code( theano.compile.register_deep_copy_op_c_code(
CudaNdarrayType, CudaNdarrayType,
......
...@@ -532,8 +532,26 @@ cuda_from_gpu = CudaFromGpu() ...@@ -532,8 +532,26 @@ cuda_from_gpu = CudaFromGpu()
class GpuAlloc(HideC, Alloc): class GpuAlloc(HideC, Alloc):
def __init__(self, memset_0=False):
    """memset_0 is only an optimization flag.  If True, the fill value
    is known to always be 0, so the C code calls memset, which is
    faster than a generic element copy.
    """
    self.memset_0 = memset_0
def __eq__(self, other):
    # Two GpuAlloc Ops are interchangeable only if they agree on the
    # memset_0 optimization flag (it changes the generated C code).
    return type(self) == type(other) and self.memset_0 == other.memset_0
def __hash__(self):
    # Keep consistent with __eq__: hash includes the memset_0 flag.
    return hash(type(self)) ^ hash(self.memset_0)
def __str__(self): def __str__(self):
return 'GpuAlloc' #Hide the memset parameter when not used to prevent confusion.
if self.memset_0:
s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
else:
s = self.__class__.__name__
return s
def make_node(self, value, *shape): def make_node(self, value, *shape):
res = Alloc.make_node(self, value, *shape) res = Alloc.make_node(self, value, *shape)
...@@ -542,6 +560,9 @@ class GpuAlloc(HideC, Alloc): ...@@ -542,6 +560,9 @@ class GpuAlloc(HideC, Alloc):
broadcastable=res.outputs[0].broadcastable) broadcastable=res.outputs[0].broadcastable)
return Apply(self, [value] + res.inputs[1:], [otype()]) return Apply(self, [value] + res.inputs[1:], [otype()])
def c_headers(self):
    # numpy_compat.h provides the numpy-compatibility macros used by
    # the generated C code.
    return ['<compyte/numpy_compat.h>']
def perform(self, node, inputs, outs): def perform(self, node, inputs, outs):
out, = outs out, = outs
v = inputs[0] v = inputs[0]
...@@ -562,6 +583,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -562,6 +583,7 @@ class GpuAlloc(HideC, Alloc):
ndim = len(inp[1:]) ndim = len(inp[1:])
zz, = out zz, = out
memset_0 = int(self.memset_0)
code = """ code = """
int i; int i;
size_t %(name)s_shape[%(ndim)s]; size_t %(name)s_shape[%(ndim)s];
...@@ -579,21 +601,45 @@ class GpuAlloc(HideC, Alloc): ...@@ -579,21 +601,45 @@ class GpuAlloc(HideC, Alloc):
for (i = 0; i < %(ndim)s; i++) for (i = 0; i < %(ndim)s; i++)
need_new_out |= %(zz)s->ga.dimensions[i] != %(name)s_shape[i]; need_new_out |= %(zz)s->ga.dimensions[i] != %(name)s_shape[i];
if (need_new_out) { if (need_new_out && (%(memset_0)s)) {
//pygpu_zeros can be faster then empty followed by memset.
Py_XDECREF(%(zz)s); Py_XDECREF(%(zz)s);
%(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape, %(zz)s = pygpu_zeros(%(ndim)s, %(name)s_shape,
%(vv)s->ga.typecode, GA_C_ORDER, %(vv)s->ga.typecode, GA_C_ORDER,
pygpu_default_context(), Py_None); pygpu_default_context(), Py_None);
if (!%(zz)s) { if (!%(zz)s) {
%(fail)s %(fail)s
} }
} else {
if (need_new_out) {
Py_XDECREF(%(zz)s);
%(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape,
%(vv)s->ga.typecode, GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(zz)s) {
%(fail)s
}
}
if (%(memset_0)s && GpuArray_ISONESEGMENT(&%(zz)s->ga))
{
int err = GpuArray_memset(&%(zz)s->ga, 0);
if (err != GA_NO_ERROR)
{
PyErr_Format(PyExc_MemoryError,
"GpuAlloc: Error memsetting %%d"
" element of device memory to 0.",
PyGpuArray_SIZE(%(zz)s));
%(fail)s;
}
}
else if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) !=
GA_NO_ERROR) {
PyErr_SetString(PyExc_ValueError, "setarray failed");
%(fail)s
}
} }
""" % dict(name=name, ndim=ndim, zz=zz, vv=vv,
if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) != GA_NO_ERROR) { fail=sub['fail'], memset_0=memset_0)
PyErr_SetString(PyExc_ValueError, "setarray failed");
%(fail)s
}
""" % dict(name=name, ndim=ndim, zz=zz, vv=vv, fail=sub['fail'])
if config.gpuarray.sync: if config.gpuarray.sync:
code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz) code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
...@@ -601,7 +647,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -601,7 +647,7 @@ class GpuAlloc(HideC, Alloc):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
......
from theano import Op, Apply, config from theano import Op, Apply, config
from theano.tensor.blas import Gemv, Gemm from theano.tensor.blas import Dot22, Gemv, Gemm
from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable) from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable)
try: try:
...@@ -28,12 +28,16 @@ class GpuGemv(BlasOp, Gemv): ...@@ -28,12 +28,16 @@ class GpuGemv(BlasOp, Gemv):
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y)
assert A.dtype == x.dtype == y.dtype == alpha.dtype == beta.dtype
return Apply(self, [y, alpha, A, x, beta], [y.type()]) return Apply(self, [y, alpha, A, x, beta], [y.type()])
def perform(self, node, inputs, out_storage): def perform(self, node, inputs, out_storage):
y, alpha, A, x, beta = inputs y, alpha, A, x, beta = inputs
out_storage[0][0] = blas.gemv(alpha, A, x, beta, y, trans=False, inplace = self.inplace
overwrite_y=self.inplace) if inplace and y.strides[0] < 0:
inplace = False
out_storage[0][0] = blas.gemv(alpha, A, x, beta, y,
overwrite_y=inplace)
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3], vars = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3],
...@@ -64,7 +68,7 @@ class GpuGemv(BlasOp, Gemv): ...@@ -64,7 +68,7 @@ class GpuGemv(BlasOp, Gemv):
if config.gpuarray.sync: if config.gpuarray.sync:
code += """ code += """
GpuArray_sync(&%(out)s->ga); GpuArray_sync(&%(out)s->ga);
""" """ % vars
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
...@@ -80,12 +84,16 @@ class GpuGemm(BlasOp, Gemm): ...@@ -80,12 +84,16 @@ class GpuGemm(BlasOp, Gemm):
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A)
B = as_gpuarray_variable(B) B = as_gpuarray_variable(B)
C = as_gpuarray_variable(C) C = as_gpuarray_variable(C)
assert A.dtype == B.dtype == C.dtype == alpha.dtype == beta.dtype
return Apply(self, [C, alpha, A, B, beta], [C.type()]) return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs):
C, alpha, A, B, beta = inputs C, alpha, A, B, beta = inputs
inplace = self.inplace
if inplace and not C.flags.forc:
inplace = False
outputs[0][0] = blas.gemm(alpha, A, B, beta, C, outputs[0][0] = blas.gemm(alpha, A, B, beta, C,
overwrite_c=self.inplace) overwrite_c=inplace)
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3], vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
...@@ -116,7 +124,7 @@ class GpuGemm(BlasOp, Gemm): ...@@ -116,7 +124,7 @@ class GpuGemm(BlasOp, Gemm):
if config.gpuarray.sync: if config.gpuarray.sync:
code += """ code += """
GpuArray_sync(&%(out)s->ga); GpuArray_sync(&%(out)s->ga);
""" """ % vars
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
...@@ -126,6 +134,67 @@ class GpuGemm(BlasOp, Gemm): ...@@ -126,6 +134,67 @@ class GpuGemm(BlasOp, Gemm):
gpugemm_no_inplace = GpuGemm(inplace=False) gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True) gpugemm_inplace = GpuGemm(inplace=True)
class GpuDot22(BlasOp, Dot22):
    """GPU version of Dot22: the matrix-matrix product of two 2-d
    variables, with no scaling and no accumulation (alpha=1, beta=0).
    """

    def make_node(self, x, y):
        # Reuse the CPU Op's make_node for its input validation, then
        # rebuild the Apply with GPU variables so the output is a GPU type.
        res = Dot22.make_node(self, x, y)
        x = as_gpuarray_variable(x)
        y = as_gpuarray_variable(y)
        assert x.dtype == y.dtype
        return Apply(self, [x, y], [x.type()])

    def perform(self, node, inputs, outputs):
        x, y = inputs

        # dot22 is gemm with alpha=1, beta=0 into a fresh output buffer,
        # so the gemm may overwrite it.
        out = pygpu.empty((x.shape[0], y.shape[1]), dtype=x.dtype)
        outputs[0][0] = blas.gemm(1., x, y, 0., out,
                                  overwrite_c=True)

    def c_code(self, node, name, inputs, outputs, sub):
        dtype = node.inputs[0].dtype
        typecode = pygpu.gpuarray.dtype_to_typecode(dtype)
        vars = dict(A=inputs[0], B=inputs[1], dtype=dtype, out=outputs[0],
                    typecode=typecode,
                    fail=sub['fail'], name=name)
        # Allocate a fresh C-ordered output of shape (A rows, B cols),
        # then run a no-transpose gemm with alpha=1, beta=0 into it.
        code = """
        double one = 1.;
        double zero = 0.;

        size_t dims[] = {0, 0};
        dims[0] = PyGpuArray_DIMS(%(A)s)[0];
        dims[1] = PyGpuArray_DIMS(%(B)s)[1];

        %(out)s = pygpu_empty(2, dims,
                              %(typecode)s,
                              GA_C_ORDER,
                              pygpu_default_context(), Py_None);
        if (!%(out)s) {
            %(fail)s
        }
        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
                             one,
                             %(A)s, %(B)s,
                             zero,
                             %(out)s) == NULL) {
            %(fail)s
        }
        """ % vars
        if config.gpuarray.sync:
            # Optionally wait for the kernel, for easier debugging/timing.
            code += """
            GpuArray_sync(&%(out)s->ga);
            """ % vars
        return code

    def c_code_cache_version(self):
        return (0,)

    def c_headers(self):
        ret = super(GpuDot22, self).c_headers()
        return ret + ['<compyte/numpy_compat.h>']


gpu_dot22 = GpuDot22()
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.opt import in2out from theano.tensor.opt import in2out
......
...@@ -3,7 +3,8 @@ import theano ...@@ -3,7 +3,8 @@ import theano
import numpy import numpy
from theano import tensor, scalar from theano import tensor, scalar
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler, Optimizer, toolbox, DestroyHandler,
InconsistencyError, EquilibriumOptimizer) InconsistencyError, EquilibriumOptimizer)
...@@ -12,12 +13,15 @@ from theano.sandbox.gpuarray.type import GpuArrayType ...@@ -12,12 +13,15 @@ from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
gpu_from_host, gpu_from_host,
gpu_alloc, GpuReshape, gpu_alloc,
GpuAlloc,
GpuReshape,
GpuEye) GpuEye)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce) GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.blas import GpuGemv, GpuGemm from theano.sandbox.gpuarray.type import GpuArrayConstant
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
...@@ -52,7 +56,7 @@ def op_lifter(OP): ...@@ -52,7 +56,7 @@ def op_lifter(OP):
""" """
def f(maker): def f(maker):
def local_opt(node): def local_opt(node):
if type(node.op) is OP: if type(node.op) in OP:
# This does not support nodes that have more than one output. # This does not support nodes that have more than one output.
assert len(node.outputs) == 1 assert len(node.outputs) == 1
# either one of our inputs is on the gpu or # either one of our inputs is on the gpu or
...@@ -70,7 +74,7 @@ def op_lifter(OP): ...@@ -70,7 +74,7 @@ def op_lifter(OP):
return [host_from_gpu(new_op)] return [host_from_gpu(new_op)]
return False return False
local_opt.__name__ = maker.__name__ local_opt.__name__ = maker.__name__
return local_optimizer([OP])(local_opt) return local_optimizer(OP)(local_opt)
return f return f
...@@ -120,13 +124,25 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua', ...@@ -120,13 +124,25 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
@register_opt() @register_opt()
@op_lifter(tensor.Alloc) @op_lifter([tensor.Alloc])
def local_gpualloc(node): def local_gpualloc(node):
return gpu_alloc return gpu_alloc
@register_opt() @register_opt()
@op_lifter(tensor.Reshape) @local_optimizer([GpuAlloc])
def local_gpualloc_memset_0(node):
    """Swap a GpuAlloc for its memset_0 variant when the fill value is
    a constant scalar zero (memset on device is faster than a copy).
    """
    op = node.op
    if not isinstance(op, GpuAlloc) or op.memset_0:
        return
    fill_value = node.inputs[0]
    if not isinstance(fill_value, GpuArrayConstant):
        return
    if fill_value.data.size != 1:
        return
    if not (numpy.asarray(fill_value.data) == 0).all():
        return
    return [GpuAlloc(memset_0=True)(*node.inputs)]
@register_opt()
@op_lifter([tensor.Reshape])
def local_gpureshape(node): def local_gpureshape(node):
op = node.op op = node.op
name = op.name name = op.name
...@@ -137,7 +153,7 @@ def local_gpureshape(node): ...@@ -137,7 +153,7 @@ def local_gpureshape(node):
@register_opt() @register_opt()
@op_lifter(tensor.Flatten) @op_lifter([tensor.Flatten])
def local_gpuflatten(node): def local_gpuflatten(node):
op = node.op op = node.op
shp =[] shp =[]
...@@ -150,10 +166,12 @@ def local_gpuflatten(node): ...@@ -150,10 +166,12 @@ def local_gpuflatten(node):
@register_opt() @register_opt()
@op_lifter(tensor.Elemwise) @op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node): def local_gpu_elemwise(node):
op = node.op op = node.op
name = op.name name = op.name
if node.outputs[0].ndim == 0:
return
if name: if name:
name = 'Gpu'+name name = 'Gpu'+name
res = GpuElemwise(op.scalar_op, name=name, res = GpuElemwise(op.scalar_op, name=name,
...@@ -193,26 +211,26 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75, ...@@ -193,26 +211,26 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt() @register_opt()
@op_lifter(tensor.DimShuffle) @op_lifter([tensor.DimShuffle])
def local_gpua_dimshuffle(node): def local_gpua_dimshuffle(node):
return GpuDimShuffle(node.op.input_broadcastable, return GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order) node.op.new_order)
@register_opt() @register_opt()
@op_lifter(tensor.SpecifyShape) @op_lifter([tensor.SpecifyShape])
def local_gpua_specifyShape(node): def local_gpua_specifyShape(node):
return tensor.specify_shape return tensor.specify_shape
@register_opt() @register_opt()
@op_lifter(tensor.Subtensor) @op_lifter([tensor.Subtensor])
def local_gpua_subtensor(node): def local_gpua_subtensor(node):
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(node.op.idx_list)
@register_opt() @register_opt()
@op_lifter(tensor.CAReduce) @op_lifter([tensor.CAReduce, tensor.Sum])
def local_gpua_careduce(node): def local_gpua_careduce(node):
if (isinstance(node.op.scalar_op, scalar.basic.Add) or if (isinstance(node.op.scalar_op, scalar.basic.Add) or
isinstance(node.op.scalar_op, scalar.basic.Mul)): isinstance(node.op.scalar_op, scalar.basic.Mul)):
...@@ -220,23 +238,32 @@ def local_gpua_careduce(node): ...@@ -220,23 +238,32 @@ def local_gpua_careduce(node):
dtype=getattr(node.op, 'dtype', None), dtype=getattr(node.op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(node.op, 'acc_dtype', None))
@register_opt() @register_opt()
@op_lifter(tensor.blas.Gemv) @op_lifter([tensor.blas.Gemv])
def local_gpua_gemv(node): def local_gpua_gemv(node):
return GpuGemv(inplace=node.op.inplace) return GpuGemv(inplace=node.op.inplace)
@register_opt() @register_opt()
@op_lifter(tensor.blas_c.CGemv) @op_lifter([tensor.blas_c.CGemv])
def local_gpua_gemv2(node): def local_gpua_gemv2(node):
return GpuGemv(inplace=node.op.inplace) return GpuGemv(inplace=node.op.inplace)
@register_opt() @register_opt()
@op_lifter(tensor.blas.Gemm) @op_lifter([tensor.blas.Gemm])
def local_gpua_gemm(node): def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace) return GpuGemm(inplace=node.op.inplace)
@register_opt() @register_opt()
@op_lifter(tensor.basic.Eye) @op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node):
return gpu_dot22
@register_opt()
@op_lifter([tensor.basic.Eye])
def local_gpua_eye(node): def local_gpua_eye(node):
return GpuEye(dtype=node.op.dtype) return GpuEye(dtype=node.op.dtype)
...@@ -336,3 +336,39 @@ def test_gpueye(): ...@@ -336,3 +336,39 @@ def test_gpueye():
# M != N, k = 0 # M != N, k = 0
yield check, dtype, 3, 5 yield check, dtype, 3, 5
yield check, dtype, 5, 3 yield check, dtype, 5, 3
def test_hostfromgpu_shape_i():
    """
    Test that the shape is lifted over hostfromgpu
    """

    m = mode_with_gpu.including('local_dot_to_dot22',
                                'local_dot22_to_dot22scalar', 'specialize')
    a = T.fmatrix('a')
    ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()

    av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
    cv = gpuarray.asarray(numpy.random.rand(5, 4),
                          dtype='float32')

    gpu_from_host = theano.sandbox.gpuarray.basic_ops.gpu_from_host
    host_from_gpu = theano.sandbox.gpuarray.basic_ops.host_from_gpu

    # host -> gpu direction: the transfer Op itself must be present...
    f = theano.function([a], gpu_from_host(a), mode=m)
    assert gpu_from_host in [x.op
                             for x in f.maker.fgraph.toposort()]
    # ...but taking the shape must be lifted to host-side Shape_i Ops
    # on the original input (no GPU transfer left in the graph).
    f = theano.function([a], gpu_from_host(a).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, T.opt.Shape_i)
    assert isinstance(topo[1].op, T.opt.Shape_i)
    assert isinstance(topo[2].op, T.opt.MakeVector)
    assert tuple(f(av)) == (5, 4)

    # gpu -> host direction: same expectation, with the registered
    # theano.compile.Shape_i working directly on the GPU variable.
    f = theano.function([ca], host_from_gpu(ca), mode=m)
    assert host_from_gpu in [x.op
                             for x in f.maker.fgraph.toposort()]
    f = theano.function([ca], host_from_gpu(ca).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, theano.compile.Shape_i)
    assert isinstance(topo[1].op, theano.compile.Shape_i)
    assert isinstance(topo[2].op, theano.tensor.opt.MakeVector)
    assert tuple(f(cv)) == (5, 4)
from unittest import TestCase from unittest import TestCase
from theano.tensor.blas import gemv_inplace, gemm_inplace import theano
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand
from theano.sandbox.gpuarray.blas import (gpugemv_inplace, from theano.sandbox.gpuarray.blas import (gpugemv_inplace,
gpugemm_inplace) gpugemm_inplace, gpu_dot22)
GpuGemvTester = makeTester('GpuGemvTester', GpuGemvTester = makeTester('GpuGemvTester',
op=gemv_inplace, gpu_op=gpugemv_inplace, op=gemv_inplace, gpu_op=gpugemv_inplace,
...@@ -29,7 +31,28 @@ GpuGemmTester = makeTester('GpuGemmTester', ...@@ -29,7 +31,28 @@ GpuGemmTester = makeTester('GpuGemmTester',
test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6], test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0], test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0], test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.0], test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.0], test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
) # test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
# Tester comparing the CPU _dot22 Op against its GPU counterpart on
# representative shapes, including size-1 dimensions.
GpuDot22Tester = makeTester(
    # BUGFIX: the tester name was copy-pasted as 'GpuGemmTester', which
    # collided with the gemm tester's name; use the correct name.
    'GpuDot22Tester',
    op=_dot22, gpu_op=gpu_dot22,
    cases=dict(
        test1=[rand(3, 4), rand(4, 5)],
        test2=[rand(1, 4), rand(4, 5)],
        test3=[rand(3, 1), rand(1, 5)],
        test4=[rand(3, 4), rand(4, 1)],
        # Empty-dimension cases are currently disabled:
        # test5=[rand(0, 4), rand(4, 5)],
        # test6=[rand(3, 0), rand(0, 5)],
        # test7=[rand(3, 4), rand(4, 0)],
        # test8=[rand(0, 4), rand(4, 0)],
        # test9=[rand(0, 0), rand(0, 0)],
    )
)
) )
...@@ -2,7 +2,8 @@ import numpy ...@@ -2,7 +2,8 @@ import numpy
import theano import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.sandbox.gpuarray.basic_ops import GpuReshape from theano.sandbox.gpuarray.basic_ops import GpuAlloc, GpuReshape, gpu_alloc
from theano.sandbox.gpuarray.elemwise import GpuCAReduce
import theano.sandbox.gpuarray import theano.sandbox.gpuarray
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
...@@ -29,7 +30,7 @@ else: ...@@ -29,7 +30,7 @@ else:
def test_flatten(): def test_flatten():
m = theano.tensor.fmatrix() m = theano.tensor.fmatrix()
f = theano.function([m], m.flatten(), mode=mode_with_gpu) f = theano.function([m], m.flatten(), mode=mode_with_gpu)
val = numpy.random.rand(10,11).astype("float32") val = numpy.random.rand(10, 11).astype("float32")
res = f(val) res = f(val)
utt.assert_allclose(res, val.flatten()) utt.assert_allclose(res, val.flatten())
assert res.shape == val.flatten().shape assert res.shape == val.flatten().shape
...@@ -58,3 +59,48 @@ def test_flatten(): ...@@ -58,3 +59,48 @@ def test_flatten():
assert res.shape == val.reshape(10, -1).shape assert res.shape == val.reshape(10, -1).shape
assert GpuReshape in [type(node.op) assert GpuReshape in [type(node.op)
for node in f.maker.fgraph.toposort()] for node in f.maker.fgraph.toposort()]
def test_sum_prod():
    """Check that a full matrix reduction is moved to a GpuCAReduce node
    and computes the same value as numpy.
    """
    for method in ['sum']:
        matrix = theano.tensor.fmatrix()
        fn = theano.function([matrix], getattr(matrix, method)(),
                             mode=mode_with_gpu)
        data = numpy.random.rand(10, 11).astype("float32")
        result = fn(data)
        utt.assert_allclose(result, data.sum())
        assert result.shape == ()
        node_types = [type(node.op) for node in fn.maker.fgraph.toposort()]
        assert GpuCAReduce in node_types
def test_local_gpualloc_memset_0():
    """The memset_0 rewrite must fire only for a constant scalar-zero
    fill value, never for ones or for non-scalar values.
    """
    i = theano.tensor.iscalar()
    zero = numpy.zeros((1,), dtype='float32')
    one = numpy.ones((1,), dtype='float32')
    two_ones = numpy.ones((2,), dtype='float32')

    def check(value, expect_memset, length, expected):
        # Compile alloc(value, i), then inspect the single GpuAlloc node.
        f = theano.function([i], gpu_alloc(value, i), mode=mode_with_gpu)
        topo = f.maker.fgraph.toposort()
        assert len(topo) == 1
        assert isinstance(topo[0].op, GpuAlloc)
        assert bool(topo[0].op.memset_0) == expect_memset
        assert (numpy.asarray(f(length)) == expected).all()

    # Scalar 0: rewritten to the memset variant.
    check(zero, True, 6, 0)
    # Scalar 1: left untouched.
    check(one, False, 6, 1)
    # Non-scalar ones: left untouched.
    check(two_ones, False, 2, 1)
...@@ -278,6 +278,13 @@ theano.compile.register_view_op_c_code(GpuArrayType, """ ...@@ -278,6 +278,13 @@ theano.compile.register_view_op_c_code(GpuArrayType, """
Py_XINCREF(%(oname)s); Py_XINCREF(%(oname)s);
""", version=(0,)) """, version=(0,))
theano.compile.register_shape_i_c_code(GpuArrayType, """
if(!%(oname)s)
%(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
((npy_int64*)PyArray_DATA(%(oname)s))[0] =
%(iname)s->ga.dimensions[%(i)s];
""", version=(0,))
theano.compile.register_deep_copy_op_c_code(GpuArrayType, """ theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
Py_XDECREF(%(oname)s); Py_XDECREF(%(oname)s);
%(oname)s = pygpu_copy(%(iname)s, GA_ANY_ORDER); %(oname)s = pygpu_copy(%(iname)s, GA_ANY_ORDER);
......
...@@ -1589,7 +1589,7 @@ class Dot22(GemmRelated): ...@@ -1589,7 +1589,7 @@ class Dot22(GemmRelated):
raise raise
def __str__(self): def __str__(self):
return "_dot22" return self.__class__.__name__
setup_z_Nz_Sz = """ setup_z_Nz_Sz = """
if ((NULL == %(_zout)s) if ((NULL == %(_zout)s)
...@@ -1862,7 +1862,7 @@ class Dot22Scalar(GemmRelated): ...@@ -1862,7 +1862,7 @@ class Dot22Scalar(GemmRelated):
raise raise
def __str__(self): def __str__(self):
return "_dot22scalar" return self.__class__.__name__
setup_z_Nz_Sz = Dot22.setup_z_Nz_Sz setup_z_Nz_Sz = Dot22.setup_z_Nz_Sz
......
...@@ -20,8 +20,8 @@ def make_declare(loop_orders, dtypes, sub): ...@@ -20,8 +20,8 @@ def make_declare(loop_orders, dtypes, sub):
# the stride in that dimension, # the stride in that dimension,
# and the jump from an iteration to the next # and the jump from an iteration to the next
decl += """ decl += """
int %(var)s_n%(value)i; npy_intp %(var)s_n%(value)i;
int %(var)s_stride%(value)i; ssize_t %(var)s_stride%(value)i;
int %(var)s_jump%(value)i_%(j)i; int %(var)s_jump%(value)i_%(j)i;
""" % locals() """ % locals()
else: else:
......
...@@ -29,6 +29,7 @@ from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice, ...@@ -29,6 +29,7 @@ from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice,
from theano import scalar from theano import scalar
from theano.tensor import basic as T from theano.tensor import basic as T
from theano import compile # to register the optimizer built by this file from theano import compile # to register the optimizer built by this file
from theano.compile.ops import Shape_i
from theano.gof.python25 import any, all from theano.gof.python25 import any, all
from theano.gof.opt import (Optimizer, pre_constant_merge, from theano.gof.opt import (Optimizer, pre_constant_merge,
...@@ -637,78 +638,6 @@ T.pprint.assign(lambda pstate, r: r.owner and isinstance( ...@@ -637,78 +638,6 @@ T.pprint.assign(lambda pstate, r: r.owner and isinstance(
r.owner.op, MakeVector), MakeVectorPrinter()) r.owner.op, MakeVector), MakeVectorPrinter())
class Shape_i(T.Op):
    """
    L{Op} to return the shape of a matrix.

    Returns, as an int64 scalar, the size of dimension ``i`` of the input.

    @note: Non-differentiable.
    """

    def __init__(self, i):
        # i: index of the dimension whose size is returned.
        self.i = i

    def __hash__(self):
        return hash(type(self)) ^ self.i

    def __eq__(self, other):
        return type(self) == type(other) and self.i == other.i

    def __str__(self):
        return '%s{%i}' % (self.__class__.__name__, self.i)

    def make_node(self, x):
        # x could be one of a number of types
        # the only thing we require is that the variable have a .ndim,
        # and that the value have a .shape
        if not isinstance(x, T.Variable):
            raise TypeError('x must be Variable with ndim attribute', x)
        if x.ndim <= self.i:
            raise TypeError('x has too few dimensions for Shape_i',
                            (x, self.i))
        return T.Apply(self, [x], [T.lscalar()])

    def perform(self, node, inp, out_):
        x, = inp
        out, = out_
        if out[0] is None:
            out[0] = theano._asarray(x.shape[self.i], dtype='int64')
        else:
            # Reuse the previously allocated 0-d int64 output array.
            out[0][...] = x.shape[self.i]

    def c_code_cache_version(self):
        return (0, 1)

    def c_code(self, node, name, inp, out_, sub):
        # C code is hard-coded per input type here (no registry): plain
        # TensorType and CudaNdarrayType are supported, anything else
        # falls back to the default (python) implementation.
        x, = inp
        out, = out_
        i = self.i
        if isinstance(node.inputs[0].type, T.TensorType):
            return """
            if(!%(out)s)
                %(out)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
            ((npy_int64*)PyArray_DATA(%(out)s))[0]=PyArray_DIMS(%(x)s)[%(i)s];
            """ % locals()
        elif node.inputs[0].type.__class__.__name__ == "CudaNdarrayType":
            #Don't want to import cuda stuff here.
            return """
            if(!%(out)s)
                %(out)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
            ((npy_int64*)PyArray_DATA(%(out)s))[0]=
                                CudaNdarray_HOST_DIMS(%(x)s)[%(i)s];
            """ % locals()
        else:
            #TODO: if your type is not listed here, make a damn registry of
            #      shape_i ops for various types of variables.
            #      Do not continue this madness.
            return super(Shape_i, self).c_code(node, name, (x,), (out,), sub)

    def infer_shape(self, node, input_shapes):
        # Output is a 0-d scalar, so its shape is the empty tuple.
        return [()]

    def grad(self, inp, grads):
        # Non-differentiable: shapes are integer-valued.
        return [None]
class ShapeFeature(object): class ShapeFeature(object):
"""Graph optimizer for removing all calls to shape() """Graph optimizer for removing all calls to shape()
......
...@@ -611,6 +611,16 @@ theano.compile.register_view_op_c_code( ...@@ -611,6 +611,16 @@ theano.compile.register_view_op_c_code(
""", """,
version=1) version=1)
# Register TensorType C code for ViewOp.
theano.compile.register_shape_i_c_code(
TensorType,
"""
if(!%(oname)s)
%(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
((npy_int64*)PyArray_DATA(%(oname)s))[0]=PyArray_DIMS(%(iname)s)[%(i)s];
""",
version=1)
# Register TensorType C code for DeepCopyOp # Register TensorType C code for DeepCopyOp
theano.compile.register_deep_copy_op_c_code( theano.compile.register_deep_copy_op_c_code(
TensorType, TensorType,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论