Commit 0b70cc9e, authored by Frederic

Implement GpuAlloc(memset_0=True) as in the old back-end.

上级 26941db0
...@@ -447,8 +447,26 @@ cuda_from_gpu = CudaFromGpu() ...@@ -447,8 +447,26 @@ cuda_from_gpu = CudaFromGpu()
class GpuAlloc(HideC, Alloc): class GpuAlloc(HideC, Alloc):
def __init__(self, memset_0=False):
    """memset_0 is only an optimization hint: when True, the fill
    value is known to always be 0, so the generated C code calls
    memset, which is faster than the generic element copy.
    """
    self.memset_0 = memset_0
def __eq__(self, other):
    """Two ops are equal when they are the exact same class and carry
    the same memset_0 flag."""
    if type(self) != type(other):
        return False
    return self.memset_0 == other.memset_0
def __hash__(self):
    """Hash consistent with __eq__: mix the class identity with the
    memset_0 flag."""
    type_part = hash(type(self))
    flag_part = hash(self.memset_0)
    return type_part ^ flag_part
def __str__(self): def __str__(self):
return 'GpuAlloc' #Hide the memset parameter when not used to prevent confusion.
if self.memset_0:
s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
else:
s = self.__class__.__name__
return s
def make_node(self, value, *shape): def make_node(self, value, *shape):
res = Alloc.make_node(self, value, *shape) res = Alloc.make_node(self, value, *shape)
...@@ -457,6 +475,9 @@ class GpuAlloc(HideC, Alloc): ...@@ -457,6 +475,9 @@ class GpuAlloc(HideC, Alloc):
broadcastable=res.outputs[0].broadcastable) broadcastable=res.outputs[0].broadcastable)
return Apply(self, [value] + res.inputs[1:], [otype()]) return Apply(self, [value] + res.inputs[1:], [otype()])
def c_headers(self):
    """Headers required by the C code this op generates."""
    headers = ['<compyte/numpy_compat.h>']
    return headers
def perform(self, node, inputs, outs): def perform(self, node, inputs, outs):
out, = outs out, = outs
v = inputs[0] v = inputs[0]
...@@ -477,6 +498,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -477,6 +498,7 @@ class GpuAlloc(HideC, Alloc):
ndim = len(inp[1:]) ndim = len(inp[1:])
zz, = out zz, = out
memset_0 = int(self.memset_0)
code = """ code = """
int i; int i;
size_t %(name)s_shape[%(ndim)s]; size_t %(name)s_shape[%(ndim)s];
...@@ -503,12 +525,24 @@ class GpuAlloc(HideC, Alloc): ...@@ -503,12 +525,24 @@ class GpuAlloc(HideC, Alloc):
%(fail)s %(fail)s
} }
} }
if (%(memset_0)s && GpuArray_ISONESEGMENT(&%(zz)s->ga))
if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) != GA_NO_ERROR) { {
int err = GpuArray_memset(&%(zz)s->ga, 0);
if (err != GA_NO_ERROR)
{
PyErr_Format(PyExc_MemoryError,
"GpuAlloc: Error memsetting %%d"
" element of device memory to 0.",
PyGpuArray_SIZE(%(zz)s));
%(fail)s;
}
}
else if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) != GA_NO_ERROR) {
PyErr_SetString(PyExc_ValueError, "setarray failed"); PyErr_SetString(PyExc_ValueError, "setarray failed");
%(fail)s %(fail)s
} }
""" % dict(name=name, ndim=ndim, zz=zz, vv=vv, fail=sub['fail']) """ % dict(name=name, ndim=ndim, zz=zz, vv=vv,
fail=sub['fail'], memset_0=memset_0)
if config.gpuarray.sync: if config.gpuarray.sync:
code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz) code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
......
...@@ -13,12 +13,15 @@ from theano.sandbox.gpuarray.type import GpuArrayType ...@@ -13,12 +13,15 @@ from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
gpu_from_host, gpu_from_host,
gpu_alloc, GpuReshape, gpu_alloc,
GpuAlloc,
GpuReshape,
GpuEye) GpuEye)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce) GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm from theano.sandbox.gpuarray.type import GpuArrayConstant
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
...@@ -126,6 +129,18 @@ def local_gpualloc(node): ...@@ -126,6 +129,18 @@ def local_gpualloc(node):
return gpu_alloc return gpu_alloc
@register_opt()
@local_optimizer([GpuAlloc])
def local_gpualloc_memset_0(node):
    """Rewrite GpuAlloc of a constant scalar 0 into the faster
    GpuAlloc(memset_0=True) variant.

    Leaves the node alone (returns None) unless the fill value is a
    GpuArrayConstant with a single element equal to 0.
    """
    if not isinstance(node.op, GpuAlloc) or node.op.memset_0:
        return
    fill = node.inputs[0]
    if not isinstance(fill, GpuArrayConstant):
        return
    if fill.data.size != 1:
        return
    if not (numpy.asarray(fill.data) == 0).all():
        return
    return [GpuAlloc(memset_0=True)(*node.inputs)]
@register_opt() @register_opt()
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
def local_gpureshape(node): def local_gpureshape(node):
......
...@@ -2,7 +2,8 @@ import numpy ...@@ -2,7 +2,8 @@ import numpy
import theano import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.sandbox.gpuarray.basic_ops import GpuReshape from theano.sandbox.gpuarray.basic_ops import GpuAlloc, GpuReshape, gpu_alloc
from theano.sandbox.gpuarray.elemwise import GpuCAReduce
import theano.sandbox.gpuarray import theano.sandbox.gpuarray
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
...@@ -70,3 +71,36 @@ def test_sum_prod(): ...@@ -70,3 +71,36 @@ def test_sum_prod():
assert res.shape == () assert res.shape == ()
assert GpuCAReduce in [type(node.op) assert GpuCAReduce in [type(node.op)
for node in f.maker.fgraph.toposort()] for node in f.maker.fgraph.toposort()]
def test_local_gpualloc_memset_0():
    """Check that allocating a constant 0 is rewritten to the memset
    variant of GpuAlloc, while non-zero fills are left untouched."""
    i = theano.tensor.iscalar()

    def build_and_check(fill_value, expect_memset):
        # Compile alloc(fill_value, i), verify the op the optimizer
        # selected, and hand back the compiled function.
        out = gpu_alloc(fill_value, i)
        f = theano.function([i], out, mode=mode_with_gpu)
        nodes = f.maker.fgraph.toposort()
        assert len(nodes) == 1
        assert isinstance(nodes[0].op, GpuAlloc)
        assert bool(nodes[0].op.memset_0) == expect_memset
        return f

    # A constant scalar 0 triggers the memset path.
    f = build_and_check(numpy.zeros((1,), dtype='float32'), True)
    assert (numpy.asarray(f(6)) == 0).all()
    # A constant 1 must not be rewritten.
    f = build_and_check(numpy.ones((1,), dtype='float32'), False)
    assert (numpy.asarray(f(6)) == 1).all()
    # Nor a vector of ones.
    f = build_and_check(numpy.ones((2,), dtype='float32'), False)
    assert (numpy.asarray(f(2)) == 1).all()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论