Commit 7cc1d895 authored by Frederic

New optimization that will use cudaMemset for GpuAlloc when we init memory to 0.

上级 c8934e50
......@@ -2255,20 +2255,23 @@ gpu_join = GpuJoin()
class GpuAlloc(GpuOp):
"""Implement Alloc on the gpu.
The memset_0 param is an optimization: when True, we call
cudaMemset, which is faster.
"""
Implement Alloc on the gpu.
"""
def __init__(self):
pass
def __init__(self, memset_0=False):
self.memset_0 = memset_0
def __eq__(self, other):
return type(self) == type(other)
return type(self) == type(other) and self.memset_0 == other.memset_0
def __hash__(self):
return hash(type(self))
return hash(type(self)) ^ hash(self.memset_0)
def __str__(self):
return self.__class__.__name__
return "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
def make_node(self, value, *shape):
# if there is an unneeded transfer generated by the next line
......@@ -2307,6 +2310,7 @@ class GpuAlloc(GpuOp):
value = inputs[0]
shps = inputs[1:]
nd = len(shps)
memset_0 = int(self.memset_0)
str = "int dims[%(nd)s];\n" % locals()
for idx, sh in enumerate(shps):
str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
......@@ -2330,7 +2334,21 @@ class GpuAlloc(GpuOp):
%(fail)s;
}
}
if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true))
if (%(memset_0)s)
{
if (cudaSuccess != cudaMemset(%(out)s->devdata, 0,
CudaNdarray_SIZE(%(out)s) * 4))
{
PyErr_Format(PyExc_MemoryError,
"GpuAlloc: Error memsetting %%d"
" bytes of device memory.",
CudaNdarray_SIZE(%(out)s) * 4);
Py_XDECREF(%(out)s);
%(out)s = NULL;
%(fail)s;
}
}
else if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true))
{
// exception already set
Py_XDECREF(%(out)s);
......@@ -2348,7 +2366,7 @@ class GpuAlloc(GpuOp):
return [None for i in inputs]
def c_code_cache_version(self):
return (4,)
return (5,)
def do_constant_folding(self, node):
for client in node.outputs[0].clients:
......
......@@ -33,6 +33,7 @@ from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax, GpuSoftmaxWithBias)
from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.scan_module import scan_utils, scan_op
from theano.tensor.blas import _is_real_vector, _is_real_matrix
......@@ -1337,6 +1338,18 @@ def local_gpualloc(node):
return [new_out]
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpualloc_memset_0(node):
    """Rewrite a ``GpuAlloc`` that fills with the scalar constant 0 into a
    ``GpuAlloc(memset_0=True)``, which uses the faster cudaMemset path.

    Returns a one-element list with the replacement output, or ``None``
    (implicitly) when the pattern does not apply.
    """
    if isinstance(node.op, GpuAlloc) and not node.op.memset_0:
        inp = node.inputs[0]
        # Check the size *before* comparing to 0: comparing a
        # multi-element array to 0 yields a boolean array, whose truth
        # value inside ``and`` would raise ValueError.
        if (isinstance(inp, CudaNdarrayConstant) and
                inp.data.size == 1 and
                numpy.asarray(inp.data) == 0):
            new_out = GpuAlloc(memset_0=True)(*node.inputs)
            return [new_out]
def safe_to_gpu(x):
if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'):
......
......@@ -775,11 +775,11 @@ def test_gpujoin_gpualloc():
assert sum([node.op == T.alloc for node in f.maker.fgraph.toposort()]) == 2
assert sum([node.op == T.join for node in f.maker.fgraph.toposort()]) == 1
assert sum([node.op == B.gpu_alloc
assert sum([isinstance(node.op, B.GpuAlloc)
for node in f_gpu.maker.fgraph.toposort()]) == 2
assert sum([node.op == B.gpu_join
for node in f_gpu.maker.fgraph.toposort()]) == 1
assert sum([node.op == B.gpu_alloc
assert sum([isinstance(node.op, B.GpuAlloc)
for node in f_gpu2.maker.fgraph.toposort()]) == 2
assert sum([node.op == B.gpu_join
for node in f_gpu2.maker.fgraph.toposort()]) == 1
......
......@@ -70,6 +70,17 @@ def test_gpualloc():
assert numpy.any(ininstance(x.op, cuda.GpuAlloc) for x in l )
def test_alloc_memset_0():
    """Allocating from a constant zero should compile to a single
    GpuAlloc node with memset_0 enabled, and the output must be all zeros.
    """
    length = tensor.iscalar()
    zero = numpy.zeros((1,), dtype='float32')
    alloc_out = basic_ops.gpu_alloc(
        cuda.gpu_from_host(tensor.constant(zero)), length)
    fn = theano.function([length], alloc_out, mode=mode_with_gpu)
    nodes = fn.maker.fgraph.toposort()
    assert len(nodes) == 1
    op = nodes[0].op
    assert isinstance(op, basic_ops.GpuAlloc)
    assert op.memset_0
    assert (numpy.asarray(fn(6)) == 0).all()
def test_gpuspecifyshape():
x = cuda.shared_constructor(numpy.ones(3,dtype='float32'), 'x')
m = theano.tensor.specify_shape(x + numpy.float32(1), (3,))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论