Merge pull request #942 from nouiz/gpu_alloc2

Gpu alloc2

Merge pull request #942 from nouiz/gpu_alloc2
18708214 · lamblin · c8934e50 · a3987a12 · 18708214 · 18708214
--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -164,7 +164,7 @@ if compile_cuda_ndarray and cuda_available:
                            code,
                            location=cuda_ndarray_loc,
                            include_dirs=[cuda_path], libs=['cublas'],
-                            preargs=compiler.compile_args())
+                            preargs=['-O3'] + compiler.compile_args())
                    from cuda_ndarray.cuda_ndarray import *
            except Exception, e:
                _logger.error("Failed to compile cuda_ndarray.cu: %s", str(e))

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2255,20 +2255,28 @@ gpu_join = GpuJoin()
 class GpuAlloc(GpuOp):
+    """Implement Alloc on the gpu.
+    The memset_0 param is an optimization. When True, we call
+    cudaMalloc that is faster.
    """
-    Implement Alloc on the gpu.
+    def __init__(self, memset_0=False):
-    """
+        self.memset_0 = memset_0
-    def __init__(self):
-        pass
    def __eq__(self, other):
-        return type(self) == type(other)
+        return type(self) == type(other) and self.memset_0 == other.memset_0
    def __hash__(self):
-        return hash(type(self))
+        return hash(type(self)) ^ hash(self.memset_0)
    def __str__(self):
-        return self.__class__.__name__
+        #Hide the memset parameter when not used to prevent confusion.
+        if self.memset_0:
+            s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
+        else:
+            s = self.__class__.__name__
+        return s
    def make_node(self, value, *shape):
        #if their is unneeded transfert generated by the next line
@@ -2307,6 +2315,7 @@ class GpuAlloc(GpuOp):
        value = inputs[0]
        shps = inputs[1:]
        nd = len(shps)
+        memset_0 = int(self.memset_0)
        str = "int dims[%(nd)s];\n" % locals()
        for idx, sh in enumerate(shps):
            str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
@@ -2330,7 +2339,21 @@ class GpuAlloc(GpuOp):
                %(fail)s;
            }
        }
-        if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true))
+        if (%(memset_0)s)
+        {
+            if (cudaSuccess != cudaMemset(%(out)s->devdata, 0,
+                                          CudaNdarray_SIZE(%(out)s) * 4))
+            {
+                PyErr_Format(PyExc_MemoryError,
+                             "GpuAlloc: Error memsetting %%d"
+                             " bytes of device memory.",
+                             CudaNdarray_SIZE(%(out)s) * 4);
+                Py_XDECREF(%(out)s);
+                %(out)s = NULL;
+                %(fail)s;
+            }
+        }
+        else if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(value)s, true))
        {
            // exception already set
            Py_XDECREF(%(out)s);
@@ -2348,7 +2371,7 @@ class GpuAlloc(GpuOp):
        return [None for i in inputs]
    def c_code_cache_version(self):
-        return (4,)
+        return (5,)
    def do_constant_folding(self, node):
        for client in node.outputs[0].clients:

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -3133,24 +3133,6 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
    return 0;
 }
-bool
-CudaNdarray_is_c_contiguous(const CudaNdarray * self)
-{
-    bool c_contiguous = true;
-    int size = 1;
-    for (int i = self->nd-1; (i >= 0) && c_contiguous; --i)
-    {
-        if (CudaNdarray_HOST_DIMS(self)[i] == 1)
-            continue;
-        if (CudaNdarray_HOST_STRIDES(self)[i] != size)
-        {
-            c_contiguous = false;
-        }
-        size = size * CudaNdarray_HOST_DIMS(self)[i];
-    }
-    return c_contiguous;
-}
 PyObject *
 CudaNdarray_new_nd(int nd)
 {
@@ -4346,12 +4328,6 @@ CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
    return self->host_structure + 2*self->nd;
 }
-void
-cnda_mark_dev_structure_dirty(CudaNdarray * self)
-{
-    self->dev_structure_fresh = 0;
-}
 int
 CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
 {
@@ -4406,43 +4382,33 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
    return CudaNdarray_EqualAndIgnore(cnda1, cnda2, 0, 0);
 }
-void
+int
-CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
+cnda_copy_structure_to_device(const CudaNdarray * self)
 {
-    if ((idx >= self->nd) || (idx < 0) || (d < 0))
+    //If the device structure do not exists, create it.
+    //We allocate it here as we do not need it often.
+    //In fact, we need it so infrequently that we expect
+    //that most object won't need it. Not allocating it
+    //save a significant when creating object.
+    //This speed up a benchmark by 8% with the gc.
+    if (!self->dev_structure)
+    {
+        int struct_size = cnda_structure_size(self->nd);
+        if (struct_size)
        {
-        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: %i %i\n", idx, d);
+            self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
-    }
+            if (NULL == self->dev_structure)
-    if (d != self->host_structure[idx])
            {
-        self->host_structure[idx] = d;
+                return -1;
-        int log2d = (int)log2((double)d);
-        self->host_structure[idx + 2*self->nd] = (d == (1 << log2d)) ? log2d : -1;
-        cnda_mark_dev_structure_dirty(self);
            }
-}
-void
-CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
-{
-    if ((idx >= self->nd) || (idx < 0))
-    {
-        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
        }
-    if (s != CudaNdarray_HOST_STRIDES(self)[idx])
-    {
-        self->host_structure[idx+self->nd] = s;
-        cnda_mark_dev_structure_dirty(self);
    }
-}
+    cublasSetVector(cnda_structure_size(self->nd),
+                    sizeof(int),
+                    self->host_structure,
-int
+                    1,
-cnda_copy_structure_to_device(const CudaNdarray * self)
+                    self->dev_structure,
-{
+                    1);
-    cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
    CNDA_THREAD_SYNC;
    if (CUBLAS_STATUS_SUCCESS != cublasGetError())
    {
@@ -4510,56 +4476,6 @@ CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
    return PyInt_FromLong(CudaNdarray_SIZE(self));
 }
-int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
-{
-    if (nd != self->nd)
-    {
-        if (self->dev_structure)
-        {
-            if (device_free(self->dev_structure))
-            {
-                return -1;
-            }
-            self->dev_structure = NULL;
-        }
-        if (self->host_structure)
-        {
-            free(self->host_structure);
-            self->host_structure = NULL;
-            self->nd = -1;
-        }
-        if (nd == -1) return 0;
-        self->host_structure = (int*)malloc(cnda_structure_size(nd)*sizeof(int));
-        if (NULL == self->host_structure)
-        {
-            PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
-            return -1;
-        }
-        //initialize all dimensions and strides to 0
-        for (int i = 0; i < cnda_structure_size(nd); ++i)
-        {
-            self->host_structure[i] = 0;
-        }
-        int struct_size = cnda_structure_size(nd);
-        if (struct_size)
-        {
-            self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
-            if (NULL == self->dev_structure)
-            {
-                free(self->host_structure);
-                self->host_structure = NULL;
-                self->dev_structure = NULL;
-                return -1;
-            }
-        }
-        self->nd = nd;
-        self->dev_structure_fresh = 0;
-    }
-    return 0;
-}
 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base)
 {
    return CudaNdarray_set_device_data(self, data, (PyObject *) base);

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -82,8 +82,9 @@ struct CudaNdarray
    //device pointers (allocated by cudaMalloc)
    mutable int dev_structure_fresh;
-    //dev_structure should be accessed via macros, otherwise may not be synchronized
+    //dev_structure should be accessed via macros, otherwise may not be
-    int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
+    //synchronized. The macro will allocate it when needed.
+    mutable int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
    real* devdata; //pointer to data element [0,..,0].
 };
@@ -126,8 +127,12 @@ CudaNdarray_HOST_STRIDES(const CudaNdarray * self);
 DllExport const int *
 CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self);
-DllExport void
+DllExport inline void __attribute__((always_inline))
-cnda_mark_dev_structure_dirty(CudaNdarray * self);
+cnda_mark_dev_structure_dirty(CudaNdarray * self)
+{
+    self->dev_structure_fresh = 0;
+}
 DllExport int
 CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase);
@@ -143,11 +148,38 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
 *
 *  Does not sync structure to host.
 */
-DllExport void
+DllExport inline void __attribute__((always_inline))
-CudaNdarray_set_dim(CudaNdarray * self, int idx, int d);
+CudaNdarray_set_dim(CudaNdarray * self, int idx, int d) 
+{
+    if ((idx >= self->nd) || (idx < 0) || (d < 0))
+    {
+        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: %i %i\n", idx, d);
+    }
+    if (d != self->host_structure[idx])
+    {
+        self->host_structure[idx] = d;
+        int log2d = (int)log2((double)d);
+        self->host_structure[idx + 2*self->nd] = (d == (1 << log2d)) ? log2d : -1;
+        cnda_mark_dev_structure_dirty(self);
+    }
+}
-DllExport void
+DllExport inline void __attribute__((always_inline))
-CudaNdarray_set_stride(CudaNdarray * self, int idx, int s);
+CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
+{
+    if ((idx >= self->nd) || (idx < 0))
+    {
+        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
+    }
+    if (s != CudaNdarray_HOST_STRIDES(self)[idx])
+    {
+        self->host_structure[idx+self->nd] = s;
+        cnda_mark_dev_structure_dirty(self);
+    }
+}
 /***
 *  Update dependent variables from the contents of CudaNdarray_HOST_DIMS(self) and CudaNdarray_HOST_STRIDES(self)
@@ -188,7 +220,46 @@ DllExport PyObject * CudaNdarray_new_nd(const int nd);
 *
 * Note: This does not allocate storage for data.
 */
-DllExport int CudaNdarray_set_nd(CudaNdarray * self, const int nd);
+DllExport inline int __attribute__((always_inline))
+CudaNdarray_set_nd(CudaNdarray * self, const int nd)
+{
+    if (nd != self->nd)
+    {
+        if (self->dev_structure)
+        {
+            if (device_free(self->dev_structure))
+            {
+                return -1;
+            }
+            self->dev_structure = NULL;
+        }
+        if (self->host_structure)
+        {
+            free(self->host_structure);
+            self->host_structure = NULL;
+            self->nd = -1;
+        }
+        if (nd == -1) return 0;
+        self->host_structure = (int*)malloc(cnda_structure_size(nd)*sizeof(int));
+        if (NULL == self->host_structure)
+        {
+            PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
+            return -1;
+        }
+        //initialize all dimensions and strides to 0
+        for (int i = 0; i < cnda_structure_size(nd); ++i)
+        {
+            self->host_structure[i] = 0;
+        }
+        //The device structure will be created in cnda_copy_structure_to_device
+        //if needed.
+        self->nd = nd;
+        self->dev_structure_fresh = 0;
+    }
+    return 0;
+}
 /**
 * CudaNdarray_alloc_contiguous
@@ -218,7 +289,7 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
        size = size * dim[i];
    }
-    if (CudaNdarray_is_c_contiguous(self) && (self->data_allocated == size))
+    if ((self->data_allocated == size) && CudaNdarray_is_c_contiguous(self))
    {
        return 0;
    }
@@ -333,7 +404,24 @@ CudaNdarray_ZEROS(int n, int * dims);
 /**
 * True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
 */
-DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);
+DllExport inline bool  __attribute__((always_inline))
+CudaNdarray_is_c_contiguous(const CudaNdarray * self)
+{
+    bool c_contiguous = true;
+    int size = 1;
+    for (int i = self->nd-1; (i >= 0) && c_contiguous; --i)
+    {
+        if (CudaNdarray_HOST_DIMS(self)[i] == 1)
+            continue;
+        if (CudaNdarray_HOST_STRIDES(self)[i] != size)
+        {
+            c_contiguous = false;
+        }
+        size = size * CudaNdarray_HOST_DIMS(self)[i];
+    }
+    return c_contiguous;
+}
 DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
 DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -33,6 +33,7 @@ from theano.sandbox.cuda.nnet import (
        GpuCrossentropySoftmax1HotWithBiasDx,
        GpuSoftmax, GpuSoftmaxWithBias)
 from theano.sandbox.cuda.elemwise import SupportCodeError
+from theano.sandbox.cuda.var import CudaNdarrayConstant
 from theano.scan_module import scan_utils, scan_op
 from theano.tensor.blas import _is_real_vector, _is_real_matrix
@@ -1337,6 +1338,19 @@ def local_gpualloc(node):
        return [new_out]
+@register_opt()
+@local_optimizer([tensor.Alloc])
+def local_gpualloc_memset_0(node):
+    replace = False
+    if isinstance(node.op, GpuAlloc) and not node.op.memset_0:
+        inp = node.inputs[0]
+        if (isinstance(inp, CudaNdarrayConstant) and
+            inp.data.size == 1 and
+            (numpy.asarray(inp.data) == 0).all()):
+            new_out = GpuAlloc(memset_0=True)(*node.inputs)
+            return [new_out]
 def safe_to_gpu(x):
    if (isinstance(x.type, tensor.TensorType) and
        x.type.dtype == 'float32'):

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -775,11 +775,11 @@ def test_gpujoin_gpualloc():
    assert sum([node.op == T.alloc for node in f.maker.fgraph.toposort()]) == 2
    assert sum([node.op == T.join for node in f.maker.fgraph.toposort()]) == 1
-    assert sum([node.op == B.gpu_alloc
+    assert sum([isinstance(node.op, B.GpuAlloc)
                for node in f_gpu.maker.fgraph.toposort()]) == 2
    assert sum([node.op == B.gpu_join
                for node in f_gpu.maker.fgraph.toposort()]) == 1
-    assert sum([node.op == B.gpu_alloc
+    assert sum([isinstance(node.op, B.GpuAlloc)
                for node in f_gpu2.maker.fgraph.toposort()]) == 2
    assert sum([node.op == B.gpu_join
                for node in f_gpu2.maker.fgraph.toposort()]) == 1

--- a/theano/sandbox/cuda/tests/test_memory.py
+++ b/theano/sandbox/cuda/tests/test_memory.py
@@ -62,7 +62,7 @@ def test_memory():
    # When dtype is float64, only the shared is on the gpu and it is transferd
    # to the cpu for computation. So no extra alloc after compilation.
    # more_alloc1 if after the first compilation, more_alloc2 after the second.
-    for dtype, more_alloc1, more_alloc2 in [("float32", 2, 9),
+    for dtype, more_alloc1, more_alloc2 in [("float32", 1, 4),
                                            ("float64", 0, 0)]:
        print dtype
        test_params = np.asarray(np.random.randn(np.prod(shapes)), dtype)
@@ -129,7 +129,7 @@ def test_memory_lazy():
    # When dtype is float64, only the shared is on the gpu and it is transferd
    # to the cpu for computation. So no extra alloc after compilation.
    # more_alloc1 if after the first compilation, more_alloc2 after the second.
-    for dtype, more_alloc1 in [("float32", 3),
+    for dtype, more_alloc1 in [("float32", 2),
                               ("float64", 0)]:
        print dtype
        test_params = np.asarray(np.random.randn(np.prod(shapes)), dtype)

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
@@ -70,6 +70,39 @@ def test_gpualloc():
    assert numpy.any(ininstance(x.op, cuda.GpuAlloc) for x in l )
+def test_alloc_memset_0():
+    i = tensor.iscalar()
+    z = numpy.zeros((1,), dtype='float32')
+    o = numpy.ones((1,), dtype='float32')
+    ones = numpy.ones((2,), dtype='float32')
+    # Test with 0
+    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(z)), i)
+    f = theano.function([i], a, mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert len(topo) == 1
+    assert isinstance(topo[0].op, basic_ops.GpuAlloc) and topo[0].op.memset_0
+    assert (numpy.asarray(f(6)) == 0).all()
+    # Test with 1
+    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(o)), i)
+    f = theano.function([i], a, mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert len(topo) == 1
+    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
+    assert not topo[0].op.memset_0
+    assert (numpy.asarray(f(6)) == 1).all()
+    # Test with 1, 1
+    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(ones)), i)
+    f = theano.function([i], a, mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert len(topo) == 1
+    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
+    assert not topo[0].op.memset_0
+    assert (numpy.asarray(f(2)) == 1).all()
 def test_gpuspecifyshape():
    x = cuda.shared_constructor(numpy.ones(3,dtype='float32'), 'x')
    m = theano.tensor.specify_shape(x + numpy.float32(1), (3,))

--- a/theano/tensor/tests/test_merge.py
+++ b/theano/tensor/tests/test_merge.py
@@ -62,8 +62,8 @@ def test_merge_with_weird_eq():
    g = Env([x, y], [x+y])
    MergeOptimizer().optimize(g)
-    assert len(g.nodes) == 1
+    assert len(g.apply_nodes) == 1
-    node = list(g.nodes)[0]
+    node = list(g.apply_nodes)[0]
    assert len(node.inputs) == 2
    assert node.inputs[0] is node.inputs[1]
@@ -74,8 +74,7 @@ def test_merge_with_weird_eq():
    g = Env([x, y], [x+y])
    MergeOptimizer().optimize(g)
-    assert len(g.nodes) == 1
+    assert len(g.apply_nodes) == 1
-    node = list(g.nodes)[0]
+    node = list(g.apply_nodes)[0]
    assert len(node.inputs) == 2
    assert node.inputs[0] is node.inputs[1]