Make CudaNdarray_inplace_add and CudaNdarray_add work with tensors of 0 elements, and test them.

e4e88af1 · Frederic Bastien · 6664a048 · e4e88af1 · e4e88af1
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -786,6 +786,10 @@ CudaNdarray_add(PyObject* py_self, PyObject * py_other)
        return NULL;
    }
+    if(CudaNdarray_SIZE((CudaNdarray *)py_self)==0 && CudaNdarray_SIZE((CudaNdarray *)py_other)==0){
+      return (PyObject *) rval;
+    }
    int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
    int n_blocks = std::min(ceil_intdiv(size,(unsigned int)threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
    kAdd_contiguous<<<n_blocks,threads_per_block>>>(
@@ -874,6 +878,11 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
        size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
    }
+    if(CudaNdarray_SIZE((CudaNdarray *)py_self)==0 && CudaNdarray_SIZE((CudaNdarray *)py_other)==0){
+      Py_INCREF(py_self);
+      return py_self;
+    }
    switch(self->nd)
    {
        case 1:

--- a/theano/sandbox/cuda/tests/test_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_cuda_ndarray.py
@@ -16,7 +16,7 @@ def test_host_to_device():
        assert numpy.all(a == c)
 def test_add():
-    for shape in ((), (3,), (2,3), (1,10000000),(10,1000000), (100,100000),(1000,10000),(10000,1000)):
+    for shape in ((), (0,), (3,), (2,3), (1,10000000),(10,1000000), (100,100000),(1000,10000),(10000,1000)):
        a0 = theano._asarray(numpy.random.rand(*shape), dtype='float32')
        a1 = a0.copy()
        b0 = cuda_ndarray.CudaNdarray(a0)
@@ -34,6 +34,13 @@ def test_add():
        print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', cpu_dt / gpu_dt
        assert numpy.allclose(asum,  numpy.asarray(bsum))
+	if len(shape)>0:
+            #test inplace version, not implemented with 0 dims
+            b0 += b1
+            a0 += a1
+            assert numpy.allclose(a0, numpy.asarray(b0))
+            assert numpy.allclose(a0,a1*2)
        if len(shape)==2:
            #test not contiguous version.
            #should raise not implemented.