提交 2faeb62c authored 作者: lamblin's avatar lamblin

Merge pull request #1437 from nouiz/gpu_iadd

Gpu iadd for 6d tensor
......@@ -1389,6 +1389,45 @@ __global__ void k_ielem_4(const int d0, const int d1, const int d2, const int d3
}
}
// In-place elementwise kernel for 6-dimensional tensors: updates `a`
// in place from `b` (a op= b), where the operation is chosen at
// compile time by the `operator_num` template parameter (IADD, IDIV,
// or plain element copy for CPY).
//
// Launch layout: the three outer dimensions (d0..d2) are walked with
// block-stride loops over blockIdx.{x,y,z} and the three inner
// dimensions (d3..d5) with thread-stride loops over
// threadIdx.{x,y,z}, so any grid/block configuration covers the full
// tensor. sA*/sB* are per-dimension strides in float elements (the
// pointers are indexed directly with i*stride), so arbitrary
// (e.g. negative or broadcast) layouts are supported.
template <int operator_num>
__global__ void k_ielem_6(const int d0, const int d1,
        const int d2, const int d3,
        const int d4, const int d5,
        float* a, const int sA0, const int sA1,
        const int sA2, const int sA3,
        const int sA4, const int sA5,
        const float* b, const int sB0, const int sB1,
        const int sB2, const int sB3,
        const int sB4, const int sB5
){
    for (int j0 = blockIdx.x; j0 < d0; j0 += gridDim.x) {
        for (int j1 = blockIdx.y; j1 < d1; j1 += gridDim.y) {
            for (int j2 = blockIdx.z; j2 < d2; j2 += gridDim.z) {
                // Partial offsets for the three outer dimensions are
                // hoisted out of the thread-level loops.
                const int offA = j0*sA0 + j1*sA1 + j2*sA2;
                const int offB = j0*sB0 + j1*sB1 + j2*sB2;
                for (int j3 = threadIdx.x; j3 < d3; j3 += blockDim.x) {
                    for (int j4 = threadIdx.y; j4 < d4; j4 += blockDim.y) {
                        for (int j5 = threadIdx.z; j5 < d5; j5 += blockDim.z) {
                            float* pa =
                                a + offA + j3*sA3 + j4*sA4 + j5*sA5;
                            const float* pb =
                                b + offB + j3*sB3 + j4*sB4 + j5*sB5;
                            // operator_num is a compile-time constant,
                            // so this switch costs nothing at runtime.
                            switch (operator_num) {
                            case IADD:
                                *pa += *pb;
                                break;
                            case IDIV:
                                *pa /= *pb;
                                break;
                            case CPY:
                                *pa = *pb;
                                break;
                            }
                        }
                    }
                }
            }
        }
    }
}
/*
CudaNdarray_inplace_elemwise
Compute elemwise, working inplace on A.
......@@ -1415,19 +1454,31 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
const int, const int,
const float*, const int, const int,
const int, const int);
void (*k6)(const int, const int,
const int, const int,
const int, const int,
float*, const int, const int,
const int, const int,
const int, const int,
const float*, const int, const int,
const int, const int,
const int, const int);
switch (fct_nb)
{
case IADD:
k3 = k_ielem_3<IADD>;
k4 = k_ielem_4<IADD>;
k6 = k_ielem_6<IADD>;
break;
case IDIV:
k3 = k_ielem_3<IDIV>;
k4 = k_ielem_4<IDIV>;
k6 = k_ielem_6<IDIV>;
break;
case CPY:
k3 = k_ielem_3<CPY>;
k4 = k_ielem_4<CPY>;
k6 = k_ielem_6<CPY>;
break;
default:
assert (0);
......@@ -1769,6 +1820,61 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
}
}
break;
case 6:
{
    // Grid covers the three outermost dimensions. blockIdx.x is
    // capped at NUM_VECTOR_OP_BLOCKS up front; the y and z extents
    // are then halved until the total block count fits under the same
    // cap. k_ielem_6's block-stride loops pick up any remainder, so
    // shrinking the grid never drops elements.
    dim3 n_blocks(
            std::min(
                CudaNdarray_HOST_DIMS(self)[0],
                NUM_VECTOR_OP_BLOCKS),
            CudaNdarray_HOST_DIMS(self)[1],
            CudaNdarray_HOST_DIMS(self)[2]
            );
    while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
        n_blocks.y /= 2;
    while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
        n_blocks.z /= 2;
    // Threads only span dimension 3; blockDim.y and blockDim.z stay
    // at 1, so each thread iterates dimensions 4 and 5 sequentially
    // via the kernel's thread-stride loops. This is correct, though
    // spreading threads over dims 4/5 as well might improve
    // parallelism for shapes where dim 3 is small.
    dim3 n_threads(
            std::min(
                CudaNdarray_HOST_DIMS(self)[3],
                NUM_VECTOR_OP_THREADS_PER_BLOCK)
            );
    k6<<<n_blocks, n_threads>>>(
            CudaNdarray_HOST_DIMS(self)[0],
            CudaNdarray_HOST_DIMS(self)[1],
            CudaNdarray_HOST_DIMS(self)[2],
            CudaNdarray_HOST_DIMS(self)[3],
            CudaNdarray_HOST_DIMS(self)[4],
            CudaNdarray_HOST_DIMS(self)[5],
            CudaNdarray_DEV_DATA(self),
            CudaNdarray_HOST_STRIDES(self)[0],
            CudaNdarray_HOST_STRIDES(self)[1],
            CudaNdarray_HOST_STRIDES(self)[2],
            CudaNdarray_HOST_STRIDES(self)[3],
            CudaNdarray_HOST_STRIDES(self)[4],
            CudaNdarray_HOST_STRIDES(self)[5],
            CudaNdarray_DEV_DATA(other),
            other_strides[0],
            other_strides[1],
            other_strides[2],
            other_strides[3],
            other_strides[4],
            other_strides[5]);
    CNDA_THREAD_SYNC;
    // Kernel launches don't return errors directly; fetch the
    // deferred status after the sync.
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        // Report the 6d kernel by name (the original message said
        // "k4", copy-pasted from the 4d case).
        PyErr_Format(
                PyExc_RuntimeError,
                "Cuda error: %s: %s.\n",
                "k6",
                cudaGetErrorString(err));
        Py_XDECREF(new_other);
        return -1;
    }
}
break;
default:
{
PyErr_Format(
......
......@@ -10,7 +10,7 @@ import theano.sandbox.cuda as cuda_ndarray
from theano.tensor.basic import _allclose
from theano.tests import unittest_tools as utt
if cuda_ndarray.cuda_available == False:
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda disabled')
......@@ -29,14 +29,16 @@ def advantage(cpu_dt, gpu_dt):
else:
return cpu_dt / gpu_dt
def test_host_to_device():
#print >>sys.stdout, 'starting test_host_to_dev'
for shape in ((), (3,), (2,3), (3,4,5,6)):
for shape in ((), (3,), (2, 3), (3, 4, 5, 6)):
a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
c = numpy.asarray(b)
assert numpy.all(a == c)
def test_add_iadd_idiv():
for shapes in (
[(5,5),(5,1)],
......@@ -51,8 +53,18 @@ def test_add_iadd_idiv():
(3,34,35,36,37),
(33,34,3,36,37),
(33,34,35,36,3),
(0,0,0,0,0,0),
(3,34,35,36,37,2),
(33,34,3,36,37,2),
(33,34,35,36,3,2),
(3,4,5,6,7,1025),
(3,4,5,6,1025,7),
(3,4,5,1025,6,7),
(3,4,1025,5,6,7),
(3,1025,4,5,6,7),
(1025,3,4,5,6,7),
):
if isinstance(shapes,tuple):
if isinstance(shapes, tuple):
shape = shapes
shape2 = shapes
a0 = theano._asarray(numpy.random.rand(*shape), dtype='float32')
......@@ -91,7 +103,7 @@ def test_add_iadd_idiv():
#should raise not implemented.
a0 = a0_orig.copy()
b0 = cuda_ndarray.CudaNdarray(a0)
if len(shape)==0:
if len(shape) == 0:
continue
elif len(shape) == 1:
_b = b1[::-1]
......@@ -103,6 +115,10 @@ def test_add_iadd_idiv():
_b = b1[::, ::, ::, ::-1]
elif len(shape) == 5:
_b = b1[::, ::, ::, ::, ::-1]
elif len(shape) == 6:
_b = b1[::, ::, ::, ::, ::, ::-1]
else:
raise Exception("You need to modify this case!")
# TODO: b0[...,::-1] don't work
if shape == shape2:
......@@ -141,11 +157,15 @@ def test_add_iadd_idiv():
b0 /= _b
a0 /= a1[..., ::-1]
assert numpy.allclose(a0, numpy.asarray(b0))
assert numpy.allclose(a0, ((a0_orig+a1)/a1+a1[..., ::-1])/a1[..., ::-1])
assert numpy.allclose(a0, ((a0_orig + a1) / a1 +
a1[..., ::-1]) / a1[..., ::-1])
def test_exp():
#print >>sys.stdout, 'starting test_exp'
for shape in ((), (3,), (2,3), (1,10000000),(10,1000000), (100,100000),(1000,10000),(10000,1000)):
for shape in ((), (3,), (2, 3),
(1, 10000000), (10, 1000000),
(100, 100000), (1000, 10000), (10000, 1000)):
a0 = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a1 = a0.copy()
b0 = cuda_ndarray.CudaNdarray(a0)
......@@ -181,90 +201,108 @@ def test_copy():
assert numpy.allclose(a, numpy.asarray(b))
assert numpy.allclose(a, numpy.asarray(c))
assert numpy.allclose(a, numpy.asarray(d))
b+=b
b += b
assert numpy.allclose(a+a, numpy.asarray(b))
assert numpy.allclose(a+a, numpy.asarray(c))
assert numpy.allclose(a, numpy.asarray(d))
def test_nvcc_bug():
"""
The fct k_elemwise_unary_rowmajor_copy(used by cuda.copy()) in cuda_ndarray.cu
is not well compiled with nvcc 3.0 and 3.1 beta. We found a workaround, so it
sould work correctly. Without the workaround, this test fail.
"""
shape = (5,4)
shape = (5, 4)
aa = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a = aa[::,::-1]
a = aa[::, ::-1]
b = cuda_ndarray.CudaNdarray(aa)[::,::-1]
b = cuda_ndarray.CudaNdarray(aa)[::, ::-1]
c = copy.copy(b)
d = copy.deepcopy(b)
assert numpy.allclose(a, numpy.asarray(b))
assert numpy.allclose(a, numpy.asarray(c))
assert numpy.allclose(a, numpy.asarray(d))
b+=b
b += b
assert numpy.allclose(a+a, numpy.asarray(b))
assert numpy.allclose(a+a, numpy.asarray(c))
assert numpy.allclose(a, numpy.asarray(d))
class test_DimShuffle(unittest.TestCase):
def test_dimshuffle(self):
utt.seed_rng()
rng = numpy.random.RandomState(utt.fetch_seed())
# 2d -> 0d
a = theano._asarray(rng.randn(1,1), dtype='float32')
a = theano._asarray(rng.randn(1, 1), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(numpy.transpose(a), cuda_ndarray.dimshuffle(b,()))
assert numpy.allclose(numpy.transpose(a),
cuda_ndarray.dimshuffle(b, ()))
# Test when we drop a axis that don't have shape 1
a = theano._asarray(rng.randn(2,1), dtype='float32')
a = theano._asarray(rng.randn(2, 1), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
self.assertRaises(ValueError, cuda_ndarray.dimshuffle, b,())
self.assertRaises(ValueError, cuda_ndarray.dimshuffle, b, ())
# Test that we can't take a dimensions multiple time
a = theano._asarray(rng.randn(2,1), dtype='float32')
a = theano._asarray(rng.randn(2, 1), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
self.assertRaises(ValueError, cuda_ndarray.dimshuffle, b,(1,1))
self.assertRaises(ValueError, cuda_ndarray.dimshuffle, b, (1, 1))
# 1d
a = theano._asarray(rng.randn(3,), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(numpy.transpose(a), cuda_ndarray.dimshuffle(b,(0,)))
assert numpy.allclose(a[None,:,None], cuda_ndarray.dimshuffle(b,(-1,0,-1)))
assert numpy.allclose(numpy.transpose(a),
cuda_ndarray.dimshuffle(b, (0,)))
assert numpy.allclose(a[None, :, None],
cuda_ndarray.dimshuffle(b, (-1, 0, -1)))
# 2d
a = theano._asarray(rng.randn(3,11), dtype='float32')
a = theano._asarray(rng.randn(3, 11), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(numpy.transpose(a), cuda_ndarray.dimshuffle(b,(1,0)))
assert numpy.allclose(numpy.transpose(a)[None,:,None,:,None], cuda_ndarray.dimshuffle(b,(-1,1,-1,0,-1)))
assert numpy.allclose(numpy.transpose(a),
cuda_ndarray.dimshuffle(b, (1, 0)))
assert numpy.allclose(numpy.transpose(a)[None, :, None, :, None],
cuda_ndarray.dimshuffle(b, (-1, 1, -1, 0, -1)))
# 2d -> 1d
a = theano._asarray(rng.randn(1,11), dtype='float32')
a = theano._asarray(rng.randn(1, 11), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(a[:,], cuda_ndarray.dimshuffle(b,(1,)))
a = theano._asarray(rng.randn(11,1), dtype='float32')
assert numpy.allclose(a[:],
cuda_ndarray.dimshuffle(b, (1,)))
a = theano._asarray(rng.randn(11, 1), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(a.reshape((11,)), cuda_ndarray.dimshuffle(b,(0,)))
assert numpy.allclose(a.reshape((11,)),
cuda_ndarray.dimshuffle(b, (0,)))
# 3d
a = theano._asarray(rng.randn(3,4,5), dtype='float32')
a = theano._asarray(rng.randn(3, 4, 5), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(a, cuda_ndarray.dimshuffle(b,(0,1,2)))
assert numpy.allclose(numpy.swapaxes(a,0,1), cuda_ndarray.dimshuffle(b,(1,0,2)))
assert numpy.allclose(numpy.swapaxes(a,0,2), cuda_ndarray.dimshuffle(b,(2,1,0)))
assert numpy.allclose(numpy.swapaxes(a,1,2), cuda_ndarray.dimshuffle(b,(0,2,1)))
assert numpy.allclose(numpy.swapaxes(a,1,2)[None,:,None,:,:,None], cuda_ndarray.dimshuffle(b,(-1,0,-1,2,1,-1)))
assert numpy.allclose(a, cuda_ndarray.dimshuffle(b, (0, 1, 2)))
assert numpy.allclose(numpy.swapaxes(a, 0, 1),
cuda_ndarray.dimshuffle(b, (1, 0, 2)))
assert numpy.allclose(numpy.swapaxes(a, 0, 2),
cuda_ndarray.dimshuffle(b, (2, 1, 0)))
assert numpy.allclose(numpy.swapaxes(a, 1, 2),
cuda_ndarray.dimshuffle(b, (0, 2, 1)))
assert numpy.allclose(numpy.swapaxes(a, 1, 2)[None, :, None, :, :, None],
cuda_ndarray.dimshuffle(b, (-1, 0, -1, 2, 1, -1)))
# 4d
a = theano._asarray(rng.randn(3,11,4,5), dtype='float32')
a = theano._asarray(rng.randn(3, 11, 4, 5), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(numpy.swapaxes(a,0,1), cuda_ndarray.dimshuffle(b,(1,0,2,3)))
assert numpy.allclose(numpy.swapaxes(a,0,2), cuda_ndarray.dimshuffle(b,(2,1,0,3)))
assert numpy.allclose(numpy.swapaxes(a,0,3), cuda_ndarray.dimshuffle(b,(3,1,2,0)))
assert numpy.allclose(numpy.swapaxes(a,0,3), cuda_ndarray.dimshuffle(b,(3,1,2,0)))
assert numpy.allclose(numpy.swapaxes(a,0,3)[None,:,None,:,:,:], cuda_ndarray.dimshuffle(b,(-1,3,-1,1,2,0)))
assert numpy.allclose(numpy.swapaxes(a, 0, 1),
cuda_ndarray.dimshuffle(b, (1, 0, 2, 3)))
assert numpy.allclose(numpy.swapaxes(a, 0, 2),
cuda_ndarray.dimshuffle(b, (2, 1, 0, 3)))
assert numpy.allclose(numpy.swapaxes(a, 0, 3),
cuda_ndarray.dimshuffle(b, (3, 1, 2, 0)))
assert numpy.allclose(numpy.swapaxes(a, 0, 3),
cuda_ndarray.dimshuffle(b, (3, 1, 2, 0)))
assert numpy.allclose(numpy.swapaxes(a, 0, 3)[None, :, None, :, :, :],
cuda_ndarray.dimshuffle(b, (-1, 3, -1, 1, 2, 0)))
def test_dot():
......@@ -281,66 +319,72 @@ def test_dot():
assert _allclose(numpy.dot(a0, a1), cuda_ndarray.dot(b0, b1))
a1 = theano._asarray(rng.randn(6, 7), dtype='float32')
b1 = cuda_ndarray.CudaNdarray(a1)
numpy_version = numpy.dot(a0, a1.T)
transposed = cuda_ndarray.dimshuffle(b1,(1,0))
cuda_version = cuda_ndarray.dot(b0, transposed)
transposed = cuda_ndarray.dimshuffle(b1, (1, 0))
cuda_version = cuda_ndarray.dot(b0, transposed)
assert _allclose(numpy_version, cuda_version)
a1 = theano._asarray(rng.randn(7, 6), dtype='float32')
b1 = cuda_ndarray.CudaNdarray(a1)
a0 = theano._asarray(rng.randn(7, 4), dtype='float32')
b0 = cuda_ndarray.CudaNdarray(a0)
assert _allclose(numpy.dot(a0.T, a1),
cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0,(1,0)), b1))
cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0, (1, 0)), b1))
a1 = theano._asarray(rng.randn(6, 7), dtype='float32')
b1 = cuda_ndarray.CudaNdarray(a1)
assert _allclose(numpy.dot(a0.T, a1.T),
cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0,(1,0)),
cuda_ndarray.dimshuffle(b1,(1,0))))
cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0, (1, 0)),
cuda_ndarray.dimshuffle(b1, (1, 0))))
def test_sum():
shape = (2,3)
a0 = theano._asarray(numpy.arange(shape[0]*shape[1]).reshape(shape), dtype='float32')
shape = (2, 3)
a0 = theano._asarray(numpy.arange(shape[0] * shape[1]).reshape(shape),
dtype='float32')
b0 = cuda_ndarray.CudaNdarray(a0)
assert numpy.allclose(a0.sum(), numpy.asarray(b0.reduce_sum([1,1])))
assert numpy.allclose(a0.sum(),
numpy.asarray(b0.reduce_sum([1, 1])))
a0sum = a0.sum(axis=0)
b0sum = b0.reduce_sum([1,0])
b0sum = b0.reduce_sum([1, 0])
#print 'asum\n',a0sum
#print 'bsum\n',numpy.asarray(b0sum)
assert numpy.allclose(a0.sum(axis=0), numpy.asarray(b0.reduce_sum([1,0])))
assert numpy.allclose(a0.sum(axis=1), numpy.asarray(b0.reduce_sum([0,1])))
assert numpy.allclose(a0, numpy.asarray(b0.reduce_sum([0,0])))
assert numpy.allclose(a0.sum(axis=0),
numpy.asarray(b0.reduce_sum([1, 0])))
assert numpy.allclose(a0.sum(axis=1),
numpy.asarray(b0.reduce_sum([0, 1])))
assert numpy.allclose(a0, numpy.asarray(b0.reduce_sum([0, 0])))
shape = (3,4,5,6,7,8)
a0 = theano._asarray(numpy.arange(3*4*5*6*7*8).reshape(shape), dtype='float32')
shape = (3, 4, 5, 6, 7, 8)
a0 = theano._asarray(numpy.arange(3 * 4 * 5 * 6 * 7 * 8).reshape(shape),
dtype='float32')
b0 = cuda_ndarray.CudaNdarray(a0)
assert numpy.allclose(a0.sum(axis=5).sum(axis=3).sum(axis=0), numpy.asarray(b0.reduce_sum([1,0,0,1,0,1])))
assert numpy.allclose(a0.sum(axis=5).sum(axis=3).sum(axis=0),
numpy.asarray(b0.reduce_sum([1, 0, 0, 1, 0, 1])))
shape = (16,2048)
a0 = theano._asarray(numpy.arange(16*2048).reshape(shape), dtype='float32')
shape = (16, 2048)
a0 = theano._asarray(numpy.arange(16 * 2048).reshape(shape),
dtype='float32')
b0 = cuda_ndarray.CudaNdarray(a0)
assert numpy.allclose(a0.sum(axis=0), numpy.asarray(b0.reduce_sum([1,0])))
assert numpy.allclose(a0.sum(axis=0), numpy.asarray(b0.reduce_sum([1, 0])))
shape = (16,10)
shape = (16, 10)
a0 = theano._asarray(numpy.arange(160).reshape(shape), dtype='float32')
b0 = cuda_ndarray.CudaNdarray(a0)
assert numpy.allclose(a0.sum(), numpy.asarray(b0.reduce_sum([1,1])))
assert numpy.allclose(a0.sum(), numpy.asarray(b0.reduce_sum([1, 1])))
def test_reshape():
shapelist = [
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论