Merge pull request #1437 from nouiz/gpu_iadd

Gpu iadd for 6d tensor

Merge pull request #1437 from nouiz/gpu_iadd
2faeb62c · lamblin · cd50d5ef · 9c43373e · 2faeb62c · 2faeb62c
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -1389,6 +1389,45 @@ __global__ void k_ielem_4(const int d0, const int d1, const int d2, const int d3
    }
 }

+template <int operator_num>
+__global__ void k_ielem_6(const int d0, const int d1,
+                          const int d2, const int d3,
+                          const int d4, const int d5,
+                          float* a, const int sA0, const int sA1,
+                          const int sA2, const int sA3,
+                          const int sA4, const int sA5,
+                          const float* b, const int sB0, const int sB1,
+                          const int sB2, const int sB3,
+                          const int sB4, const int sB5
+                          ){
+    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
+        for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
+            for (int i2 = blockIdx.z; i2 < d2; i2 += gridDim.z){
+                for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x){
+                    for (int i4 = threadIdx.y; i4 < d4; i4 += blockDim.y){
+                        for (int i5 = threadIdx.z; i5 < d5; i5 += blockDim.z){
+                            switch (operator_num) {
+                            case IADD:
+                                a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
+                                    += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
+                                break;
+                            case IDIV:
+                                a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
+                                    /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
+                                break;
+                            case CPY:
+                                a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
+                                    = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 /*
 CudaNdarray_inplace_elemwise
 Compute elemwise, working inplace on A.
@@ -1415,19 +1454,31 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
                    const int, const int,
                    const float*, const int, const int,
                    const int, const int);
+    void (*k6)(const int, const int,
+               const int, const int,
+               const int, const int,
+               float*, const int, const int,
+               const int, const int,
+               const int, const int,
+               const float*, const int, const int,
+               const int, const int,
+               const int, const int);
    switch (fct_nb)
    {
        case IADD:
            k3 = k_ielem_3<IADD>;
            k4 = k_ielem_4<IADD>;
+            k6 = k_ielem_6<IADD>;
            break;
        case IDIV:
            k3 = k_ielem_3<IDIV>;
            k4 = k_ielem_4<IDIV>;
+            k6 = k_ielem_6<IDIV>;
            break;
        case CPY:
            k3 = k_ielem_3<CPY>;
            k4 = k_ielem_4<CPY>;
+            k6 = k_ielem_6<CPY>;
            break;
        default:
            assert (0);
@@ -1769,6 +1820,61 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
                }
            }
            break;
+        case 6:
+            {
+                dim3 n_blocks(
+                        std::min(
+                            CudaNdarray_HOST_DIMS(self)[0],
+                            NUM_VECTOR_OP_BLOCKS),
+                        CudaNdarray_HOST_DIMS(self)[1],
+                        CudaNdarray_HOST_DIMS(self)[2]
+                        );
+                while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
+                    n_blocks.y /= 2;
+                while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
+                    n_blocks.z /= 2;
+                dim3 n_threads(
+                        std::min(
+                            CudaNdarray_HOST_DIMS(self)[3],
+                            NUM_VECTOR_OP_THREADS_PER_BLOCK)
+                    //TODO: DON"T YOU NEED OT PUT DIMS[4] in here???
+                    //TODO: DON"T YOU NEED OT PUT DIMS[5] in here???
+                            );
+                k6<<<n_blocks, n_threads>>>(
+                        CudaNdarray_HOST_DIMS(self)[0],
+                        CudaNdarray_HOST_DIMS(self)[1],
+                        CudaNdarray_HOST_DIMS(self)[2],
+                        CudaNdarray_HOST_DIMS(self)[3],
+                        CudaNdarray_HOST_DIMS(self)[4],
+                        CudaNdarray_HOST_DIMS(self)[5],
+                        CudaNdarray_DEV_DATA(self),
+                        CudaNdarray_HOST_STRIDES(self)[0],
+                        CudaNdarray_HOST_STRIDES(self)[1],
+                        CudaNdarray_HOST_STRIDES(self)[2],
+                        CudaNdarray_HOST_STRIDES(self)[3],
+                        CudaNdarray_HOST_STRIDES(self)[4],
+                        CudaNdarray_HOST_STRIDES(self)[5],
+                        CudaNdarray_DEV_DATA(other),
+                        other_strides[0],
+                        other_strides[1],
+                        other_strides[2],
+                        other_strides[3],
+                        other_strides[4],
+                        other_strides[5]);
+                CNDA_THREAD_SYNC;
+                cudaError_t err = cudaGetLastError();
+                if (cudaSuccess != err)
+                {
+                    PyErr_Format(
+                        PyExc_RuntimeError,
+                        "Cuda error: %s: %s.\n",
+                        "k4",
+                        cudaGetErrorString(err));
+                    Py_XDECREF(new_other);
+                    return -1;
+                }
+            }
+            break;
        default:
        {
            PyErr_Format(

--- a/theano/sandbox/cuda/tests/test_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_cuda_ndarray.py