merge

85b0821d · James Bergstra · 09270345 · 8199e0df · 85b0821d · 85b0821d
--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -1414,10 +1414,12 @@ class DebugMode(Mode):
            check_c_code=None,
            check_py_code=None,
            check_isfinite=None,
-            require_matching_strides=None):
+            require_matching_strides=None,
+            linker=None):
        """Initialize member variables.

        If any of these arguments (except optimizer) is not None, it overrides the class default.
+        The linker argument is not used. It is accepted here so that Mode.requiring() and some other functions work with DebugMode too.
        """
        super(DebugMode, self).__init__(
                optimizer=optimizer,

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -12,8 +12,6 @@ from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
 import opt
 import cuda_ndarray

-import theano.compile.sandbox
-
 import os
 import theano.config as config
 from theano.compile import optdb

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -583,7 +583,7 @@ class GpuSum(Op):

    def _k_init(self, *args):
        return """
-                const int threadCount = blockDim.x * blockDim.y * blockDim.y;
+                const int threadCount = blockDim.x * blockDim.y * blockDim.z;
                const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
                extern __shared__ float buf[];
                float mysum = 0.0f;
@@ -835,6 +835,38 @@ class GpuSum(Op):
        }
        """ % locals()

+    def c_code_reduce_1111(self, sio, node, name, x, z, fail):
+        makecall = self._makecall(node, name, x, z, fail)
+        print >> sio, """
+        {
+            int verbose = 0;
+            dim3 n_threads(
+                    std::min(CudaNdarray_HOST_DIMS(%(x)s)[2],
+                            NUM_VECTOR_OP_THREADS_PER_BLOCK));
+
+            //get as many y threads as we can fit
+            while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
+            {
+                if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1])
+                    break;
+                n_threads.y += 1;
+            }
+            n_threads.y -= 1;
+
+            //get as many z threads as we can fit
+            while (n_threads.x * n_threads.y * n_threads.z <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
+            {
+                if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
+                    break;
+                n_threads.z += 1;
+            }
+            n_threads.z -= 1;
+
+            dim3 n_blocks(1,1,1);
+            %(makecall)s
+        }
+        """ % locals()
+
    def c_code_reduce_1011(self, sio, node, name, x, z, fail):
        print >> sio, """
        {
@@ -892,7 +924,7 @@ class GpuSum(Op):

    def c_code_cache_version(self):
        #return ()
-        return (7,)
+        return (8,)


    def c_support_code_apply(self, node, nodename):
@@ -900,6 +932,7 @@ class GpuSum(Op):
        if self.reduce_mask == (1,):
            #this kernel is ok for up to a few thousand elements, but 
            # it only runs on ONE multiprocessor
+            reducebuf = self._k_reduce_buf('Z[0]')
            print >> sio, """
            static __global__ void kernel_reduce_sum_1_%(nodename)s(
                    const unsigned int d0,
@@ -921,36 +954,13 @@ class GpuSum(Op):
                    float Ai = A[i0 * sA0];
                    mysum += Ai;
                }
-                buf[threadNum] = mysum;
-                __syncthreads();
-
-                // rest of function is handled by one warp
-                if (threadNum < warpSize)
-                {
-                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
-                    {
-                        mysum += buf[i];
-                    }
-                    buf[threadNum] = mysum;
-                    if (threadNum < 16)
-                    {
-                        //reduce so that threadNum 0 has the sum of everything
-                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
-                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
-                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
-                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
-                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
-                        if (threadNum == 0)
-                        {
-                            Z[0] = buf[0];
-                        }
-                    }
-                }
+                %(reducebuf)s
            }
            """ %locals()
        if self.reduce_mask == (1,1):
            #this kernel is ok for up to a few thousand elements, but 
            # it only runs on ONE multiprocessor
+            reducebuf = self._k_reduce_buf('Z[0]')
            print >> sio, """
            static __global__ void kernel_reduce_sum_11_%(nodename)s(
                    const int d0,
@@ -976,31 +986,7 @@ class GpuSum(Op):
                        mysum += Ai;
                    }
                }
-                buf[threadNum] = mysum;
-                __syncthreads();
-
-                // rest of function is handled by one warp
-                if (threadNum < warpSize)
-                {
-                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
-                    {
-                        mysum += buf[i];
-                    }
-                    buf[threadNum] = mysum;
-                    if (threadNum < 16)
-                    {
-                        //reduce so that threadNum 0 has the sum of everything
-                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
-                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
-                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
-                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
-                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
-                        if (threadNum == 0)
-                        {
-                            Z[0] = buf[0];
-                        }
-                    }
-                }
+                %(reducebuf)s
            }
            """ %locals()
        if self.reduce_mask == (1,0):
@@ -1010,6 +996,7 @@ class GpuSum(Op):
            #TODO: This kernel is pretty inefficient in terms of reading, because if A is
            #      c_contiguous (typical case) then each warp is accessing non-contigous
            #      memory (a segment of a column).
+            reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]')
            print >> sio, """
            static __global__ void kernel_reduce_sum_10_%(nodename)s(
                    const int d0,
@@ -1032,31 +1019,7 @@ class GpuSum(Op):
                    float Ai = A[i0 * sA0 + blockIdx.x * sA1];
                    mysum += Ai;
                }
-                buf[threadNum] = mysum;
-                __syncthreads();
-
-                // rest of function is handled by one warp
-                if (threadNum < warpSize)
-                {
-                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
-                    {
-                        mysum += buf[i];
-                    }
-                    buf[threadNum] = mysum;
-                    if (threadNum < 16)
-                    {
-                        //reduce so that threadNum 0 has the sum of everything
-                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
-                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
-                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
-                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
-                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
-                        if (threadNum == 0)
-                        {
-                            Z[blockIdx.x * sZ0] = buf[0];
-                        }
-                    }
-                }
+                %(reducebuf)s
            }
            """ %locals()
        if self.reduce_mask == (1,1,0):
@@ -1146,6 +1109,7 @@ class GpuSum(Op):
        if self.reduce_mask == (0,0,1):
            # this kernel uses one block for each row, 
            # threads per block for each element per row.
+            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
            print >> sio, """
            static __global__ void kernel_reduce_sum_001_%(nodename)s(
                    const int d0,
@@ -1172,36 +1136,36 @@ class GpuSum(Op):
                        {
                            mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
                        }
-                        buf[threadNum] = mysum;
-                        __syncthreads();
-
-                        // rest of function is handled by one warp
-                        if (threadNum < warpSize)
+                        %(reducebuf)s
+                    }
+                }
+            }
+            """ %locals()
+        if self.reduce_mask == (1,1,1,1):
+            reducebuf = self._k_reduce_buf('Z[0]')
+            decl = self._k_decl(node, nodename)
+            init = self._k_init(node, nodename)
+            print >> sio, """
+            %(decl)s
+            {
+                %(init)s
+                mysum = 0;
+              for (int i0 = 0; i0 < d0; i0++)
+                for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)
+                {
+                    for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
+                    {
+                        for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
                        {
-                            for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
-                            {
-                                mysum += buf[i];
-                            }
-                            buf[threadNum] = mysum;
-                            if (threadNum < 16)
-                            {
-                                //reduce so that threadNum 0 has the sum of everything
-                                if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
-                                if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
-                                if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
-                                if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
-                                if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
-                                if (threadNum == 0)
-                                {
-                                    Z[i0 * sZ0 + i1 * sZ1] = buf[0];
-                                }
-                            }
+                            mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
                        }
                    }
                }
+                %(reducebuf)s
            }
            """ %locals()
        if self.reduce_mask == (1,0,1,1):
+            reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]')
            print >> sio, """
            static __global__ void kernel_reduce_sum_1011_%(nodename)s(
                    const unsigned int d0,
@@ -1232,31 +1196,7 @@ class GpuSum(Op):
                        }
                    }
                }
-                buf[threadNum] = mysum;
-                __syncthreads();
-
-                // rest of function is handled by one warp
-                if (threadNum < warpSize)
-                {
-                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
-                    {
-                        mysum += buf[i];
-                    }
-                    buf[threadNum] = mysum;
-                    if (threadNum < 16)
-                    {
-                        //reduce so that threadNum 0 has the sum of everything
-                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
-                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
-                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
-                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
-                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
-                        if (threadNum == 0)
-                        {
-                            Z[blockIdx.x*sZ0] = buf[0];
-                        }
-                    }
-                }
+                %(reducebuf)s
            }
            """ %locals()
        return sio.getvalue()

--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
+"""
+This file implements 3 different versions of the elemwise op on the GPU. Only NaiveAlgo is used, and it is not very naive now.
+
+The elemwise functions are also used with scalar operations! So ndim can be 0, as with all scalar types.
+"""
+
+
 import StringIO, sys
 import numpy
 from theano import Op, Type, Apply, Variable, Constant

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
 import sys, time
-from theano.compile.sandbox.sharedvalue import shared
-from theano.compile.sandbox.pfunc import pfunc
+
+from theano import shared
+from theano.compile.pfunc import pfunc
 from theano import tensor

 import numpy
+import theano
+import theano.tensor as T

 # Skip test if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
@@ -13,6 +16,7 @@ except ImportError:
    raise SkipTest('Optional package cuda_ndarray not available')

 import theano.sandbox.cuda as tcn
+import cuda_ndarray as cuda
 import theano.compile.mode

 mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
@@ -20,6 +24,63 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
 def tes_use():
    tcn.use()

+def test_sum():
+    """
+    test sum pattern 1, 11, 10, 100, 110, 001, 111, 1011, 1111
+    TODO: test with broadcast
+    """
+
+    for shape, pattern in [((5,),[0]),
+                           ((5,4),[0,1]),((5,4),[0]),
+                           ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
+                           ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
+        a = tensor.TensorType('float32',(False,)*len(shape))()
+        b = T.Sum(pattern)(a)
+        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
+#        val = numpy.ones(shape)
+#        val = numpy.arange(numpy.prod(shape)).reshape(shape)
+        val = numpy.asarray(val,dtype='float32')
+        f = theano.function([a],b, mode=mode_with_gpu)
+        f2 = theano.function([a],b)
+        assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
+        assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
+        assert numpy.allclose(f2(val),f(val))
+        
+
+        #test with strided (non-contiguous) inputs: every axis is sliced with ::2 below
+    for shape, pattern in [((5,),[0]),
+                           ((5,4),[0,1]),((5,4),[0]),
+                           ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
+                           ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
+        shape = numpy.asarray(shape)*2
+        a = tensor.TensorType('float32',(False,)*len(shape))()
+        a2 = tcn.CudaNdarrayType((False,)*len(shape))()
+        b = T.Sum(pattern)(a)
+        b2 = T.Sum(pattern)(a2)
+        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
+#        val = numpy.ones(shape)
+#        val = numpy.arange(numpy.prod(shape)).reshape(shape)
+        val = numpy.asarray(val,dtype='float32')
+        val2 = cuda.CudaNdarray(val)
+        if len(shape)==1:
+            val = val[::2]
+            val2 = val2[::2]
+        elif len(shape)==2:
+            val = val[::2,::2]
+            val2 = val2[::2,::2]
+        elif len(shape)==3:
+            val = val[::2,::2,::2]
+            val2 = val2[::2,::2,::2]
+        elif len(shape)==4:
+            val = val[::2,::2,::2,::2]
+            val2 = val2[::2,::2,::2,::2]
+        f = theano.function([a],b)
+        f2 = theano.function([a2],b2, mode=mode_with_gpu)
+        assert tcn.GpuSum in [x.op.__class__ for x in f2.maker.env.toposort()]
+        assert T.Sum in [x.op.__class__ for x in f.maker.env.toposort()]
+        assert numpy.allclose(f2(val2),f(val))
+        
+
 def test_elemwise0():

    a = tcn.shared_constructor(numpy.random.rand(4,4), 'a')

--- a/theano/sandbox/cuda/var.py
+++ b/theano/sandbox/cuda/var.py
@@ -2,7 +2,7 @@ import numpy

 from theano import Op, Type, Apply, Variable, Constant
 from theano import tensor
-from theano.compile.sandbox.sharedvalue import shared, SharedVariable, shared_constructor
+from theano.compile import shared, SharedVariable, shared_constructor

 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda.type_support import filter as type_support_filter