提交 578f4836 authored 作者: lamblin's avatar lamblin

Merge pull request #610 from nouiz/gpu_conv_faster

Gpu conv faster
......@@ -39,6 +39,12 @@ Interface changes
the provided value have. In the past, the error was at run time.
(Frederic B.)
Speed up
* Convolution on the GPU now checks the generation of the card to make
it faster in some cases (especially medium/big output image) (Frédéric B.)
(We hardcoded 512 as the maximum number of threads per block. Newer cards
support up to 1024 threads per block.)
New Features
* debugprint new param ids=["CHAR", "id", "int", ""]
This makes the identifier printed to be the python id, a unique char, a
......@@ -120,6 +126,9 @@ Crash Fix
* Work around a known issue with nvcc 4.1 on MacOS X. (Graham Taylor)
* In advanced indexing, if some inputs are constant, no need to call constant(...)
on their value any more. (Pascal L., reported by John Salvatier)
* Fix crash on GPU when the GpuSubtensor didn't put the right stride
when the result tensor had a dimension of size 1. (Pascal L.,
reported by Graham T.)
=============
Release Notes
......
import copy
import os
import StringIO
import theano
from theano import Apply
from theano import tensor
from theano.sandbox.cuda.type import CudaNdarrayType
......@@ -613,7 +615,8 @@ class GpuConv(GpuOp):
version=-1,
verbose=0,
kshp=None,
imshp=None):
imshp=None,
max_threads_dim0=None):
"""
:param version: each version of c_code implement many kernel for the
convolution. By default we try to guess the best one.
......@@ -629,6 +632,10 @@ class GpuConv(GpuOp):
:param imshp: The size of the image. Not used for code generation but
allow to select an experimental new version in another
repo.
:param max_threads_dim0: The maximum number of thread for the
block size dimensions 0 (blockDim.x) used by the
GPU function.
"""
self.border_mode = border_mode
self.subsample = subsample
......@@ -651,6 +658,7 @@ class GpuConv(GpuOp):
self.verbose = verbose
self.kshp = kshp
self.imshp = imshp
self.max_threads_dim0 = max_threads_dim0
def __eq__(self, other):
return type(self) == type(other) \
......@@ -662,7 +670,8 @@ class GpuConv(GpuOp):
and self.version == other.version \
and self.verbose == other.verbose \
and self.kshp == other.kshp\
and self.imshp == other.imshp
and self.imshp == other.imshp\
and self.max_threads_dim0 == other.max_threads_dim0
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -681,7 +690,8 @@ class GpuConv(GpuOp):
^ self.version \
^ hash(self.verbose) \
^ hash(self.kshp)\
^ hash(self.imshp)
^ hash(self.imshp)\
^ hash(self.max_threads_dim0)
def __str__(self):
return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
......@@ -704,6 +714,25 @@ class GpuConv(GpuOp):
False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
    """Build the thunk, resolving ``max_threads_dim0`` from the device.

    If ``max_threads_dim0`` was not given at construction time, query
    the active CUDA device for its ``maxThreadsDim0`` property and
    record it on a *copy* of the op.  The original Op instance is never
    mutated: Ops are shared and hashed, and ``max_threads_dim0`` takes
    part in ``__eq__``/``__hash__``, so mutating it in place would
    corrupt caches keyed on the op.
    """
    node_ = copy.copy(node)
    assert node.op is node_.op
    if node_.op.max_threads_dim0 is None:
        # Fix: the original code copied the op but then mutated
        # node_.op (which is still the shared original, since
        # copy.copy(node) is shallow).  Set the attribute on the copy
        # and swap the copy into the cloned node instead.
        op = copy.copy(node_.op)
        device_id = theano.sandbox.cuda.use.device_number[3:]
        if device_id == '':
            # No explicit device number (e.g. "gpu"): default to 0.
            device_id = 0
        cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
        prop = cuda_ndarray.device_properties(device_id)
        op.max_threads_dim0 = prop['maxThreadsDim0']
        node_.op = op
    return super(GpuConv, node_.op).make_thunk(node_, storage_map,
                                               compute_map, no_recycling)
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, "max_threads_dim0"):
self.max_threads_dim0 = None
def c_compile_args(self):
nb = 0
if self.kshp is not None:
......@@ -715,7 +744,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 18)
return (0, 19)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -734,6 +763,7 @@ class GpuConv(GpuOp):
version = self.version
verbose = self.verbose
sub = sub.copy()
max_threads_dim0 = self.max_threads_dim0
sub.update(locals())
return """
//Mandatory args
......@@ -764,7 +794,8 @@ class GpuConv(GpuOp):
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
%(out)s, mode,
dx, dy,
version, verbose);
version, verbose,
%(max_threads_dim0)s);
Py_XDECREF(%(out)s);
%(out)s = out2;
""" % sub
......
......@@ -10,7 +10,9 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray *
int
CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, int subsample_cols,
int version = -1, int verbose=0)
int version = -1, int verbose=0,
int max_threads_dim0 = 512
)
{
int work_complete = 0;
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
......@@ -149,7 +151,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
......@@ -157,7 +159,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block
out_wid<=max_threads_dim0 &&//Maximum of X threads for block.x
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch
......@@ -165,7 +167,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0)
nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch, nkern);
......@@ -208,10 +211,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
cudaGetErrorString(sts));
}
}
if (out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<512) &&//Maximum of 512 theads by block
out_wid<512 &&//Maximum of 512 theads by block
(version!=1 || out_size<=max_threads_dim0) &&//Maximum of X threads by block.x
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch_stack
{
......@@ -222,7 +226,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
......@@ -291,7 +295,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
" kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,",
" preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
......@@ -333,7 +337,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows
......@@ -386,17 +390,16 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
}
if (!subsample && out_contiguous &&
(version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_rows_stack
{
int nb_row=1;
int max_threads=512;
//TODO:if not c_contiguous, lower max_thread as we use 22
//registers by thread and we won't execute 2 block in one MP.
for(int i=2;i<=out_len;i++){
if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
if((i)*out_wid<=max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i;
}
......@@ -468,7 +471,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2
......@@ -477,7 +480,6 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
// version 9:we preload the full kernel
// version 10: load only a few row at a time.
int nb_row=1;
int max_threads=512;
int version_back = version;
//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP.
if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
......@@ -489,7 +491,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
k_size=kern_wid;
for(int i=2;i<=out_len;i++){
if(i*out_wid<max_threads && (i*img_wid + k_size)*sizeof(float)<shared_avail)
if(i*out_wid<=max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i;
}
......@@ -568,7 +570,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//version 13 load only 1 kernel row at a time.
if (!subsample &&
out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
(version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
//version 13 need a minimal kernel length as big as the split.
......@@ -598,7 +600,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
// constraint (b)
// (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++;
while(out_size*ceil_intdiv(kern_len,nb_split)>max_threads_dim0)
nb_split++;
// tentative estimates (prior to contraint c)
int thread_z=ceil_intdiv(kern_len,nb_split);
......@@ -881,7 +884,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows,
int subsample_cols, int version = -1, int verbose=0)
int subsample_cols, int version = -1, int verbose=0,
int max_threads_dim0=512)
{
//144 is the biggest static shared size used with compiling this file.
const int shared_avail = SHARED_SIZE - 150;
......@@ -1012,7 +1016,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample &&
out_contiguous &&
(version==3||version==4||version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack_padded
{
......@@ -1040,7 +1044,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
//for the limit to bu lower then 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1 && out_size>max_threads_dim0)version=4;
if(version==-1)version=3;
......@@ -1132,7 +1136,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && c_contiguous &&
(version==0||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch
......@@ -1174,7 +1178,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
}
if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
......@@ -1234,7 +1238,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
img_batch_stack_contiguous &&
out_contiguous &&
(version==2||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack
{
......@@ -1391,7 +1395,9 @@ PyObject *
CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
CudaNdarray * out, const int mode,
const int subsample_rows, const int subsample_cols,
const int version, const int verbose)
const int version, const int verbose,
const int max_threads_dim0 = 512
)
{
// Re-use the out object if possible. If the out object it not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1.
......@@ -1456,8 +1462,16 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
//rval might be null
}
if ((rval==NULL)
|| ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
|| ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
|| ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval,
subsample_rows,
subsample_cols,
version, verbose,
max_threads_dim0))
|| ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval,
subsample_rows,
subsample_cols,
version, verbose,
max_threads_dim0))
)
{
// if rval is something we just allocated,
......
......@@ -31,6 +31,16 @@ else:
cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4)
device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
cuda_ndarray.shared_constructor(numpy.zeros(2, dtype='float32'))
device_id = theano.sandbox.cuda.use.device_number
device_id = device_id[3:]
if device_id == '':
device_id = 0
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
device_prop = cuda_ndarray.device_properties(device_id)
def py_conv_valid_numpy(img, kern):
assert img.shape[1] == kern.shape[1]
......@@ -386,7 +396,7 @@ def test_valid_0_2():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if ishape[1] > 1:
continue
......@@ -417,7 +427,7 @@ def test_valid_1_3_11_12():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 >
(16 * 1024 - 150)):
......@@ -446,7 +456,7 @@ def test_valid_4():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if ishape[1] > 1:
continue
......@@ -478,7 +488,7 @@ def test_valid_5():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) >
(16 * 1024 - 150)):
......@@ -512,7 +522,7 @@ def test_valid_7_8_13():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[2] * oshape[3] > 512:
if oshape[2] * oshape[3] > device_prop['maxThreadsDim0']:
continue
if max(numpy.prod(ishape[2:]) * 4 + 2 * kshape[3] * 4,
oshape[2] * oshape[3] * 4 * 2) > (16 * 1024 - 150):
......@@ -543,7 +553,7 @@ def test_valid_9_10():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if (kshape[3] * 4 + ishape[3]) > (16 * 1024 - 150):
continue
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论