提交 0317cfca authored 作者: Frederic's avatar Frederic

Change the inequality to include max_threads_dim0 (use <= instead of <).

上级 dc2e2eee
...@@ -39,6 +39,12 @@ Interface changes ...@@ -39,6 +39,12 @@ Interface changes
the provided value have. In the past, the error was at run time. the provided value have. In the past, the error was at run time.
(Frederic B.) (Frederic B.)
Speed up
* Convolution on the GPU now checks the generation of the card to make
it faster in some cases (especially medium/big output images) (Frédéric B.)
(We hardcoded 512 as the maximum number of threads per block. Newer cards
support up to 1024 threads per block.)
New Features New Features
* debugprint new param ids=["CHAR", "id", "int", ""] * debugprint new param ids=["CHAR", "id", "int", ""]
This makes the identifier printed to be the python id, a unique char, a This makes the identifier printed to be the python id, a unique char, a
......
...@@ -741,7 +741,7 @@ class GpuConv(GpuOp): ...@@ -741,7 +741,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 18) return (0, 19)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
......
...@@ -151,7 +151,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -151,7 +151,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//condition for exec //condition for exec
if(!subsample && if(!subsample &&
out_contiguous && out_contiguous &&
out_size<max_threads_dim0 &&//Maximum of X threads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce! std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete) !work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed. version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
...@@ -159,7 +159,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -159,7 +159,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && c_contiguous && if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) && (version==0||version==2||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads for block.x out_wid<=max_threads_dim0 &&//Maximum of X threads for block.x
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch !work_complete) //conv_patch
...@@ -214,8 +214,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -214,8 +214,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (out_contiguous && if (out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) && (version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<max_threads_dim0) &&//Maximum of X threads by block.x (version!=1 || out_size<=max_threads_dim0) &&//Maximum of X threads by block.x
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch_stack !work_complete) //conv_patch_stack
{ {
...@@ -337,7 +337,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -337,7 +337,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==4||version==-1) && (version==4||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows !work_complete) //conv_rows
...@@ -390,7 +390,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -390,7 +390,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==5||version==-1) && (version==5||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_rows_stack !work_complete) //conv_rows_stack
...@@ -399,7 +399,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -399,7 +399,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//TODO:if not c_contiguous, lower max_thread as we use 22 //TODO:if not c_contiguous, lower max_thread as we use 22
//registers by thread and we won't execute 2 block in one MP. //registers by thread and we won't execute 2 block in one MP.
for(int i=2;i<=out_len;i++){ for(int i=2;i<=out_len;i++){
if((i)*out_wid<max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail) if((i)*out_wid<=max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i; nb_row=i;
} }
...@@ -471,7 +471,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -471,7 +471,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) && (version==9||version==10||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory (img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory (version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2 !work_complete) //conv_rows_stack2
...@@ -491,7 +491,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -491,7 +491,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
k_size=kern_wid; k_size=kern_wid;
for(int i=2;i<=out_len;i++){ for(int i=2;i<=out_len;i++){
if(i*out_wid<max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail) if(i*out_wid<=max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i; nb_row=i;
} }
...@@ -570,7 +570,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -570,7 +570,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//version 13 load only 1 kernel row at a time. //version 13 load only 1 kernel row at a time.
if (!subsample && if (!subsample &&
out_contiguous && out_contiguous &&
out_size<max_threads_dim0 &&//Maximum of X threads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
(version==7||version==8||version==13||version==-1) && (version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split. (version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
//version 13 need a minimal kernel length as big as the split. //version 13 need a minimal kernel length as big as the split.
...@@ -1016,7 +1016,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1016,7 +1016,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && if (!subsample &&
out_contiguous && out_contiguous &&
(version==3||version==4||version==5||version==-1) && (version==3||version==4||version==5||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack_padded !work_complete) //conv_full_patch_stack_padded
{ {
...@@ -1136,7 +1136,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1136,7 +1136,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && c_contiguous && if (!subsample && c_contiguous &&
(version==0||version==-1) && (version==0||version==-1) &&
out_size<max_threads_dim0 &&//Maximum of X threads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch !work_complete) //conv_full_patch
...@@ -1178,7 +1178,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1178,7 +1178,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
} }
if (false && !subsample && //disabled as test fail for this kernel if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) && (version==1||version==-1) &&
out_size<max_threads_dim0 &&//Maximum of X threads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing. (nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything !work_complete) //conv_full_load_everything
...@@ -1238,7 +1238,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1238,7 +1238,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
img_batch_stack_contiguous && img_batch_stack_contiguous &&
out_contiguous && out_contiguous &&
(version==2||version==-1) && (version==2||version==-1) &&
out_size<max_threads_dim0 &&//Maximum of X threads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack !work_complete) //conv_full_patch_stack
{ {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论