make GpuImages2Neibs faster.

5ab9d1ac · Frederic Bastien · 049b6ed3 · 5ab9d1ac · 5ab9d1ac
--- a/theano/sandbox/neighbours.py
+++ b/theano/sandbox/neighbours.py
@@ -252,7 +252,7 @@ class GpuImages2Neibs(Images2Neibs):
                                                                dtype=ten4.type.dtype)()])
    def c_code_cache_version(self):
-        return (4,)
+        return (5,)
    def c_support_code_apply(self, node, nodename):
        mode = self.mode
@@ -276,7 +276,7 @@ class GpuImages2Neibs(Images2Neibs):
            const int wrap_centered_idx_shift_x = c/2;
            const int wrap_centered_idx_shift_y = d/2;
-            for(int tblock = blockIdx.x;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x){
+            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
@@ -423,14 +423,22 @@ class GpuImages2Neibs(Images2Neibs):
            const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1);
+            dim3 n_threads(c,d,1);
+            //There is a max of 512 threads per block
+            while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--; 
+            while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--; 
+            //For small patch sizes, grow the block along z to get a better memory
+            //access pattern and a higher core utilisation.
+            while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 && n_threads.z<CudaNdarray_HOST_DIMS(%(z)s)[0]){
+                n_threads.z++;
+            }
            int nb_block;
-            if (nb_batch %% 32 == 0)
+            if (CudaNdarray_HOST_DIMS(%(z)s)[0] %% n_threads.z == 0)
-                nb_block = nb_batch/32;
+                nb_block = CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z;
            else
-                nb_block = (int)((float)nb_batch/32. + 1.); 
+                nb_block = (CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z) + 1;
+            dim3 n_blocks(std::min(32*1024,nb_block));
-            dim3 n_blocks(std::min(32*1024,CudaNdarray_HOST_DIMS(%(z)s)[0]),1,1);
-            dim3 n_threads(c,d,1);
            int n_shared = 0;
            k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(                

--- a/theano/sandbox/test_neighbours.py
+++ b/theano/sandbox/test_neighbours.py
@@ -278,10 +278,11 @@ def test_neibs_wrap_centered_step_manual():
 def test_neibs_gpu():
    if cuda.cuda_available == False:
       raise SkipTest('Optional package cuda disabled')
+    for shape, pshape in [((100,40,18,18),(2,2)),
+                          ((10,40,66,66),(33,33))]:
-    shape = (100,40,18,18)
        images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
-    neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32'))
+        neib_shape = T.as_tensor_variable(pshape)
        from theano.sandbox.cuda.basic_ops import gpu_from_host