提交 5ab9d1ac authored 作者: Frederic Bastien's avatar Frederic Bastien

make GpuImages2Neibs faster.

上级 049b6ed3
...@@ -252,7 +252,7 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -252,7 +252,7 @@ class GpuImages2Neibs(Images2Neibs):
dtype=ten4.type.dtype)()]) dtype=ten4.type.dtype)()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (5,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
mode = self.mode mode = self.mode
...@@ -276,7 +276,7 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -276,7 +276,7 @@ class GpuImages2Neibs(Images2Neibs):
const int wrap_centered_idx_shift_x = c/2; const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2; const int wrap_centered_idx_shift_y = d/2;
for(int tblock = blockIdx.x;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x){ for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x*blockDim.z){
const int b = tblock%%grid_d; const int b = tblock%%grid_d;
int left = tblock/grid_d; int left = tblock/grid_d;
const int a = left%%grid_c; const int a = left%%grid_c;
...@@ -423,14 +423,22 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -423,14 +423,22 @@ class GpuImages2Neibs(Images2Neibs):
const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0); const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0);
const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1); const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1);
dim3 n_threads(c,d,1);
//There is a maximum of 512 threads per block.
while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;
//For smaller patch sizes, make the block bigger to get a better memory access pattern
//and a higher core utilisation.
while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 && n_threads.z<CudaNdarray_HOST_DIMS(%(z)s)[0]){
n_threads.z++;
}
int nb_block; int nb_block;
if (nb_batch %% 32 == 0) if (CudaNdarray_HOST_DIMS(%(z)s)[0] %% n_threads.z == 0)
nb_block = nb_batch/32; nb_block = CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z;
else else
nb_block = (int)((float)nb_batch/32. + 1.); nb_block = (CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z) + 1;
dim3 n_blocks(std::min(32*1024,nb_block));
dim3 n_blocks(std::min(32*1024,CudaNdarray_HOST_DIMS(%(z)s)[0]),1,1);
dim3 n_threads(c,d,1);
int n_shared = 0; int n_shared = 0;
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>( k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
......
...@@ -278,10 +278,11 @@ def test_neibs_wrap_centered_step_manual(): ...@@ -278,10 +278,11 @@ def test_neibs_wrap_centered_step_manual():
def test_neibs_gpu(): def test_neibs_gpu():
if cuda.cuda_available == False: if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
for shape, pshape in [((100,40,18,18),(2,2)),
((10,40,66,66),(33,33))]:
shape = (100,40,18,18)
images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape)) images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32')) neib_shape = T.as_tensor_variable(pshape)
from theano.sandbox.cuda.basic_ops import gpu_from_host from theano.sandbox.cuda.basic_ops import gpu_from_host
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论