提交 0317cfca authored 作者: Frederic's avatar Frederic

Change the inequalities to include max_threads_dim0 (use <= instead of < when comparing thread counts against the per-block maximum).

上级 dc2e2eee
......@@ -39,6 +39,12 @@ Interface changes
the provided value have. In the past, the error was at run time.
(Frederic B.)
Speed up
* Convolution on the GPU now checks the generation of the card to make
  it faster in some cases (especially medium/big output images) (Frédéric B.)
  (We hardcoded 512 as the maximum number of threads per block. Newer cards
  support up to 1024 threads per block.)
New Features
* debugprint new param ids=["CHAR", "id", "int", ""]
This makes the identifier printed to be the python id, a unique char, a
......
......@@ -741,7 +741,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 18)
return (0, 19)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......
......@@ -151,7 +151,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<max_threads_dim0 &&//Maximum of X threads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
......@@ -159,7 +159,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads for block.x
out_wid<=max_threads_dim0 &&//Maximum of X threads for block.x
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch
......@@ -214,8 +214,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<max_threads_dim0) &&//Maximum of X threads by block.x
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x
(version!=1 || out_size<=max_threads_dim0) &&//Maximum of X threads by block.x
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch_stack
{
......@@ -337,7 +337,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows
......@@ -390,7 +390,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
}
if (!subsample && out_contiguous &&
(version==5||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_rows_stack
......@@ -399,7 +399,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//TODO:if not c_contiguous, lower max_thread as we use 22
//registers by thread and we won't execute 2 block in one MP.
for(int i=2;i<=out_len;i++){
if((i)*out_wid<max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
if((i)*out_wid<=max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i;
}
......@@ -471,7 +471,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2
......@@ -491,7 +491,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
k_size=kern_wid;
for(int i=2;i<=out_len;i++){
if(i*out_wid<max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail)
if(i*out_wid<=max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i;
}
......@@ -570,7 +570,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//version 13 load only 1 kernel row at a time.
if (!subsample &&
out_contiguous &&
out_size<max_threads_dim0 &&//Maximum of X threads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
(version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
//version 13 need a minimal kernel length as big as the split.
......@@ -1016,7 +1016,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample &&
out_contiguous &&
(version==3||version==4||version==5||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack_padded
{
......@@ -1136,7 +1136,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && c_contiguous &&
(version==0||version==-1) &&
out_size<max_threads_dim0 &&//Maximum of X threads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch
......@@ -1178,7 +1178,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
}
if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) &&
out_size<max_threads_dim0 &&//Maximum of X threads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
......@@ -1238,7 +1238,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
img_batch_stack_contiguous &&
out_contiguous &&
(version==2||version==-1) &&
out_size<max_threads_dim0 &&//Maximum of X threads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack
{
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论