提交 0317cfca authored 作者: Frederic's avatar Frederic

Change the inequality to include max_threads_dim0 (use <= instead of <).

上级 dc2e2eee
...@@ -39,6 +39,12 @@ Interface changes ...@@ -39,6 +39,12 @@ Interface changes
the provided value have. In the past, the error was at run time. the provided value have. In the past, the error was at run time.
(Frederic B.) (Frederic B.)
Speed up
* Convolution on the GPU now checks the generation of the card to make
it faster in some cases (especially medium/big output images) (Frédéric B.)
(We hardcoded 512 as the maximum number of threads per block. Newer cards
support up to 1024 threads per block.)
New Features New Features
* debugprint new param ids=["CHAR", "id", "int", ""] * debugprint new param ids=["CHAR", "id", "int", ""]
This makes the identifier printed to be the python id, a unique char, a This makes the identifier printed to be the python id, a unique char, a
......
...@@ -741,7 +741,7 @@ class GpuConv(GpuOp): ...@@ -741,7 +741,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 18) return (0, 19)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
......
...@@ -151,7 +151,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -151,7 +151,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//condition for exec //condition for exec
if(!subsample && if(!subsample &&
out_contiguous && out_contiguous &&
out_size<max_threads_dim0 &&//Maximum of X threads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce! std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete) !work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed. version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
...@@ -159,7 +159,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -159,7 +159,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && c_contiguous && if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) && (version==0||version==2||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads for block.x out_wid<=max_threads_dim0 &&//Maximum of X threads for block.x
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch !work_complete) //conv_patch
...@@ -214,8 +214,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -214,8 +214,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (out_contiguous && if (out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) && (version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<max_threads_dim0) &&//Maximum of X threads by block.x (version!=1 || out_size<=max_threads_dim0) &&//Maximum of X threads by block.x
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch_stack !work_complete) //conv_patch_stack
{ {
...@@ -337,7 +337,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -337,7 +337,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==4||version==-1) && (version==4||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows !work_complete) //conv_rows
...@@ -390,7 +390,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -390,7 +390,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==5||version==-1) && (version==5||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_rows_stack !work_complete) //conv_rows_stack
...@@ -399,7 +399,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -399,7 +399,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//TODO:if not c_contiguous, lower max_thread as we use 22 //TODO:if not c_contiguous, lower max_thread as we use 22
//registers by thread and we won't execute 2 block in one MP. //registers by thread and we won't execute 2 block in one MP.
for(int i=2;i<=out_len;i++){ for(int i=2;i<=out_len;i++){
if((i)*out_wid<max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail) if((i)*out_wid<=max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i; nb_row=i;
} }
...@@ -471,7 +471,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -471,7 +471,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) && (version==9||version==10||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory (img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory (version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2 !work_complete) //conv_rows_stack2
...@@ -491,7 +491,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -491,7 +491,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
k_size=kern_wid; k_size=kern_wid;
for(int i=2;i<=out_len;i++){ for(int i=2;i<=out_len;i++){
if(i*out_wid<max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail) if(i*out_wid<=max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i; nb_row=i;
} }
...@@ -570,7 +570,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -570,7 +570,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//version 13 load only 1 kernel row at a time. //version 13 load only 1 kernel row at a time.
if (!subsample && if (!subsample &&
out_contiguous && out_contiguous &&
out_size<max_threads_dim0 &&//Maximum of X threads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
(version==7||version==8||version==13||version==-1) && (version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split. (version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
//version 13 need a minimal kernel length as big as the split. //version 13 need a minimal kernel length as big as the split.
...@@ -1016,7 +1016,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1016,7 +1016,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && if (!subsample &&
out_contiguous && out_contiguous &&
(version==3||version==4||version==5||version==-1) && (version==3||version==4||version==5||version==-1) &&
out_wid<max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack_padded !work_complete) //conv_full_patch_stack_padded
{ {
...@@ -1136,7 +1136,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1136,7 +1136,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && c_contiguous && if (!subsample && c_contiguous &&
(version==0||version==-1) && (version==0||version==-1) &&
out_size<max_threads_dim0 &&//Maximum of X threads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch !work_complete) //conv_full_patch
...@@ -1178,7 +1178,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1178,7 +1178,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
} }
if (false && !subsample && //disabled as test fail for this kernel if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) && (version==1||version==-1) &&
out_size<max_threads_dim0 &&//Maximum of X threads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing. (nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything !work_complete) //conv_full_load_everything
...@@ -1238,7 +1238,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1238,7 +1238,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
img_batch_stack_contiguous && img_batch_stack_contiguous &&
out_contiguous && out_contiguous &&
(version==2||version==-1) && (version==2||version==-1) &&
out_size<max_threads_dim0 &&//Maximum of X threads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack !work_complete) //conv_full_patch_stack
{ {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论