提交 578f4836 authored 作者: lamblin's avatar lamblin

Merge pull request #610 from nouiz/gpu_conv_faster

Gpu conv faster
...@@ -39,6 +39,12 @@ Interface changes ...@@ -39,6 +39,12 @@ Interface changes
the provided value have. In the past, the error was at run time. the provided value have. In the past, the error was at run time.
(Frederic B.) (Frederic B.)
Speed up
    * Convolution on the GPU now checks the generation of the card to make
      it faster in some cases (especially medium/big output images). (Frédéric B.)
      (We used to hardcode 512 as the maximum number of threads per block.
      Newer cards support up to 1024 threads per block.)
New Features New Features
* debugprint new param ids=["CHAR", "id", "int", ""] * debugprint new param ids=["CHAR", "id", "int", ""]
This makes the identifier printed to be the python id, a unique char, a This makes the identifier printed to be the python id, a unique char, a
...@@ -120,6 +126,9 @@ Crash Fix ...@@ -120,6 +126,9 @@ Crash Fix
* Work around a known issue with nvcc 4.1 on MacOS X. (Graham Taylon) * Work around a known issue with nvcc 4.1 on MacOS X. (Graham Taylon)
* In advanced indexing, if some inputs are constant, no need to call constant(...) * In advanced indexing, if some inputs are constant, no need to call constant(...)
on their value any more. (Pascal L., reported by John Salvatier) on their value any more. (Pascal L., reported by John Salvatier)
 * Fix crash on GPU when the GpuSubtensor didn't set the right stride
      when the result tensor had a dimension of size 1. (Pascal L.,
      reported by Graham T.)
============= =============
Release Notes Release Notes
......
import copy
import os import os
import StringIO import StringIO
import theano
from theano import Apply from theano import Apply
from theano import tensor from theano import tensor
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
...@@ -613,7 +615,8 @@ class GpuConv(GpuOp): ...@@ -613,7 +615,8 @@ class GpuConv(GpuOp):
version=-1, version=-1,
verbose=0, verbose=0,
kshp=None, kshp=None,
imshp=None): imshp=None,
max_threads_dim0=None):
""" """
:param version: each version of c_code implement many kernel for the :param version: each version of c_code implement many kernel for the
convolution. By default we try to guess the best one. convolution. By default we try to guess the best one.
...@@ -629,6 +632,10 @@ class GpuConv(GpuOp): ...@@ -629,6 +632,10 @@ class GpuConv(GpuOp):
:param imshp: The size of the image. Not used for code generation but :param imshp: The size of the image. Not used for code generation but
allow to select an experimental new version in another allow to select an experimental new version in another
repo. repo.
:param max_threads_dim0: The maximum number of thread for the
block size dimensions 0 (blockDim.x) used by the
GPU function.
""" """
self.border_mode = border_mode self.border_mode = border_mode
self.subsample = subsample self.subsample = subsample
...@@ -651,6 +658,7 @@ class GpuConv(GpuOp): ...@@ -651,6 +658,7 @@ class GpuConv(GpuOp):
self.verbose = verbose self.verbose = verbose
self.kshp = kshp self.kshp = kshp
self.imshp = imshp self.imshp = imshp
self.max_threads_dim0 = max_threads_dim0
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) \ return type(self) == type(other) \
...@@ -662,7 +670,8 @@ class GpuConv(GpuOp): ...@@ -662,7 +670,8 @@ class GpuConv(GpuOp):
and self.version == other.version \ and self.version == other.version \
and self.verbose == other.verbose \ and self.verbose == other.verbose \
and self.kshp == other.kshp\ and self.kshp == other.kshp\
and self.imshp == other.imshp and self.imshp == other.imshp\
and self.max_threads_dim0 == other.max_threads_dim0
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -681,7 +690,8 @@ class GpuConv(GpuOp): ...@@ -681,7 +690,8 @@ class GpuConv(GpuOp):
^ self.version \ ^ self.version \
^ hash(self.verbose) \ ^ hash(self.verbose) \
^ hash(self.kshp)\ ^ hash(self.kshp)\
^ hash(self.imshp) ^ hash(self.imshp)\
^ hash(self.max_threads_dim0)
def __str__(self): def __str__(self):
return '%s{%s, %s, %s, %s, %s, %s, %s}' % ( return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
...@@ -704,6 +714,25 @@ class GpuConv(GpuOp): ...@@ -704,6 +714,25 @@ class GpuConv(GpuOp):
False, False] False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()]) return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
    """Build the thunk for this apply node, filling in
    ``max_threads_dim0`` from the device properties when the op was
    constructed without one.

    Interface matches ``Op.make_thunk``: returns the callable produced
    by the parent class.
    """
    # Work on a copy of the node so the graph the user holds is not
    # mutated as a side effect of compiling this apply.
    node_ = copy.copy(node)
    assert node.op is node_.op
    if node_.op.max_threads_dim0 is None:
        # BUG FIX: the original code built ``op = copy.copy(node_.op)``
        # but never used it, then mutated the *shared* op in place --
        # defeating the purpose of the copy. Rebind the copied op on
        # the copied node before mutating it.
        node_.op = copy.copy(node_.op)
        # ``device_number`` looks like "gpuN"; strip the "gpu" prefix.
        # An empty remainder means the default device, i.e. device 0.
        device_id = theano.sandbox.cuda.use.device_number[3:]
        if device_id == '':
            device_id = 0
        cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
        prop = cuda_ndarray.device_properties(device_id)
        # Query the real per-block thread limit instead of assuming 512.
        node_.op.max_threads_dim0 = prop['maxThreadsDim0']
    return super(GpuConv, node_.op).make_thunk(node_, storage_map,
                                               compute_map, no_recycling)
def __setstate__(self, d):
    """Restore pickled state, back-filling ``max_threads_dim0``.

    Pickles created before this attribute existed lack it; default it
    to None so it gets filled in later from the device properties.
    """
    self.__dict__.update(d)
    # Equivalent to the hasattr-guarded assignment: only adds the key
    # when it is absent after restoring the pickled dict.
    self.__dict__.setdefault("max_threads_dim0", None)
def c_compile_args(self): def c_compile_args(self):
nb = 0 nb = 0
if self.kshp is not None: if self.kshp is not None:
...@@ -715,7 +744,7 @@ class GpuConv(GpuOp): ...@@ -715,7 +744,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 18) return (0, 19)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -734,6 +763,7 @@ class GpuConv(GpuOp): ...@@ -734,6 +763,7 @@ class GpuConv(GpuOp):
version = self.version version = self.version
verbose = self.verbose verbose = self.verbose
sub = sub.copy() sub = sub.copy()
max_threads_dim0 = self.max_threads_dim0
sub.update(locals()) sub.update(locals())
return """ return """
//Mandatory args //Mandatory args
...@@ -764,7 +794,8 @@ class GpuConv(GpuOp): ...@@ -764,7 +794,8 @@ class GpuConv(GpuOp):
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
%(out)s, mode, %(out)s, mode,
dx, dy, dx, dy,
version, verbose); version, verbose,
%(max_threads_dim0)s);
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = out2; %(out)s = out2;
""" % sub """ % sub
......
...@@ -10,7 +10,9 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * ...@@ -10,7 +10,9 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray *
int int
CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, int subsample_cols, CudaNdarray * out, int subsample_rows, int subsample_cols,
int version = -1, int verbose=0) int version = -1, int verbose=0,
int max_threads_dim0 = 512
)
{ {
int work_complete = 0; int work_complete = 0;
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file. const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
...@@ -149,7 +151,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -149,7 +151,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//condition for exec //condition for exec
if(!subsample && if(!subsample &&
out_contiguous && out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce! std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete) !work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed. version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
...@@ -157,7 +159,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -157,7 +159,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && c_contiguous && if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) && (version==0||version==2||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block out_wid<=max_threads_dim0 &&//Maximum of X threads for block.x
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch !work_complete) //conv_patch
...@@ -165,7 +167,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -165,7 +167,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.) int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing. if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++; while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0)
nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch, nkern); dim3 grid(nbatch, nkern);
...@@ -208,10 +211,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -208,10 +211,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (out_contiguous && if (out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) && (version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<512) &&//Maximum of 512 theads by block (version!=1 || out_size<=max_threads_dim0) &&//Maximum of X threads by block.x
out_wid<512 &&//Maximum of 512 theads by block out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch_stack !work_complete) //conv_patch_stack
{ {
...@@ -222,7 +226,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -222,7 +226,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.) int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing. if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++; while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail; bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
...@@ -291,7 +295,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -291,7 +295,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
" kern_flipped=true, accumulate=false, kern_width=%i," " kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i," " img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i," " kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,", " preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n", " subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y, threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y, shared_size, threads.x * threads.y,
...@@ -333,7 +337,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -333,7 +337,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==4||version==-1) && (version==4||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows !work_complete) //conv_rows
...@@ -386,17 +390,16 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -386,17 +390,16 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==5||version==-1) && (version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_rows_stack !work_complete) //conv_rows_stack
{ {
int nb_row=1; int nb_row=1;
int max_threads=512;
//TODO:if not c_contiguous, lower max_thread as we use 22 //TODO:if not c_contiguous, lower max_thread as we use 22
//registers by thread and we won't execute 2 block in one MP. //registers by thread and we won't execute 2 block in one MP.
for(int i=2;i<=out_len;i++){ for(int i=2;i<=out_len;i++){
if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail) if((i)*out_wid<=max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i; nb_row=i;
} }
...@@ -468,7 +471,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -468,7 +471,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) && (version==9||version==10||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory (img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory (version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2 !work_complete) //conv_rows_stack2
...@@ -477,7 +480,6 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -477,7 +480,6 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
// version 9:we preload the full kernel // version 9:we preload the full kernel
// version 10: load only a few row at a time. // version 10: load only a few row at a time.
int nb_row=1; int nb_row=1;
int max_threads=512;
int version_back = version; int version_back = version;
//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP. //TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP.
if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
...@@ -489,7 +491,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -489,7 +491,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
k_size=kern_wid; k_size=kern_wid;
for(int i=2;i<=out_len;i++){ for(int i=2;i<=out_len;i++){
if(i*out_wid<max_threads && (i*img_wid + k_size)*sizeof(float)<shared_avail) if(i*out_wid<=max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i; nb_row=i;
} }
...@@ -568,7 +570,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -568,7 +570,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//version 13 load only 1 kernel row at a time. //version 13 load only 1 kernel row at a time.
if (!subsample && if (!subsample &&
out_contiguous && out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
(version==7||version==8||version==13||version==-1) && (version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split. (version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
//version 13 need a minimal kernel length as big as the split. //version 13 need a minimal kernel length as big as the split.
...@@ -598,7 +600,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -598,7 +600,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
// constraint (b) // constraint (b)
// (TODO: read the number of threads per block from the device) // (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++; while(out_size*ceil_intdiv(kern_len,nb_split)>max_threads_dim0)
nb_split++;
// tentative estimates (prior to contraint c) // tentative estimates (prior to contraint c)
int thread_z=ceil_intdiv(kern_len,nb_split); int thread_z=ceil_intdiv(kern_len,nb_split);
...@@ -881,7 +884,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -881,7 +884,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, CudaNdarray * out, int subsample_rows,
int subsample_cols, int version = -1, int verbose=0) int subsample_cols, int version = -1, int verbose=0,
int max_threads_dim0=512)
{ {
//144 is the biggest static shared size used with compiling this file. //144 is the biggest static shared size used with compiling this file.
const int shared_avail = SHARED_SIZE - 150; const int shared_avail = SHARED_SIZE - 150;
...@@ -1012,7 +1016,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1012,7 +1016,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && if (!subsample &&
out_contiguous && out_contiguous &&
(version==3||version==4||version==5||version==-1) && (version==3||version==4||version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack_padded !work_complete) //conv_full_patch_stack_padded
{ {
...@@ -1040,7 +1044,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1040,7 +1044,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
//for the limit to bu lower then 512. //for the limit to bu lower then 512.
int max_thread = (version!=5?327:450); int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++; while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4; if(version==-1 && out_size>max_threads_dim0)version=4;
if(version==-1)version=3; if(version==-1)version=3;
...@@ -1132,7 +1136,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1132,7 +1136,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && c_contiguous && if (!subsample && c_contiguous &&
(version==0||version==-1) && (version==0||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch !work_complete) //conv_full_patch
...@@ -1174,7 +1178,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1174,7 +1178,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
} }
if (false && !subsample && //disabled as test fail for this kernel if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) && (version==1||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing. (nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything !work_complete) //conv_full_load_everything
...@@ -1234,7 +1238,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1234,7 +1238,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
img_batch_stack_contiguous && img_batch_stack_contiguous &&
out_contiguous && out_contiguous &&
(version==2||version==-1) && (version==2||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block out_size<=max_threads_dim0 &&//Maximum of X threads by block
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack !work_complete) //conv_full_patch_stack
{ {
...@@ -1391,7 +1395,9 @@ PyObject * ...@@ -1391,7 +1395,9 @@ PyObject *
CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
CudaNdarray * out, const int mode, CudaNdarray * out, const int mode,
const int subsample_rows, const int subsample_cols, const int subsample_rows, const int subsample_cols,
const int version, const int verbose) const int version, const int verbose,
const int max_threads_dim0 = 512
)
{ {
// Re-use the out object if possible. If the out object it not used, then its refcount is not modified. // Re-use the out object if possible. If the out object it not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1. // If the out object is re-used then it is returned, and its refcount is incremented by 1.
...@@ -1456,8 +1462,16 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1456,8 +1462,16 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
//rval might be null //rval might be null
} }
if ((rval==NULL) if ((rval==NULL)
|| ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval, subsample_rows, subsample_cols, version, verbose)) || ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval,
|| ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval, subsample_rows, subsample_cols, version, verbose)) subsample_rows,
subsample_cols,
version, verbose,
max_threads_dim0))
|| ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval,
subsample_rows,
subsample_cols,
version, verbose,
max_threads_dim0))
) )
{ {
// if rval is something we just allocated, // if rval is something we just allocated,
......
...@@ -31,6 +31,16 @@ else: ...@@ -31,6 +31,16 @@ else:
cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4) cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4)
# Determine which GPU the tests run on, so the thread-per-block checks
# below can use the card's real limit instead of a hard-coded 512.
device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
    # No device initialized yet: force initialization by allocating a
    # small shared variable, then re-read the device number.
    cuda_ndarray.shared_constructor(numpy.zeros(2, dtype='float32'))
    device_id = theano.sandbox.cuda.use.device_number
# device_number appears to be a string like "gpuN"; strip the "gpu"
# prefix -- TODO confirm against theano.sandbox.cuda.use.
device_id = device_id[3:]
if device_id == '':
    # A bare "gpu" means the default device, i.e. device 0.
    device_id = 0
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
# Dict of CUDA device properties (e.g. 'maxThreadsDim0') for this device.
device_prop = cuda_ndarray.device_properties(device_id)
def py_conv_valid_numpy(img, kern): def py_conv_valid_numpy(img, kern):
assert img.shape[1] == kern.shape[1] assert img.shape[1] == kern.shape[1]
...@@ -386,7 +396,7 @@ def test_valid_0_2(): ...@@ -386,7 +396,7 @@ def test_valid_0_2():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1])) numpy.asarray([1, 1]))
if oshape[3] > 512: if oshape[3] > device_prop['maxThreadsDim0']:
continue continue
if ishape[1] > 1: if ishape[1] > 1:
continue continue
...@@ -417,7 +427,7 @@ def test_valid_1_3_11_12(): ...@@ -417,7 +427,7 @@ def test_valid_1_3_11_12():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1])) numpy.asarray([1, 1]))
if oshape[3] > 512: if oshape[3] > device_prop['maxThreadsDim0']:
continue continue
if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 > if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 >
(16 * 1024 - 150)): (16 * 1024 - 150)):
...@@ -446,7 +456,7 @@ def test_valid_4(): ...@@ -446,7 +456,7 @@ def test_valid_4():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1])) numpy.asarray([1, 1]))
if oshape[3] > 512: if oshape[3] > device_prop['maxThreadsDim0']:
continue continue
if ishape[1] > 1: if ishape[1] > 1:
continue continue
...@@ -478,7 +488,7 @@ def test_valid_5(): ...@@ -478,7 +488,7 @@ def test_valid_5():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1])) numpy.asarray([1, 1]))
if oshape[3] > 512: if oshape[3] > device_prop['maxThreadsDim0']:
continue continue
if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) > if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) >
(16 * 1024 - 150)): (16 * 1024 - 150)):
...@@ -512,7 +522,7 @@ def test_valid_7_8_13(): ...@@ -512,7 +522,7 @@ def test_valid_7_8_13():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1])) numpy.asarray([1, 1]))
if oshape[2] * oshape[3] > 512: if oshape[2] * oshape[3] > device_prop['maxThreadsDim0']:
continue continue
if max(numpy.prod(ishape[2:]) * 4 + 2 * kshape[3] * 4, if max(numpy.prod(ishape[2:]) * 4 + 2 * kshape[3] * 4,
oshape[2] * oshape[3] * 4 * 2) > (16 * 1024 - 150): oshape[2] * oshape[3] * 4 * 2) > (16 * 1024 - 150):
...@@ -543,7 +553,7 @@ def test_valid_9_10(): ...@@ -543,7 +553,7 @@ def test_valid_9_10():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1])) numpy.asarray([1, 1]))
if oshape[3] > 512: if oshape[3] > device_prop['maxThreadsDim0']:
continue continue
if (kshape[3] * 4 + ishape[3]) > (16 * 1024 - 150): if (kshape[3] * 4 + ishape[3]) > (16 * 1024 - 150):
continue continue
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论