提交 578f4836 authored 作者: lamblin's avatar lamblin

Merge pull request #610 from nouiz/gpu_conv_faster

Gpu conv faster
......@@ -39,6 +39,12 @@ Interface changes
the provided value have. In the past, the error was at run time.
(Frederic B.)
Speed up
* Convolution on the GPU now checks the generation of the card to make
it faster in some cases (especially medium/big output image) (Frédéric B.)
(We hardcoded 512 as the maximum number of threads per block. Newer cards
support up to 1024 threads per block.)
New Features
* debugprint new param ids=["CHAR", "id", "int", ""]
This makes the identifier printed to be the python id, a unique char, a
......@@ -120,6 +126,9 @@ Crash Fix
* Work around a known issue with nvcc 4.1 on MacOS X. (Graham Taylor)
* In advanced indexing, if some inputs are constant, no need to call constant(...)
on their value any more. (Pascal L., reported by John Salvatier)
* Fix crash on GPU when the GpuSubtensor didn't put the right stride
when the result tensor had a dimension of size 1. (Pascal L.,
reported by Graham T.)
=============
Release Notes
......
import copy
import os
import StringIO
import theano
from theano import Apply
from theano import tensor
from theano.sandbox.cuda.type import CudaNdarrayType
......@@ -613,7 +615,8 @@ class GpuConv(GpuOp):
version=-1,
verbose=0,
kshp=None,
imshp=None):
imshp=None,
max_threads_dim0=None):
"""
:param version: each version of c_code implement many kernel for the
convolution. By default we try to guess the best one.
......@@ -629,6 +632,10 @@ class GpuConv(GpuOp):
:param imshp: The size of the image. Not used for code generation but
allow to select an experimental new version in another
repo.
:param max_threads_dim0: The maximum number of thread for the
block size dimensions 0 (blockDim.x) used by the
GPU function.
"""
self.border_mode = border_mode
self.subsample = subsample
......@@ -651,6 +658,7 @@ class GpuConv(GpuOp):
self.verbose = verbose
self.kshp = kshp
self.imshp = imshp
self.max_threads_dim0 = max_threads_dim0
def __eq__(self, other):
return type(self) == type(other) \
......@@ -662,7 +670,8 @@ class GpuConv(GpuOp):
and self.version == other.version \
and self.verbose == other.verbose \
and self.kshp == other.kshp\
and self.imshp == other.imshp
and self.imshp == other.imshp\
and self.max_threads_dim0 == other.max_threads_dim0
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -681,7 +690,8 @@ class GpuConv(GpuOp):
^ self.version \
^ hash(self.verbose) \
^ hash(self.kshp)\
^ hash(self.imshp)
^ hash(self.imshp)\
^ hash(self.max_threads_dim0)
def __str__(self):
return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
......@@ -704,6 +714,25 @@ class GpuConv(GpuOp):
False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
    """Build the thunk, resolving ``max_threads_dim0`` from the device.

    If ``max_threads_dim0`` was not given at construction time, query
    the active CUDA device for its ``maxThreadsDim0`` property and
    record it on a *copy* of the op.  The original Op instance is never
    mutated: Ops are shared and hashed, and ``max_threads_dim0`` takes
    part in ``__eq__``/``__hash__``, so mutating it in place would
    corrupt caches keyed on the op.
    """
    node_ = copy.copy(node)
    assert node.op is node_.op
    if node_.op.max_threads_dim0 is None:
        # Fix: the original code copied the op but then mutated
        # node_.op (which is still the shared original, since
        # copy.copy(node) is shallow).  Set the attribute on the copy
        # and swap the copy into the cloned node instead.
        op = copy.copy(node_.op)
        device_id = theano.sandbox.cuda.use.device_number[3:]
        if device_id == '':
            # No explicit device number (e.g. "gpu"): default to 0.
            device_id = 0
        cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
        prop = cuda_ndarray.device_properties(device_id)
        op.max_threads_dim0 = prop['maxThreadsDim0']
        node_.op = op
    return super(GpuConv, node_.op).make_thunk(node_, storage_map,
                                               compute_map, no_recycling)
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, "max_threads_dim0"):
self.max_threads_dim0 = None
def c_compile_args(self):
nb = 0
if self.kshp is not None:
......@@ -715,7 +744,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 18)
return (0, 19)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -734,6 +763,7 @@ class GpuConv(GpuOp):
version = self.version
verbose = self.verbose
sub = sub.copy()
max_threads_dim0 = self.max_threads_dim0
sub.update(locals())
return """
//Mandatory args
......@@ -764,7 +794,8 @@ class GpuConv(GpuOp):
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
%(out)s, mode,
dx, dy,
version, verbose);
version, verbose,
%(max_threads_dim0)s);
Py_XDECREF(%(out)s);
%(out)s = out2;
""" % sub
......
......@@ -10,7 +10,9 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray *
int
CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, int subsample_cols,
int version = -1, int verbose=0)
int version = -1, int verbose=0,
int max_threads_dim0 = 512
)
{
int work_complete = 0;
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
......@@ -149,7 +151,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
......@@ -157,7 +159,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block
out_wid<=max_threads_dim0 &&//Maximum of X threads for block.x
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch
......@@ -165,7 +167,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0)
nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch, nkern);
......@@ -208,10 +211,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
cudaGetErrorString(sts));
}
}
if (out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<512) &&//Maximum of 512 theads by block
out_wid<512 &&//Maximum of 512 theads by block
(version!=1 || out_size<=max_threads_dim0) &&//Maximum of X threads by block.x
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch_stack
{
......@@ -222,7 +226,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
......@@ -291,7 +295,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
" kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,",
" preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
......@@ -333,7 +337,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows
......@@ -386,17 +390,16 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
}
if (!subsample && out_contiguous &&
(version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_rows_stack
{
int nb_row=1;
int max_threads=512;
//TODO:if not c_contiguous, lower max_thread as we use 22
//registers by thread and we won't execute 2 block in one MP.
for(int i=2;i<=out_len;i++){
if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
if((i)*out_wid<=max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i;
}
......@@ -468,7 +471,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2
......@@ -477,7 +480,6 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
// version 9:we preload the full kernel
// version 10: load only a few row at a time.
int nb_row=1;
int max_threads=512;
int version_back = version;
//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP.
if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
......@@ -489,7 +491,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
k_size=kern_wid;
for(int i=2;i<=out_len;i++){
if(i*out_wid<max_threads && (i*img_wid + k_size)*sizeof(float)<shared_avail)
if(i*out_wid<=max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i;
}
......@@ -568,7 +570,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//version 13 load only 1 kernel row at a time.
if (!subsample &&
out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
(version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
//version 13 need a minimal kernel length as big as the split.
......@@ -598,7 +600,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
// constraint (b)
// (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++;
while(out_size*ceil_intdiv(kern_len,nb_split)>max_threads_dim0)
nb_split++;
// tentative estimates (prior to contraint c)
int thread_z=ceil_intdiv(kern_len,nb_split);
......@@ -881,7 +884,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows,
int subsample_cols, int version = -1, int verbose=0)
int subsample_cols, int version = -1, int verbose=0,
int max_threads_dim0=512)
{
//144 is the biggest static shared size used with compiling this file.
const int shared_avail = SHARED_SIZE - 150;
......@@ -1012,7 +1016,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample &&
out_contiguous &&
(version==3||version==4||version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack_padded
{
......@@ -1040,7 +1044,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
//for the limit to bu lower then 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1 && out_size>max_threads_dim0)version=4;
if(version==-1)version=3;
......@@ -1132,7 +1136,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (!subsample && c_contiguous &&
(version==0||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch
......@@ -1174,7 +1178,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
}
if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
......@@ -1234,7 +1238,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
img_batch_stack_contiguous &&
out_contiguous &&
(version==2||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
out_size<=max_threads_dim0 &&//Maximum of X threads by block
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack
{
......@@ -1391,7 +1395,9 @@ PyObject *
CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
CudaNdarray * out, const int mode,
const int subsample_rows, const int subsample_cols,
const int version, const int verbose)
const int version, const int verbose,
const int max_threads_dim0 = 512
)
{
// Re-use the out object if possible. If the out object it not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1.
......@@ -1456,8 +1462,16 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
//rval might be null
}
if ((rval==NULL)
|| ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
|| ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
|| ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval,
subsample_rows,
subsample_cols,
version, verbose,
max_threads_dim0))
|| ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval,
subsample_rows,
subsample_cols,
version, verbose,
max_threads_dim0))
)
{
// if rval is something we just allocated,
......
......@@ -31,6 +31,16 @@ else:
cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4)
device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
cuda_ndarray.shared_constructor(numpy.zeros(2, dtype='float32'))
device_id = theano.sandbox.cuda.use.device_number
device_id = device_id[3:]
if device_id == '':
device_id = 0
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
device_prop = cuda_ndarray.device_properties(device_id)
def py_conv_valid_numpy(img, kern):
assert img.shape[1] == kern.shape[1]
......@@ -386,7 +396,7 @@ def test_valid_0_2():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if ishape[1] > 1:
continue
......@@ -417,7 +427,7 @@ def test_valid_1_3_11_12():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 >
(16 * 1024 - 150)):
......@@ -446,7 +456,7 @@ def test_valid_4():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if ishape[1] > 1:
continue
......@@ -478,7 +488,7 @@ def test_valid_5():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) >
(16 * 1024 - 150)):
......@@ -512,7 +522,7 @@ def test_valid_7_8_13():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[2] * oshape[3] > 512:
if oshape[2] * oshape[3] > device_prop['maxThreadsDim0']:
continue
if max(numpy.prod(ishape[2:]) * 4 + 2 * kshape[3] * 4,
oshape[2] * oshape[3] * 4 * 2) > (16 * 1024 - 150):
......@@ -543,7 +553,7 @@ def test_valid_9_10():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if (kshape[3] * 4 + ishape[3]) > (16 * 1024 - 150):
continue
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论