提交 c1eff5eb authored 作者: Frederic's avatar Frederic

All the boilerplate code to allow using the run-time maximum number of threads…

All the boilerplate code to allow using the run-time maximum number of threads on dimension 0 of a GPU block of threads.
上级 4d943bea
import copy
import os import os
import StringIO import StringIO
import theano
from theano import Apply from theano import Apply
from theano import tensor from theano import tensor
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
...@@ -613,9 +615,9 @@ class GpuConv(GpuOp): ...@@ -613,9 +615,9 @@ class GpuConv(GpuOp):
version=-1, version=-1,
verbose=0, verbose=0,
kshp=None, kshp=None,
imshp=None): imshp=None,
""" max_threads_dim0=None):
:param version: each version of c_code implement many kernel for the """:param version: each version of c_code implement many kernel for the
convolution. By default we try to guess the best one. convolution. By default we try to guess the best one.
You can force one version with this parameter. This You can force one version with this parameter. This
parameter is used by the tests. parameter is used by the tests.
...@@ -629,6 +631,9 @@ class GpuConv(GpuOp): ...@@ -629,6 +631,9 @@ class GpuConv(GpuOp):
:param imshp: The size of the image. Not used for code generation but :param imshp: The size of the image. Not used for code generation but
allow to select an experimental new version in another allow to select an experimental new version in another
repo. repo.
:param max_threads_dim0: maximum number of thread for each the
block size dimensions 0
""" """
self.border_mode = border_mode self.border_mode = border_mode
self.subsample = subsample self.subsample = subsample
...@@ -651,6 +656,7 @@ class GpuConv(GpuOp): ...@@ -651,6 +656,7 @@ class GpuConv(GpuOp):
self.verbose = verbose self.verbose = verbose
self.kshp = kshp self.kshp = kshp
self.imshp = imshp self.imshp = imshp
self.max_threads_dim0 = max_threads_dim0
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) \ return type(self) == type(other) \
...@@ -662,7 +668,8 @@ class GpuConv(GpuOp): ...@@ -662,7 +668,8 @@ class GpuConv(GpuOp):
and self.version == other.version \ and self.version == other.version \
and self.verbose == other.verbose \ and self.verbose == other.verbose \
and self.kshp == other.kshp\ and self.kshp == other.kshp\
and self.imshp == other.imshp and self.imshp == other.imshp\
and self.max_threads_dim0 == other.max_threads_dim0
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -681,7 +688,8 @@ class GpuConv(GpuOp): ...@@ -681,7 +688,8 @@ class GpuConv(GpuOp):
^ self.version \ ^ self.version \
^ hash(self.verbose) \ ^ hash(self.verbose) \
^ hash(self.kshp)\ ^ hash(self.kshp)\
^ hash(self.imshp) ^ hash(self.imshp)\
^ hash(self.max_threads_dim0)
def __str__(self): def __str__(self):
return '%s{%s, %s, %s, %s, %s, %s, %s}' % ( return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
...@@ -704,6 +712,24 @@ class GpuConv(GpuOp): ...@@ -704,6 +712,24 @@ class GpuConv(GpuOp):
False, False] False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()]) return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
node_ = node
if node.op.max_threads_dim0 is None:
op = copy.copy(node.op)
device_id = theano.sandbox.cuda.use.device_number[3:]
if device_id == '':
device_id = 0
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
prop = cuda_ndarray.device_properties(device_id)
node.op.max_threads_dim0 = prop['maxThreadsDim0']
return super(GpuConv, node_.op).make_thunk(node_, storage_map,
compute_map, no_recycling)
    def __setstate__(self, d):
        """Restore pickled state, defaulting ``max_threads_dim0``.

        Ops pickled before the ``max_threads_dim0`` attribute was
        introduced will not carry it in *d*; default it to ``None`` so
        the real device limit is looked up later (at make_thunk time).
        """
        self.__dict__.update(d)
        if not hasattr(self, "max_threads_dim0"):
            # Backward compatibility with pickles of older versions.
            self.max_threads_dim0 = None
def c_compile_args(self): def c_compile_args(self):
nb = 0 nb = 0
if self.kshp is not None: if self.kshp is not None:
...@@ -734,6 +760,7 @@ class GpuConv(GpuOp): ...@@ -734,6 +760,7 @@ class GpuConv(GpuOp):
version = self.version version = self.version
verbose = self.verbose verbose = self.verbose
sub = sub.copy() sub = sub.copy()
max_threads_dim0 = self.max_threads_dim0
sub.update(locals()) sub.update(locals())
return """ return """
//Mandatory args //Mandatory args
...@@ -764,7 +791,8 @@ class GpuConv(GpuOp): ...@@ -764,7 +791,8 @@ class GpuConv(GpuOp):
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
%(out)s, mode, %(out)s, mode,
dx, dy, dx, dy,
version, verbose); version, verbose,
%(max_threads_dim0)s);
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = out2; %(out)s = out2;
""" % sub """ % sub
......
...@@ -10,7 +10,9 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * ...@@ -10,7 +10,9 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray *
int int
CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, int subsample_cols, CudaNdarray * out, int subsample_rows, int subsample_cols,
int version = -1, int verbose=0) int version = -1, int verbose=0,
int max_threads_dim0 = 512
)
{ {
int work_complete = 0; int work_complete = 0;
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file. const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
...@@ -881,7 +883,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -881,7 +883,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, CudaNdarray * out, int subsample_rows,
int subsample_cols, int version = -1, int verbose=0) int subsample_cols, int version = -1, int verbose=0,
int max_threads_dim0=512)
{ {
//144 is the biggest static shared size used with compiling this file. //144 is the biggest static shared size used with compiling this file.
const int shared_avail = SHARED_SIZE - 150; const int shared_avail = SHARED_SIZE - 150;
...@@ -1391,7 +1394,9 @@ PyObject * ...@@ -1391,7 +1394,9 @@ PyObject *
CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
CudaNdarray * out, const int mode, CudaNdarray * out, const int mode,
const int subsample_rows, const int subsample_cols, const int subsample_rows, const int subsample_cols,
const int version, const int verbose) const int version, const int verbose,
const int max_threads_dim0 = 512
)
{ {
// Re-use the out object if possible. If the out object it not used, then its refcount is not modified. // Re-use the out object if possible. If the out object it not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1. // If the out object is re-used then it is returned, and its refcount is incremented by 1.
...@@ -1456,8 +1461,16 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1456,8 +1461,16 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
//rval might be null //rval might be null
} }
if ((rval==NULL) if ((rval==NULL)
|| ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval, subsample_rows, subsample_cols, version, verbose)) || ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval,
|| ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval, subsample_rows, subsample_cols, version, verbose)) subsample_rows,
subsample_cols,
version, verbose,
max_threads_dim0))
|| ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval,
subsample_rows,
subsample_cols,
version, verbose,
max_threads_dim0))
) )
{ {
// if rval is something we just allocated, // if rval is something we just allocated,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论