提交 578f4836 authored 作者: lamblin's avatar lamblin

Merge pull request #610 from nouiz/gpu_conv_faster

Gpu conv faster
......@@ -39,6 +39,12 @@ Interface changes
the provided value have. In the past, the error was at run time.
(Frederic B.)
Speed up
* Convolution on the GPU now checks the generation of the card to make
it faster in some cases (especially medium/big output images) (Frédéric B.)
(We hardcoded 512 as the maximum number of threads per block. Newer cards
support up to 1024 threads per block.)
New Features
* debugprint new param ids=["CHAR", "id", "int", ""]
This makes the identifier printed to be the python id, a unique char, a
......@@ -120,6 +126,9 @@ Crash Fix
* Work around a known issue with nvcc 4.1 on MacOS X. (Graham Taylor)
* In advanced indexing, if some inputs are constant, no need to call constant(...)
on their value any more. (Pascal L., reported by John Salvatier)
* Fix crash on GPU when the GpuSubtensor didn't put the right stride
when the result tensor had a dimension with size of 1. (Pascal L.,
reported by Graham T.)
=============
Release Notes
......
import copy
import os
import StringIO
import theano
from theano import Apply
from theano import tensor
from theano.sandbox.cuda.type import CudaNdarrayType
......@@ -613,7 +615,8 @@ class GpuConv(GpuOp):
version=-1,
verbose=0,
kshp=None,
imshp=None):
imshp=None,
max_threads_dim0=None):
"""
:param version: each version of c_code implement many kernel for the
convolution. By default we try to guess the best one.
......@@ -629,6 +632,10 @@ class GpuConv(GpuOp):
:param imshp: The size of the image. Not used for code generation but
allow to select an experimental new version in another
repo.
:param max_threads_dim0: The maximum number of thread for the
block size dimensions 0 (blockDim.x) used by the
GPU function.
"""
self.border_mode = border_mode
self.subsample = subsample
......@@ -651,6 +658,7 @@ class GpuConv(GpuOp):
self.verbose = verbose
self.kshp = kshp
self.imshp = imshp
self.max_threads_dim0 = max_threads_dim0
def __eq__(self, other):
return type(self) == type(other) \
......@@ -662,7 +670,8 @@ class GpuConv(GpuOp):
and self.version == other.version \
and self.verbose == other.verbose \
and self.kshp == other.kshp\
and self.imshp == other.imshp
and self.imshp == other.imshp\
and self.max_threads_dim0 == other.max_threads_dim0
def __setstate__(self, d):
    """Restore pickled state by merging *d* into the instance dict."""
    vars(self).update(d)
......@@ -681,7 +690,8 @@ class GpuConv(GpuOp):
^ self.version \
^ hash(self.verbose) \
^ hash(self.kshp)\
^ hash(self.imshp)
^ hash(self.imshp)\
^ hash(self.max_threads_dim0)
def __str__(self):
return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
......@@ -704,6 +714,25 @@ class GpuConv(GpuOp):
False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
    """Specialize ``max_threads_dim0`` for the active GPU before compiling.

    If this op was constructed without knowing the device limit, query it
    from the CUDA runtime and compile against a *copy* of the op, so the
    original op instance is never mutated.

    :param node: the Apply node to compile a thunk for.
    :param storage_map: passed through to the parent ``make_thunk``.
    :param compute_map: passed through to the parent ``make_thunk``.
    :param no_recycling: passed through to the parent ``make_thunk``.
    """
    node_ = copy.copy(node)
    assert node.op is node_.op
    if node_.op.max_threads_dim0 is None:
        # BUGFIX: the original code built this copy but then mutated the
        # shared node_.op (is-identical to node.op) and never used `op`.
        # max_threads_dim0 takes part in __eq__/__hash__, so mutating the
        # shared op in place changes its hash after it may already have
        # been used as a key; mutate the copy and rebind it instead.
        op = copy.copy(node_.op)
        # device_number looks like "gpuN"; strip the "gpu" prefix.
        # NOTE(review): the '' branch yields int 0 while the other branch
        # yields a string — presumably device_properties accepts both;
        # confirm against cuda_ndarray's API.
        device_id = theano.sandbox.cuda.use.device_number[3:]
        if device_id == '':
            device_id = 0
        cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
        prop = cuda_ndarray.device_properties(device_id)
        op.max_threads_dim0 = prop['maxThreadsDim0']
        node_.op = op
    return super(GpuConv, node_.op).make_thunk(node_, storage_map,
                                               compute_map, no_recycling)
def __setstate__(self, d):
    """Restore pickled state, defaulting attributes missing from old pickles.

    Pickles created before ``max_threads_dim0`` existed lack that
    attribute; give them ``None`` so later code can detect and fill it.
    """
    self.__dict__.update(d)
    self.max_threads_dim0 = getattr(self, "max_threads_dim0", None)
def c_compile_args(self):
nb = 0
if self.kshp is not None:
......@@ -715,7 +744,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self):
    """Version tag of the generated C code, used to key the compile cache.

    Raise this whenever modifying any of the support_code_files so stale
    compiled kernels are invalidated.
    """
    # The pasted diff render left both the old `return (0, 18)` and the
    # new value in place; the old return made the bump dead code.
    # 0.19: kernels now take max_threads_dim0 instead of a hardcoded 512.
    return (0, 19)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -734,6 +763,7 @@ class GpuConv(GpuOp):
version = self.version
verbose = self.verbose
sub = sub.copy()
max_threads_dim0 = self.max_threads_dim0
sub.update(locals())
return """
//Mandatory args
......@@ -764,7 +794,8 @@ class GpuConv(GpuOp):
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
%(out)s, mode,
dx, dy,
version, verbose);
version, verbose,
%(max_threads_dim0)s);
Py_XDECREF(%(out)s);
%(out)s = out2;
""" % sub
......
......@@ -31,6 +31,16 @@ else:
cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4)
device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
cuda_ndarray.shared_constructor(numpy.zeros(2, dtype='float32'))
device_id = theano.sandbox.cuda.use.device_number
device_id = device_id[3:]
if device_id == '':
device_id = 0
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
device_prop = cuda_ndarray.device_properties(device_id)
def py_conv_valid_numpy(img, kern):
assert img.shape[1] == kern.shape[1]
......@@ -386,7 +396,7 @@ def test_valid_0_2():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if ishape[1] > 1:
continue
......@@ -417,7 +427,7 @@ def test_valid_1_3_11_12():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 >
(16 * 1024 - 150)):
......@@ -446,7 +456,7 @@ def test_valid_4():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if ishape[1] > 1:
continue
......@@ -478,7 +488,7 @@ def test_valid_5():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) >
(16 * 1024 - 150)):
......@@ -512,7 +522,7 @@ def test_valid_7_8_13():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[2] * oshape[3] > 512:
if oshape[2] * oshape[3] > device_prop['maxThreadsDim0']:
continue
if max(numpy.prod(ishape[2:]) * 4 + 2 * kshape[3] * 4,
oshape[2] * oshape[3] * 4 * 2) > (16 * 1024 - 150):
......@@ -543,7 +553,7 @@ def test_valid_9_10():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
if oshape[3] > device_prop['maxThreadsDim0']:
continue
if (kshape[3] * 4 + ishape[3]) > (16 * 1024 - 150):
continue
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论