提交 0ad57e82 作者: Arnaud Bergeron

Add support for subsampling with GpuDnnConv, and test both it and its gradient.

上级 211ee289
...@@ -12,6 +12,7 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, ...@@ -12,6 +12,7 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
from theano.sandbox.cuda.blas import GpuConv from theano.sandbox.cuda.blas import GpuConv
from theano.compat import PY3 from theano.compat import PY3
class DnnBase(GpuOp): class DnnBase(GpuOp):
""" """
Creates a handle for cudnn and pulls in the cudnn libraries and headers. Creates a handle for cudnn and pulls in the cudnn libraries and headers.
...@@ -46,11 +47,12 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -46,11 +47,12 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
class GpuDnnConvBase(DnnBase): class GpuDnnConvBase(DnnBase):
__props__ = ('border_mode', 'conv_mode') __props__ = ('border_mode', 'subsample', 'conv_mode')
def __init__(self, border_mode, conv_mode='conv'): def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
assert border_mode in ('valid', 'full') assert border_mode in ('valid', 'full')
self.border_mode = border_mode self.border_mode = border_mode
self.subsample = subsample
assert conv_mode in ('conv', 'cross') assert conv_mode in ('conv', 'cross')
self.conv_mode = conv_mode self.conv_mode = conv_mode
...@@ -58,6 +60,8 @@ class GpuDnnConvBase(DnnBase): ...@@ -58,6 +60,8 @@ class GpuDnnConvBase(DnnBase):
self.__dict__.update(props) self.__dict__.update(props)
if not hasattr(self, 'conv_mode'): if not hasattr(self, 'conv_mode'):
self.conv_mode = 'conv' self.conv_mode = 'conv'
if not hasattr(self, 'subsample'):
self.subsample = (1, 1)
def make_node(self, img, kern): def make_node(self, img, kern):
if img.type.ndim != 4: if img.type.ndim != 4:
...@@ -209,7 +213,7 @@ err%(name)s = cudnnSetConvolutionDescriptor( ...@@ -209,7 +213,7 @@ err%(name)s = cudnnSetConvolutionDescriptor(
op%(id)d, param0_%(id)d, param1_%(id)d, op%(id)d, param0_%(id)d, param1_%(id)d,
pad_h%(name)s, pad_h%(name)s,
pad_w%(name)s, pad_w%(name)s,
1, 1, 1, 1, %(subsx)d, %(subsy)d, 1, 1,
%(conv_flag)s %(conv_flag)s
); );
if (err%(name)s != CUDNN_STATUS_SUCCESS) { if (err%(name)s != CUDNN_STATUS_SUCCESS) {
...@@ -252,10 +256,11 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) { ...@@ -252,10 +256,11 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
""" % dict(param0=param0, param1=param1, out=out, bmode=bmode, """ % dict(param0=param0, param1=param1, out=out, bmode=bmode,
conv_flag=conv_flag, fail=sub['fail'], id=sub['struct_id'], conv_flag=conv_flag, fail=sub['fail'], id=sub['struct_id'],
name=name, checks='\n'.join(checks), sets='\n'.join(sets), name=name, checks='\n'.join(checks), sets='\n'.join(sets),
subsx=self.subsample[0], subsy=self.subsample[1],
set_out=set_out, method=self.conv_op, path=self.path_flag) set_out=set_out, method=self.conv_op, path=self.path_flag)
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (6,)
class GpuDnnConv(GpuDnnConvBase): class GpuDnnConv(GpuDnnConvBase):
...@@ -291,12 +296,12 @@ from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous, ...@@ -291,12 +296,12 @@ from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_dnn(node): def local_conv_dnn(node):
if isinstance(node.op, GpuConv): if isinstance(node.op, GpuConv):
if (node.op.subsample != (1, 1) or if node.op.border_mode not in ['full', 'valid']:
node.op.border_mode not in ['full', 'valid']):
return return
img, kern = node.inputs img, kern = node.inputs
border_mode = node.op.border_mode border_mode = node.op.border_mode
return [GpuDnnConv(border_mode)(gpu_contiguous(img), subsample = node.op.subsample
gpu_contiguous(kern))] return [GpuDnnConv(border_mode, subsample)(gpu_contiguous(img),
gpu_contiguous(kern))]
gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn') gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
...@@ -26,7 +26,7 @@ from theano.sandbox import cuda ...@@ -26,7 +26,7 @@ from theano.sandbox import cuda
if cuda.cuda_available == False: if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.dnn import GpuDnnConv from theano.sandbox.cuda.dnn import GpuDnnConv, GpuDnnConvBase
#needed as the gpu conv don't have a perform implementation. #needed as the gpu conv don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
...@@ -662,7 +662,7 @@ def test_dnn_full(): ...@@ -662,7 +662,7 @@ def test_dnn_full():
yield t yield t
def test_subsample(conv_gemm=False): def _test_subsample(cls, mode, version_valid=[-1], version_full=[-1]):
seed_rng() seed_rng()
shapes = [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)), shapes = [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)),
((1, 1, 1, 1), (1, 1, 1, 1), (2, 2), (1, 1), (1, 1)), ((1, 1, 1, 1), (1, 1, 1, 1), (2, 2), (1, 1), (1, 1)),
...@@ -677,8 +677,6 @@ def test_subsample(conv_gemm=False): ...@@ -677,8 +677,6 @@ def test_subsample(conv_gemm=False):
# We put only the version that implement the subsample to make the # We put only the version that implement the subsample to make the
# test faster. # test faster.
version_valid = [-2, -1, 1, 3, 11, 12]
version_full = [-2, -1]
verbose = 0 verbose = 0
random = True random = True
print_ = False print_ = False
...@@ -686,16 +684,6 @@ def test_subsample(conv_gemm=False): ...@@ -686,16 +684,6 @@ def test_subsample(conv_gemm=False):
if ones: if ones:
random = False random = False
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version_valid = version_full = [-1]
else:
mode = theano_mode
cls = None
for t in exec_conv(version_valid, shapes, verbose, random, 'valid', for t in exec_conv(version_valid, shapes, verbose, random, 'valid',
print_=print_, ones=ones, print_=print_, ones=ones,
theano_mode=mode, cls=cls): theano_mode=mode, cls=cls):
...@@ -706,8 +694,21 @@ def test_subsample(conv_gemm=False): ...@@ -706,8 +694,21 @@ def test_subsample(conv_gemm=False):
yield t yield t
def test_subsample():
for t in _test_subsample(None, theano_mode,
version_valid=[-2, -1, 1, 3, 11, 12],
version_full=[-2, -1]):
yield t
def test_gemm_subsample(): def test_gemm_subsample():
for t in test_subsample(conv_gemm=True): for t in _test_subsample(cuda.blas.BaseGpuCorrMM,
theano_mode.including("conv_gemm")):
yield t
def test_dnn_subsample():
for t in _test_subsample(GpuDnnConv, theano_mode.including('cudnn')):
yield t yield t
...@@ -782,58 +783,122 @@ class TestConv2DGPU(unittest.TestCase): ...@@ -782,58 +783,122 @@ class TestConv2DGPU(unittest.TestCase):
theano_mode = theano_mode_orig theano_mode = theano_mode_orig
def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
direction):
ishape = (bs, ch, rImg1, rImg2)
kshape = (nf, ch, rFlt1, rFlt2)
subsample = (subsx, subsy)
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4()
k = cuda_tensor4()
if direction == 'fprop':
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(npy_img, npy_kern[:,:,::-1,::-1])
elif direction == 'bprop img':
cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(
border_mode='valid', subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img)
elif direction == 'bprop kern':
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(
border_mode='valid', subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = numpy.array(f(
npy_img.transpose(1, 0, 2, 3),
npy_kern.transpose(1, 0, 2, 3)[:,:,::-1,::-1])).transpose(
1, 0, 2, 3)
assert_allclose(cpuval, gpuval, rtol=1e-4)
def test_gemm_directly(): def test_gemm_directly():
for direction in ['fprop', 'bprop img', 'bprop kern']: for bs in range(1, 5):
print 'Testing direction: ' + direction for ch in range(1,4):
for bs in range(1, 5): for nf in range(1,4):
for ch in range(1,4): for rImg1 in range(5, 9):
for nf in range(1,4): for rImg2 in range(5, 9):
for rImg1 in range(5, 9): for rFlt1 in range(2, 4):
for rImg2 in range(5, 9): for rFlt2 in range(2, 4):
for rFlt1 in range(2, 4): for direction in ['bprop img', 'bprop kern']:
for rFlt2 in range(2, 4): yield (gemm_directly, bs, ch, nf, rImg1,
for subsx in range(1, 3) if direction == 'fprop' else [1]: rImg2, rFlt1, rFlt2, 1, 1,
for subsy in range(1, 3) if direction == 'fprop' else [1]: direction)
ishape = (bs, ch, rImg1, rImg2)
kshape = (nf, ch, rFlt1, rFlt2) for subsx in range(1, 3):
subsample = (subsx, subsy) for subsy in range(1, 3):
yield (gemm_directly, bs, ch, nf,
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32') rImg1, rImg2, rFlt1, rFlt2,
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32') subsx, subsy, 'fprop')
i = cuda_tensor4()
k = cuda_tensor4() def gemm_op(mode, subsample):
pad = 'full' if mode == 'full' else (0, 0)
if direction == 'fprop': return theano.sandbox.cuda.blas.GpuCorrMM('valid', subsample, pad)
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
subsample=subsample)(i, k) def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy, op):
f = theano.function([i, k], op, mode=theano_mode) ishape = (bs, ch, rImg1, rImg2)
gpuval = f(npy_img, npy_kern[:,:,::-1,::-1]) kshape = (nf, ch, rFlt1, rFlt2)
elif direction == 'bprop img': subsample = (subsx, subsy)
cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(border_mode='valid', npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
subsample=subsample)(i, k) npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img) i = cuda_tensor4()
elif direction == 'bprop kern': k = cuda_tensor4()
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(border_mode='valid', # TODO: also test custom pad values
subsample=subsample)(i, k) corr_op = op(mode, subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode) # try to compile reference implementation without shape,
gpuval = numpy.array(f(npy_img.transpose(1, 0, 2, 3), # so we don't have to compile hundreds of versions
npy_kern.transpose(1, 0, 2, 3)[:,:,::-1,::-1])).transpose(1, 0, 2, 3) conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
border_mode=mode, subsample=subsample)
if not numpy.allclose(cpuval, gpuval, rtol=1e-4): try:
print "Test failed for" conv_op_di = theano.grad(conv_op.sum(), i)
print "direction: ", direction conv_op_dk = theano.grad(conv_op.sum(), k)
print "ishape: ", ishape except Exception:
print "kshape: ", kshape # compile with shape information only when needed
print "subsample: ", subsample conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
assert False ishape, kshape, mode, subsample)
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
def test_gemm_grads(): corr_op_di = theano.grad(corr_op.sum(), i)
corr_op_dk = theano.grad(corr_op.sum(), k)
outputs = [corr_op, conv_op,
corr_op_di, conv_op_di,
corr_op_dk, conv_op_dk]
try:
conv_op_dik = theano.grad(conv_op_di.sum(), k)
conv_op_dki = theano.grad(conv_op_dk.sum(), i)
except Exception:
# skip if the reference implementation can't do it
return
corr_op_dik = theano.grad(corr_op_di.sum(), k)
corr_op_dki = theano.grad(corr_op_dk.sum(), i)
outputs.extend([corr_op_dik, conv_op_dik,
corr_op_dki, conv_op_dki])
f = theano.function([i, k], outputs, mode=theano_mode)
allvals = f(npy_img, npy_kern)
for a, b, p in zip(allvals[::2], allvals[1::2],
('top', 'dtop/dbottom', 'dtop/dweight',
'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
assert_allclose(a, b, rtol=1e-4)
def test_conv_grads():
for mode in 'valid', 'full': for mode in 'valid', 'full':
for bs in [1, 5]: for bs in [1, 5]:
for ch in [4]: for ch in [4]:
...@@ -842,68 +907,16 @@ def test_gemm_grads(): ...@@ -842,68 +907,16 @@ def test_gemm_grads():
for rImg2 in [2, 8]: for rImg2 in [2, 8]:
for rFlt1 in [1, 2]: for rFlt1 in [1, 2]:
for rFlt2 in [1, 2]: for rFlt2 in [1, 2]:
for subsx in [1, 2]: for op in [gemm_op, GpuDnnConv]:
for subsy in [1, 2] if subsx == 1 else [2]: yield (conv_grad, mode, bs, ch, nf,
ishape = (bs, ch, rImg1, rImg2) rImg1, rImg2, rFlt1, rFlt2,
kshape = (nf, ch, rFlt1, rFlt2) 1, 1, op)
subsample = (subsx, subsy) yield (conv_grad, mode, bs, ch, nf,
rImg1, rImg2, rFlt1, rFlt2,
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32') 1, 2, op)
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32') yield (conv_grad, mode, bs, ch, nf,
rImg1, rImg2, rFlt1, rFlt2,
i = cuda_tensor4() 2, 2, op)
k = cuda_tensor4()
pad = 'full' if mode == 'full' else (0, 0)
# TODO: also test custom pad values
corr_op = theano.sandbox.cuda.blas.GpuCorrMM(
'valid', subsample, pad)(i, k)
# try to compile reference implementation without shape,
# so we don't have to compile hundreds of versions
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
border_mode=mode, subsample=subsample)
try:
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
except Exception:
# compile with shape information only when needed
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
ishape, kshape, mode, subsample)
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
corr_op_di = theano.grad(corr_op.sum(), i)
corr_op_dk = theano.grad(corr_op.sum(), k)
outputs = [corr_op, conv_op,
corr_op_di, conv_op_di,
corr_op_dk, conv_op_dk]
try:
conv_op_dik = theano.grad(conv_op_di.sum(), k)
conv_op_dki = theano.grad(conv_op_dk.sum(), i)
except Exception:
# skip if the reference implementation can't do it
print ".",
else:
corr_op_dik = theano.grad(corr_op_di.sum(), k)
corr_op_dki = theano.grad(corr_op_dk.sum(), i)
outputs.extend([corr_op_dik, conv_op_dik,
corr_op_dki, conv_op_dki])
print ":",
f = theano.function([i, k], outputs, mode=theano_mode)
allvals = f(npy_img, npy_kern)
for a, b, p in zip(allvals[::2], allvals[1::2],
('top', 'dtop/dbottom', 'dtop/dweight',
'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
if (a.shape != b.shape) or not numpy.allclose(a, b, rtol=1e-4):
print "Test failed for", p
print "mode: ", mode
print "ishape: ", ishape
print "kshape: ", kshape
print "subsample: ", subsample
assert False
sys.stdout.flush()
def benchmark(): def benchmark():
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论