提交 a725adf3，作者：f0k

Refactored GpuCorrMM to be split into separate ops for the forward pass and the two backward passes

上级 e76a29d9
...@@ -25,7 +25,8 @@ from theano.sandbox.cuda.basic_ops import ( ...@@ -25,7 +25,8 @@ from theano.sandbox.cuda.basic_ops import (
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape) GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape)
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, GpuCorrMM) gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace
...@@ -1354,19 +1355,23 @@ def local_conv_gemm(node): ...@@ -1354,19 +1355,23 @@ def local_conv_gemm(node):
border_mode = node.op.border_mode border_mode = node.op.border_mode
subsample = node.op.subsample subsample = node.op.subsample
pad = (0,0) pad = (0,0)
if (border_mode == 'full') and ((subsample != (1,1)) or (pad != (0,0))): if (border_mode == 'full') and (subsample != (1,1)):
# need to simulate this via a padded valid convolution # need to simulate this via a padded valid convolution
pad = 'auto' pad = 'auto'
border_mode = 'valid' border_mode = 'valid'
if (border_mode == 'valid'): if (border_mode == 'valid'):
# need to flip the kernel for valid convolution # need to flip the kernel for valid convolution
kern = gpu_contiguous(kern[:, :, ::-1, ::-1]) kern = kern[:, :, ::-1, ::-1]
# call GpuCorrMM
# TODO: call GpuCorrMM_gradWeights instead if appropriate
return [GpuCorrMM('valid', subsample, pad)(
gpu_contiguous(img), gpu_contiguous(kern))]
elif (border_mode == 'full'): elif (border_mode == 'full'):
# need to bring kernel into correct memory layout for full convolution # need to dimshuffle the kernel for full convolution
kern = gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)).dimshuffle(1, 0, 2, 3) kern = kern.dimshuffle(1, 0, 2, 3)
# need C-contiguous inputs # call GpuCorrMM_gradInputs
img = gpu_contiguous(img) return [GpuCorrMM_gradInputs('valid', subsample, pad)(
return [GpuCorrMM(border_mode, subsample, pad)(img, kern)] gpu_contiguous(kern), gpu_contiguous(img))]
gpu_optimizer.register("conv_gemm", local_conv_gemm) gpu_optimizer.register("conv_gemm", local_conv_gemm)
......
...@@ -186,7 +186,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -186,7 +186,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
f = theano.function([i, k], op, mode=theano_mode) f = theano.function([i, k], op, mode=theano_mode)
if cls is not None: if cls is not None:
assert any([isinstance(node.op, cls) assert any([isinstance(node.op, cls)
for node in f.maker.fgraph.toposort()]), f.maker.fgraph.toposort() for node in f.maker.fgraph.toposort()]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
gpuval = f(img, kern) gpuval = f(img, kern)
t2 = time.time() t2 = time.time()
for i in range(nb_iter): for i in range(nb_iter):
...@@ -284,7 +284,7 @@ def exec_conv(version, shapes, verbose, random, mode, ...@@ -284,7 +284,7 @@ def exec_conv(version, shapes, verbose, random, mode,
cls=cls) cls=cls)
except Exception, e: except Exception, e:
print ver, id, (ishape, kshape, subshape, istride, kstride) print ver, id, (ishape, kshape, subshape, istride, kstride)
print e print "Exception", type(e), e
pass pass
if not ret: if not ret:
failed_version.add(ver) failed_version.add(ver)
...@@ -634,7 +634,7 @@ def test_valid(conv_gemm=False): ...@@ -634,7 +634,7 @@ def test_valid(conv_gemm=False):
if conv_gemm: if conv_gemm:
# Test the GpuCorrMM version # Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm") mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough # dummy version; not used by GpuCorrMM so one version is enough
version = [-1] version = [-1]
# Add tests with strided inputs by still square images and filters. # Add tests with strided inputs by still square images and filters.
...@@ -713,7 +713,7 @@ def test_full(conv_gemm=False): ...@@ -713,7 +713,7 @@ def test_full(conv_gemm=False):
if conv_gemm: if conv_gemm:
# Test the GpuCorrMM version # Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm") mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough # dummy version; not used by GpuCorrMM so one version is enough
version = [-1] version = [-1]
else: else:
...@@ -753,7 +753,7 @@ def test_subsample(conv_gemm=False): ...@@ -753,7 +753,7 @@ def test_subsample(conv_gemm=False):
if conv_gemm: if conv_gemm:
# Test the GpuCorrMM version # Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm") mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough # dummy version; not used by GpuCorrMM so one version is enough
version_valid = version_full = [-1] version_valid = version_full = [-1]
else: else:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请注册或者登录后发表评论