提交 a725adf3 作者:f0k

Refactored GpuCorrMM to be split into separate ops for the forward pass and the two backward passes

上级 e76a29d9
......@@ -25,7 +25,8 @@ from theano.sandbox.cuda.basic_ops import (
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape)
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, GpuCorrMM)
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
......@@ -1354,19 +1355,23 @@ def local_conv_gemm(node):
border_mode = node.op.border_mode
subsample = node.op.subsample
pad = (0,0)
if (border_mode == 'full') and ((subsample != (1,1)) or (pad != (0,0))):
if (border_mode == 'full') and (subsample != (1,1)):
# need to simulate this via a padded valid convolution
pad = 'auto'
border_mode = 'valid'
if (border_mode == 'valid'):
# need to flip the kernel for valid convolution
kern = gpu_contiguous(kern[:, :, ::-1, ::-1])
kern = kern[:, :, ::-1, ::-1]
# call GpuCorrMM
# TODO: call GpuCorrMM_gradWeights instead if appropriate
return [GpuCorrMM('valid', subsample, pad)(
gpu_contiguous(img), gpu_contiguous(kern))]
elif (border_mode == 'full'):
# need to bring kernel into correct memory layout for full convolution
kern = gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)).dimshuffle(1, 0, 2, 3)
# need C-contiguous inputs
img = gpu_contiguous(img)
return [GpuCorrMM(border_mode, subsample, pad)(img, kern)]
# need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3)
# call GpuCorrMM_gradInputs
return [GpuCorrMM_gradInputs('valid', subsample, pad)(
gpu_contiguous(kern), gpu_contiguous(img))]
gpu_optimizer.register("conv_gemm", local_conv_gemm)
......
......@@ -186,7 +186,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
f = theano.function([i, k], op, mode=theano_mode)
if cls is not None:
assert any([isinstance(node.op, cls)
for node in f.maker.fgraph.toposort()]), f.maker.fgraph.toposort()
for node in f.maker.fgraph.toposort()]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
gpuval = f(img, kern)
t2 = time.time()
for i in range(nb_iter):
......@@ -284,7 +284,7 @@ def exec_conv(version, shapes, verbose, random, mode,
cls=cls)
except Exception, e:
print ver, id, (ishape, kshape, subshape, istride, kstride)
print e
print "Exception", type(e), e
pass
if not ret:
failed_version.add(ver)
......@@ -634,7 +634,7 @@ def test_valid(conv_gemm=False):
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
# Add tests with strided inputs by still square images and filters.
......@@ -713,7 +713,7 @@ def test_full(conv_gemm=False):
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
else:
......@@ -753,7 +753,7 @@ def test_subsample(conv_gemm=False):
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version_valid = version_full = [-1]
else:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论