提交 40eb1b30 authored 作者: Pascal Lamblin

Merge pull request #3678 from nouiz/conv3d

Conv3d
......@@ -33,7 +33,6 @@ from theano.tensor.nnet.abstract_conv2d import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
def dnn_available():
if dnn_available.avail is None:
if not theano.sandbox.cuda.cuda_available:
......@@ -2232,43 +2231,43 @@ if True:
70.0, 'fast_run', 'inplace', 'gpu', 'cudnn')
@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    """Fold a scalar alpha/beta scaling into a matched GpuDnnConv node.

    Fix: a badly applied diff left both the pre- and post-change lines in
    place -- the stale ``nd=4`` decorator stacked on the new one (wrapping
    the optimizer twice) and an unreachable duplicate ``return``.  Only the
    post-merge form, which reuses ``node.op`` so it works for both 2d and
    3d convolutions, is kept.
    """
    # cuDNN v1 (version() == -1) does not support merging alpha into the
    # convolution call, so bail out and leave the graph unchanged.
    if not dnn_available() or version() == -1:
        return None
    # Reuse the matched node's own op instance so op-specific state
    # (e.g. the selected algo) is preserved.
    return [node.op(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
    """Fold a scalar alpha/beta scaling into a matched GpuDnnConvGradW node.

    Fix: removed the stale pre-merge duplicates left by a badly applied
    diff (stacked ``nd=4`` decorator and an unreachable second ``return``);
    the kept form reuses ``node.op`` so it is nd-agnostic.
    """
    # Alpha merging is unsupported on cuDNN v1 (version() == -1).
    if not dnn_available() or version() == -1:
        return None
    return [node.op(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs):
    """Fold a scalar alpha/beta scaling into a matched GpuDnnConvGradI node.

    Fix: removed the stale pre-merge duplicates left by a badly applied
    diff (stacked ``nd=4`` decorator and an unreachable second ``return``);
    the kept form reuses ``node.op`` so it is nd-agnostic.
    """
    # Alpha merging is unsupported on cuDNN v1 (version() == -1).
    if not dnn_available() or version() == -1:
        return None
    return [node.op(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs):
    """Fold an output-accumulation term into a matched GpuDnnConv node.

    Fix: removed the stale pre-merge duplicates left by a badly applied
    diff (stacked ``nd=4`` decorator and an unreachable second ``return``);
    the kept form reuses ``node.op`` so it is nd-agnostic.
    """
    # cuDNN writes into the output buffer in place, so it must be
    # contiguous on the GPU.
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [node.op(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs):
    """Fold an output-accumulation term into a matched GpuDnnConvGradW node.

    Fix: removed the stale pre-merge duplicates left by a badly applied
    diff (stacked ``nd=4`` decorator and an unreachable second ``return``);
    the kept form reuses ``node.op`` so it is nd-agnostic.
    """
    # The accumulated-into output buffer must be GPU-contiguous for cuDNN.
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [node.op(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs):
    """Fold an output-accumulation term into a matched GpuDnnConvGradI node.

    Fix: removed the stale pre-merge duplicates left by a badly applied
    diff (stacked ``nd=4`` decorator and an unreachable second ``return``);
    the kept form reuses ``node.op`` so it is nd-agnostic.
    """
    # The accumulated-into output buffer must be GPU-contiguous for cuDNN.
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [node.op(*inputs)]
@register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMax])
......
......@@ -57,7 +57,7 @@ def is_equal(var, val):
return False
def alpha_merge(cls, alpha_in, beta_in, nd):
def alpha_merge(cls, alpha_in, beta_in):
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
......@@ -68,9 +68,13 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
targ = find_node(node.inputs[0], cls)
if targ is None:
targ = find_node(node.inputs[1], cls)
lr = grab_cpu_scalar(node.inputs[0], nd=nd)
if targ is None:
return
lr = grab_cpu_scalar(node.inputs[0],
nd=targ.outputs[0].ndim)
else:
lr = grab_cpu_scalar(node.inputs[1], nd=nd)
lr = grab_cpu_scalar(node.inputs[1],
nd=targ.outputs[0].ndim)
if lr is None or targ is None:
return None
inputs = list(targ.inputs)
......@@ -93,7 +97,7 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
return wrapper
def output_merge(cls, alpha_in, beta_in, out_in, nd):
def output_merge(cls, alpha_in, beta_in, out_in):
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
......
......@@ -1082,6 +1082,80 @@ def test_dnn_conv_alpha_output_merge():
utt.assert_allclose(v1, v2)
def test_dnn_conv3d_alpha_output_merge():
    """Check that the cuDNN alpha-merge and output-merge optimizations,
    applied to a 3d convolution, do not change the numerical results.

    Builds the forward convolution plus its gradients w.r.t. weights and
    input, wraps each in an alpha-scale / output-add expression, compiles
    the same graph twice -- once with the merge optimizations enabled (f1)
    and once with all six of them excluded (f2) -- and compares outputs.
    """
    # Skip on machines without a usable cuDNN installation.
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    # 5d float32 tensor type; axes are presumably
    # (batch, channel, time, height, width) given the b/c/it/ih/iw names
    # below -- TODO confirm against dnn_conv3d's contract.
    t = T.TensorType(broadcastable=(False, False, False, False, False),
                     dtype='float32')
    img = t()
    kern = t()
    out = t()
    # Shape constants: batch, input channels, filters, image t/h/w,
    # kernel t/h/w.
    b = 1
    c = 4
    f = 3
    it = 10
    ih = 5
    iw = 8
    kt = 3
    kh = 2
    kw = 6
    img_val = numpy.random.random((b, c, it, ih, iw)).astype('float32')
    kern_val = numpy.random.random((f, c, kt, kh, kw)).astype('float32')
    # Output shape uses input - kernel + 1 per spatial axis, i.e. a
    # 'valid'-style convolution output.
    out_val = numpy.random.random((b, f, it - kt + 1, ih - kh + 1,
                                   iw - kw + 1)).astype('float32')
    conv = dnn.dnn_conv3d(img, kern)
    gw = theano.grad(conv.sum(), kern)
    gi = theano.grad(conv.sum(), img)
    lr = numpy.asarray(0.05, dtype='float32')
    if cuda.dnn.version() == -1:
        # Can't merge alpha with cudnn v1: only exercise the output merge.
        fr = conv + out
        wr = kern + gw
        ir = img + gi
    else:
        # Expressions the alpha-merge and output-merge optimizers should
        # fold directly into the cuDNN conv ops.
        fr = lr * (conv + out)
        wr = kern + lr * gw
        ir = img + lr * gi
    f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
    # After optimization, the node one hop above each output (presumably a
    # host<->GPU transfer -- confirm) must be the corresponding cuDNN op,
    # i.e. the elemwise scale/add was merged away.
    assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
                      dnn.GpuDnnConv)
    assert isinstance(f1.maker.fgraph.outputs[1].owner.inputs[0].owner.op,
                      dnn.GpuDnnConvGradW)
    assert isinstance(f1.maker.fgraph.outputs[2].owner.inputs[0].owner.op,
                      dnn.GpuDnnConvGradI)
    # Reference function: same graph with every merge optimization disabled.
    mode = mode_with_gpu
    mode = mode.excluding('local_dnn_conv_alpha_merge')
    mode = mode.excluding('local_dnn_convw_alpha_merge')
    mode = mode.excluding('local_dnn_convi_alpha_merge')
    mode = mode.excluding('local_dnn_conv_output_merge')
    mode = mode.excluding('local_dnn_convw_output_merge')
    mode = mode.excluding('local_dnn_convi_output_merge')
    f2 = theano.function([img, kern, out], [fr, wr, ir], mode=mode)
    # With the merges disabled, the op feeding each output must NOT be a
    # 3d cuDNN conv op.
    assert not isinstance(f2.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
                          dnn.GpuDnnConv3d)
    assert not isinstance(f2.maker.fgraph.outputs[1].owner.inputs[0].owner.op,
                          dnn.GpuDnnConv3dGradW)
    assert not isinstance(f2.maker.fgraph.outputs[2].owner.inputs[0].owner.op,
                          dnn.GpuDnnConv3dGradI)
    # Both compiled functions must agree numerically on the same inputs.
    out_f1 = f1(img_val, kern_val, out_val)
    out_f2 = f2(img_val, kern_val, out_val)
    assert len(out_f1) == len(out_f2)
    for v1, v2 in zip(out_f1, out_f2):
        utt.assert_allclose(v1, v2)
def test_dnn_conv_merge_mouts():
# make sure it doesn't attempt to output/alpha merge a convolution
# that has multiple clients.
......
......@@ -54,6 +54,8 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
params.append("-I" + config.dnn.include_path)
if config.dnn.library_path:
params.append("-L" + config.dnn.library_path)
if config.nvcc.compiler_bindir:
params.extend(['--compiler-bindir', config.nvcc.compiler_bindir])
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
......
......@@ -130,7 +130,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#endif
#ifdef CUDNN_VERSION > 3000
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
int nd;
int pad[2];
......
......@@ -344,7 +344,7 @@ SOMEPATH/Canopy_64bit/User/lib/python2.7/site-packages/numpy/distutils/system_in
res = GCC_compiler.try_compile_tmp(
test_code, tmp_prefix='try_blas_',
flags=flags, try_run=True)
if res[0] and res[1]:
if res and res[0] and res[1]:
return "-lblas"
else:
return ""
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论