提交 6e3837ba authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2608 from nouiz/dnn

[WIP] Disable some optimizations that could cause problems with cudnn v2 rc3.
......@@ -3434,6 +3434,9 @@ class CopyOnNegativeStrides(GpuOp):
i = i.copy()
out[0][0] = i
def infer_shape(self, node, xshp):
    """Shape inference for this op.

    The op only copies (or passes through) its input, so the single
    output has exactly the input's shape: `xshp` — a list holding the
    one input shape — is returned unchanged.
    """
    return xshp
def c_code(self, node, name, inp, out, sub):
input, = inp
z, = out
......
......@@ -649,12 +649,19 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
capability of 3.0 or higher. This means that older GPUs will not
work with this Op.
"""
def contig_version(var):
    """Return a variant of `var` suitable as cudnn input.

    When ``version()`` reports -1 — presumably meaning the cudnn
    version could not be determined (TODO confirm) — we conservatively
    force a fully contiguous copy via ``gpu_contiguous``; otherwise a
    copy is made only when the input has negative strides
    (``cp_on_negative_strides``).
    """
    # version / gpu_contiguous / cp_on_negative_strides come from the
    # enclosing module.
    return (gpu_contiguous(var) if version() == -1
            else cp_on_negative_strides(var))
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
if (border_mode == 'valid' and subsample == (1,1) and
direction_hint == 'bprop weights'):
# Special case: We are asked to use GpuDnnConvGradW. We need to set
# up a suitable 'fake' convolution to compute the gradient for.
img = cp_on_negative_strides(img.dimshuffle(1, 0, 2, 3))
img = contig_version(img.dimshuffle(1, 0, 2, 3))
if conv_mode == 'conv':
# We need to flip manually. These 'kerns' are not the kernels
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
......@@ -674,7 +681,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
# Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution.
# We just need to set up a suitable 'fake' valid convolution.
img = cp_on_negative_strides(img)
img = gpu_contiguous(img) # cudnn v1 and v2 rc3 need contiguous data
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
......@@ -686,9 +693,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return GpuDnnConvGradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding.
# cp_on_negative_strides will return a gpu_contiguous copy
# contig_version will return a gpu_contiguous copy
# if the img contains negative strides
img = cp_on_negative_strides(img)
img = contig_version(img)
kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns.shape)
......@@ -746,7 +753,7 @@ class GpuDnnPoolDesc(GpuOp):
self.stride = stride
assert len(stride) == 2
self.pad = pad
if (pad[0] != 0 or pad[1] != 0) and version() < 20:
if (pad[0] != 0 or pad[1] != 0) and version() == -1:
raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
def __setstate__(self, d):
......@@ -755,7 +762,7 @@ class GpuDnnPoolDesc(GpuOp):
self.pad = (0, 0)
def make_node(self):
if self.pad != (0, 0) and version() < 20:
if self.pad != (0, 0) and version() == -1:
raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
return Apply(self, [],
......
......@@ -1763,7 +1763,7 @@ def get_device_type_sizes():
del t
except Exception, e:
_logger.warning(("Optimization Warning: "
"Got the following error, but we can ignore it. "
"Got the following error, but you can ignore it. "
"This could cause less GpuElemwise fused together.\n"
"%s") % e)
......
......@@ -70,7 +70,7 @@ def test_pooling():
x = T.ftensor4()
for func, pad in product((T.max, T.mean),
((0, 0), (1, 0), (1, 0), (2, 3), (3, 2))):
if pad != (0, 0) and cuda.dnn.version() < 20:
if pad != (0, 0) and cuda.dnn.version() == -1:
continue
if pad != (0, 0) and func is T.mean:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论