提交 8748a7bb authored 作者: Frederic's avatar Frederic 提交者: Pascal Lamblin

Start to implement dnn pool with ignore_border=False and with pad.

Problems to fix: - dnn pool grad with ignore_border=False does not seem to compute the correct value. Do we have a bug? Do they support it? Maybe not, as it isn't documented. - We do not test mode=average correctly, as we do not have a reference implementation. - We do not test pad. Need a reference implementation.
上级 bcebb452
......@@ -10,10 +10,13 @@ from theano.compat import PY3
from theano.compile.ops import shape_i
from theano.configparser import AddConfigVar, EnumStr
from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.signal.downsample import (
DownsampleFactorMax, DownsampleFactorMaxGrad)
from theano.tensor.basic import ShapeError
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu,
gpu_contiguous, HostFromGpu,
cp_on_negative_strides)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
......@@ -648,8 +651,9 @@ class GpuDnnPoolDesc(GpuOp):
:param ws: windows size
:param stride: (dx, dy)
:param mode: 'max' or 'average'
:param pad: (padX, padY) padding information.
"""
__props__ = ('ws', 'stride', 'mode')
__props__ = ('ws', 'stride', 'mode', 'pad')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
......@@ -666,13 +670,22 @@ class GpuDnnPoolDesc(GpuOp):
def do_constant_folding(self, node):
return False
def __init__(self, ws=(1, 1), stride=(1, 1), mode='max'):
def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)):
    """Build a cuDNN pooling descriptor.

    :param ws: window size as (wsX, wsY).
    :param stride: stride as (dx, dy).
    :param mode: 'max' or 'average'.
    :param pad: (padX, padY) zero-padding; non-zero padding requires
        cuDNN v2 or later.
    """
    assert mode in ('max', 'average')
    self.mode = mode
    assert len(ws) == 2
    self.ws = ws
    assert len(stride) == 2
    self.stride = stride
    # BUG FIX: the original repeated `assert len(stride) == 2` here,
    # so `pad` was stored without ever being validated.
    assert len(pad) == 2
    self.pad = pad
    # Padding support was introduced in cuDNN v2 (version() >= 20).
    if (pad[0] != 0 or pad[1] != 0) and dnn_version() < 20:
        raise RuntimeError("cuDNN pooling with padding requires "
                           "cuDNN v2 or later")
def __setstate__(self, d):
    """Restore pickled state, defaulting `pad` for descriptors
    pickled before the `pad` attribute existed.
    """
    self.__dict__.update(d)
    # BUG FIX: the original used `hasattr(d, 'pad')`, which looks for
    # an *attribute* named 'pad' on the state dict itself (always
    # False for a plain dict), so self.pad was unconditionally reset
    # to (0, 0), clobbering any pad value just restored above.
    if 'pad' not in d:
        self.pad = (0, 0)
def make_node(self):
return Apply(self, [],
......@@ -709,7 +722,7 @@ class GpuDnnPoolDesc(GpuOp):
%(desc)s,
%(mode_flag)s,
%(wsX)d, %(wsY)d,
0, 0,
%(padX)d, %(padY)d,
%(stridex)d, %(stridey)d
);
#endif
......@@ -720,11 +733,13 @@ class GpuDnnPoolDesc(GpuOp):
}
}
""" % dict(name=name, desc=desc, mode_flag=mode_flag, fail=sub['fail'],
wsX=self.ws[0], wsY=self.ws[1], stridex=self.stride[0],
stridey=self.stride[1])
wsX=self.ws[0], wsY=self.ws[1],
stridex=self.stride[0], stridey=self.stride[1],
padX=self.pad[0], padY=self.pad[1],
)
def c_code_cache_version(self):
return (1, version())
return (2, version())
class GpuDnnPool(DnnBase):
......@@ -734,7 +749,11 @@ class GpuDnnPool(DnnBase):
:param img: the image 4d tensor.
:param desc: the pooling descriptor.
"""
__props__ = ()
__props__ = ('ignore_border', )
def __init__(self, ignore_border):
    """
    :param ignore_border: when True, pooling windows that would
        extend past the input border are dropped; when False, the
        output shape is computed to include partial windows (see the
        shape logic in c_code).
    """
    self.ignore_border = ignore_border
    DnnBase.__init__(self)
def make_node(self, img, desc):
img = as_cuda_ndarray_variable(img)
......@@ -834,8 +853,23 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
%(out)s_dims[0] = CudaNdarray_HOST_DIMS(%(input)s)[0];
%(out)s_dims[1] = CudaNdarray_HOST_DIMS(%(input)s)[1];
%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] - wsX) / strideX + 1;
%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] - wsY) / strideY + 1;
if (%(ignore_border)d){
%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] - wsX) / strideX + 1;
%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] - wsY) / strideY + 1;
}else{
int r = CudaNdarray_HOST_DIMS(%(input)s)[2];
int c = CudaNdarray_HOST_DIMS(%(input)s)[3];
if(strideX >= wsX){
%(out)s_dims[2] = (r - 1) / strideX + 1;
}else{
%(out)s_dims[2] = max(0, (r - 1 - wsX) / strideX + 1) + 1;
}
if(strideY >= wsY){
%(out)s_dims[3] = (c - 1) / strideY + 1;
}else{
%(out)s_dims[3] = max(0, (c - 1 - wsY) / strideY + 1) + 1;
}
}
if (CudaNdarray_prep_output(&%(out)s, 4, %(out)s_dims) != 0)
{
......@@ -874,6 +908,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
name=name, set_in=set_in,
set_out=set_out, input=inputs[0],
input_desc="input"+name,
ignore_border=self.ignore_border,
output_desc="output"+name)
def grad(self, inp, grads):
......@@ -884,7 +919,8 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
out = self(img, desc)
g_out = GpuDnnPoolGrad()(img, out, grad, desc)
g_out = GpuDnnPoolGrad(ignore_border=self.ignore_border)(
img, out, grad, desc)
return g_out, theano.gradient.DisconnectedType()()
......@@ -893,7 +929,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
return [[1], [0]]
def c_code_cache_version(self):
return (4, version())
return (5, version())
class GpuDnnPoolGrad(DnnBase):
......@@ -905,9 +941,15 @@ class GpuDnnPoolGrad(DnnBase):
:param inp_grad: same size as out, but is the corresponding gradient information.
:param desc: The pooling descriptor.
"""
__props__ = ()
__props__ = ('ignore_border', )
def __init__(self, ignore_border):
    """
    :param ignore_border: must be True; make_node raises
        NotImplementedError for ignore_border=False, as the gradient
        for partial-window pooling is not supported here.
    """
    self.ignore_border = ignore_border
    DnnBase.__init__(self)
def make_node(self, inp, out, inp_grad, desc):
if self.ignore_border is False:
raise NotImplementedError()
inp = as_cuda_ndarray_variable(inp)
if inp.type.ndim != 4:
raise TypeError('inp must be 4D tensor')
......@@ -1072,7 +1114,8 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max'):
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0),
ignore_border=True):
"""
GPU pooling using cuDNN from NVIDIA.
......@@ -1090,8 +1133,8 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max'):
:note: This Op implements the ignore_border=True of max_pool_2d.
"""
img = gpu_contiguous(img)
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode)()
return GpuDnnPool()(img, desc)
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
return GpuDnnPool(ignore_border=ignore_border)(img, desc)
class GpuDnnSoftmaxBase(DnnBase):
......@@ -1420,11 +1463,25 @@ if True:
if not dnn_available():
return
if isinstance(node.op, GpuDownsampleFactorMax):
if not node.op.ignore_border:
return
img, = node.inputs
ds = node.op.ds
return [dnn_pool(gpu_contiguous(img), ds, ds)]
return [dnn_pool(gpu_contiguous(img), ds, ds,
ignore_border=node.op.ignore_border)]
@register_opt('cudnn')
@local_optimizer([DownsampleFactorMax])
def local_pool_dnn_stride(node):
    """Replace a CPU DownsampleFactorMax whose input already lives on
    the GPU (wrapped in HostFromGpu) with the cuDNN pooling op,
    forwarding window size, stride and ignore_border.
    """
    if not dnn_available():
        return
    op = node.op
    if not isinstance(op, DownsampleFactorMax):
        return
    img, = node.inputs
    # Only rewrite when the input is a transfer from the GPU, so the
    # pooling can stay on-device.
    if img.owner is None or not isinstance(img.owner.op, HostFromGpu):
        return
    gpu_img = gpu_contiguous(img.owner.inputs[0])
    pooled = dnn_pool(gpu_img, op.ds, stride=op.st,
                      ignore_border=op.ignore_border)
    return [host_from_gpu(pooled)]
@register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMaxGrad])
......@@ -1432,16 +1489,41 @@ if True:
if not dnn_available():
return
if isinstance(node.op, GpuDownsampleFactorMaxGrad):
if not node.op.ignore_border:
return
inp, out, inp_grad = node.inputs
ds = node.op.ds
if not node.op.ignore_border:
return
desc = GpuDnnPoolDesc(ws=ds, stride=ds, mode="max")()
return [GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
desc)]
return [GpuDnnPoolGrad(ignore_border=node.op.ignore_border)(
gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
desc)]
@register_opt('cudnn')
@local_optimizer([DownsampleFactorMaxGrad])
def local_pool_dnn_grad_stride(node):
    """Replace a CPU DownsampleFactorMaxGrad whose inputs come from
    the GPU with the cuDNN pooling gradient op.

    Only applies with ignore_border=True: GpuDnnPoolGrad raises
    NotImplementedError for ignore_border=False.
    """
    if not dnn_available():
        return
    if isinstance(node.op, DownsampleFactorMaxGrad):
        # FIX: bail out on ignore_border=False *before* building the
        # pooling descriptor; the original constructed the
        # GpuDnnPoolDesc node first and only then returned, wasting
        # graph-node construction (the sibling GpuDownsampleFactorMaxGrad
        # optimizer already checks this up front).
        if not node.op.ignore_border:
            return
        inp, out, inp_grad = node.inputs
        ds = node.op.ds
        st = node.op.st
        if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
            (out.owner and isinstance(out.owner.op, HostFromGpu)) or
            (inp_grad.owner and isinstance(inp_grad.owner.op, HostFromGpu))
            ):
            desc = GpuDnnPoolDesc(ws=ds, stride=st, mode="max")()
            ret = GpuDnnPoolGrad(ignore_border=node.op.ignore_border)(
                gpu_contiguous(inp),
                gpu_contiguous(out),
                gpu_contiguous(inp_grad),
                desc)
            return [host_from_gpu(ret)]
@register_opt('cudnn')
@local_optimizer([GpuSoftmax])
......
......@@ -31,6 +31,7 @@ else:
def pool_2d_i2n(input, ds=(2, 2), strides=None,
ignore_border=True,
pool_function=T.max, mode='ignore_borders'):
if strides is None:
strides = ds
......@@ -58,27 +59,28 @@ def test_pooling():
raise SkipTest(cuda.dnn.dnn_available.msg)
x = T.ftensor4()
for func in (T.max, T.mean):
for ws in (2, 4, 5):
for func, ignore_border in product(
(T.max, T.mean), (False, True)):
for ws in (4, 2, 5):
for stride in (2, 3):
if stride > ws:
continue
if ws == stride and func is T.max:
if func is T.max:
# We will check that the opt introduced it.
out1 = max_pool_2d(x, (ws, ws), ignore_border=True)
out1 = max_pool_2d(x, (ws, ws),
st=(stride, stride),
ignore_border=ignore_border)
else:
out1 = cuda.dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
ignore_border=ignore_border,
mode='max' if func is T.max else "average")
out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride),
pool_function=func)
f1 = theano.function([x], out1, mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
f2 = theano.function([x], out2, mode=mode_with_gpu)
f2 = theano.function([x], out1, mode=mode_without_gpu)
assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f2.maker.fgraph.apply_nodes])
for shp in [(1, 10, 100, 100),
......@@ -102,41 +104,51 @@ def test_pooling():
# This test the CPU grad + opt + GPU implemtentation
def fn(x):
return max_pool_2d(x, (ws, ws), ignore_border=True)
return max_pool_2d(x, (ws, ws), ignore_border=ignore_border)
theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that the opt would have inserted it.
f = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in f.maker.fgraph.toposort()])
fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
if ignore_border:
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
else:
assert not any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
# Test the GPU grad + GPU implementation
def fn(x):
dnn_op = cuda.dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
ignore_border=ignore_border,
mode='max' if func is T.max else "average")
return dnn_op
theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that we get the good op.
f = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in f.maker.fgraph.toposort()])
g_out = f(data)
if func is T.max:
try:
theano.tests.unittest_tools.verify_grad(
fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that we get the good op.
fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
g_out = fg(data)
assert ignore_border
except NotImplementedError:
assert not ignore_border
if func is T.max and ignore_border:
# Compare again the CPU result
out = max_pool_2d(x, (ws, ws), ignore_border=True)
f = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu)
out = max_pool_2d(x, (ws, ws), ignore_border=ignore_border)
fc = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu)
assert any([isinstance(node.op, DownsampleFactorMaxGrad)
for node in f.maker.fgraph.toposort()])
c_out = f(data)
for node in fc.maker.fgraph.toposort()])
c_out = fc(data)
assert numpy.allclose(c_out, g_out)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论