提交 6da4304d authored 作者: abergeron's avatar abergeron

Merge pull request #2556 from lamblin/dnn_pool

Dnn pool support of pad
......@@ -521,8 +521,8 @@ class BaseGpuCorrMM(GpuOp):
def __init__(self, border_mode="valid", subsample=(1, 1), pad=(0, 0)):
if pad != (0, 0):
_logger.warning(
'do not use pad for BaseGpuCorrMM; please set padding in'
'border_mode, see the docstring for more details')
'do not use pad for BaseGpuCorrMM; please set padding in '
'border_mode parameter, see the docstring for more details')
if border_mode != "valid":
raise ValueError("border_mode must be 'valid'")
border_mode = pad
......
......@@ -10,10 +10,13 @@ from theano.compat import PY3
from theano.compile.ops import shape_i
from theano.configparser import AddConfigVar, EnumStr
from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.signal.downsample import (
DownsampleFactorMax, DownsampleFactorMaxGrad)
from theano.tensor.basic import ShapeError
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu,
gpu_contiguous, HostFromGpu,
cp_on_negative_strides)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
......@@ -85,16 +88,19 @@ dnn_available.msg = None
def c_set_tensor4d(var, desc, err, fail):
return """
{
int str0, str1, str2, str3;
str3 = CudaNdarray_HOST_STRIDES(%(var)s)[3]?CudaNdarray_HOST_STRIDES(%(var)s)[3]:1;
str2 = CudaNdarray_HOST_STRIDES(%(var)s)[2]?CudaNdarray_HOST_STRIDES(%(var)s)[2]:CudaNdarray_HOST_DIMS(%(var)s)[3];
str1 = CudaNdarray_HOST_STRIDES(%(var)s)[1]?CudaNdarray_HOST_STRIDES(%(var)s)[1]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3];
str0 = CudaNdarray_HOST_STRIDES(%(var)s)[0]?CudaNdarray_HOST_STRIDES(%(var)s)[0]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3]*CudaNdarray_HOST_DIMS(%(var)s)[1];
%(err)s = cudnnSetTensor4dDescriptorEx(
%(desc)s, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(var)s)[0],
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[0]?CudaNdarray_HOST_STRIDES(%(var)s)[0]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3]*CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_STRIDES(%(var)s)[1]?CudaNdarray_HOST_STRIDES(%(var)s)[1]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[2]?CudaNdarray_HOST_STRIDES(%(var)s)[2]:CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[3]?CudaNdarray_HOST_STRIDES(%(var)s)[3]:1
str0, str1, str2, str3
);
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -105,13 +111,12 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[0]?CudaNdarray_HOST_STRIDES(%(var)s)[0]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3]*CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_STRIDES(%(var)s)[1]?CudaNdarray_HOST_STRIDES(%(var)s)[1]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[2]?CudaNdarray_HOST_STRIDES(%(var)s)[2]:CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[3]?CudaNdarray_HOST_STRIDES(%(var)s)[3]:1
str0, str1, str2, str3
);
%(fail)s
}
}
""" % dict(var=var, err=err, desc=desc, fail=fail)
......@@ -659,8 +664,11 @@ class GpuDnnPoolDesc(GpuOp):
:param ws: windows size
:param stride: (dx, dy)
:param mode: 'max' or 'average'
:param pad: (padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
"""
__props__ = ('ws', 'stride', 'mode')
__props__ = ('ws', 'stride', 'mode', 'pad')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
......@@ -677,15 +685,27 @@ class GpuDnnPoolDesc(GpuOp):
def do_constant_folding(self, node):
return False
def __init__(self, ws=(1, 1), stride=(1, 1), mode='max'):
def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)):
assert mode in ('max', 'average')
self.mode = mode
assert len(ws) == 2
self.ws = ws
assert len(stride) == 2
self.stride = stride
assert len(stride) == 2
self.pad = pad
if (pad[0] != 0 or pad[1] != 0) and version() < 20:
raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'pad'):
self.pad = (0, 0)
def make_node(self):
if self.pad != (0, 0) and version() < 20:
raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
return Apply(self, [],
[CDataType("cudnnPoolingDescriptor_t")()])
......@@ -720,7 +740,7 @@ class GpuDnnPoolDesc(GpuOp):
%(desc)s,
%(mode_flag)s,
%(wsX)d, %(wsY)d,
0, 0,
%(padX)d, %(padY)d,
%(stridex)d, %(stridey)d
);
#endif
......@@ -731,11 +751,13 @@ class GpuDnnPoolDesc(GpuOp):
}
}
""" % dict(name=name, desc=desc, mode_flag=mode_flag, fail=sub['fail'],
wsX=self.ws[0], wsY=self.ws[1], stridex=self.stride[0],
stridey=self.stride[1])
wsX=self.ws[0], wsY=self.ws[1],
stridex=self.stride[0], stridey=self.stride[1],
padX=self.pad[0], padY=self.pad[1],
)
def c_code_cache_version(self):
return (1, version())
return (2, version())
class GpuDnnPool(DnnBase):
......@@ -845,8 +867,8 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
%(out)s_dims[0] = CudaNdarray_HOST_DIMS(%(input)s)[0];
%(out)s_dims[1] = CudaNdarray_HOST_DIMS(%(input)s)[1];
%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] - wsX) / strideX + 1;
%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] - wsY) / strideY + 1;
%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] + (vpad*2) - wsX) / strideX + 1;
%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] + (hpad*2) - wsY) / strideY + 1;
if (CudaNdarray_prep_output(&%(out)s, 4, %(out)s_dims) != 0)
{
......@@ -904,7 +926,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
return [[1], [0]]
def c_code_cache_version(self):
return (4, version())
return (6, version())
class GpuDnnPoolGrad(DnnBase):
......@@ -1063,8 +1085,29 @@ _handle,
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPoolGrad: error doing operation: %%s",
cudnnGetErrorString(err%(name)s));
"GpuDnnPoolGrad: error doing operation: %%s. "
"input.shape=(%%d, %%d, %%d, %%d) "
"input_grad.shape=(%%d, %%d, %%d, %%d) "
"output.shape=(%%d, %%d, %%d, %%d) "
"output_grad.shape=(%%d, %%d, %%d, %%d)",
cudnnGetErrorString(err%(name)s),
CudaNdarray_HOST_DIMS(%(input)s)[0],
CudaNdarray_HOST_DIMS(%(input)s)[1],
CudaNdarray_HOST_DIMS(%(input)s)[2],
CudaNdarray_HOST_DIMS(%(input)s)[3],
CudaNdarray_HOST_DIMS(%(input_grad)s)[0],
CudaNdarray_HOST_DIMS(%(input_grad)s)[1],
CudaNdarray_HOST_DIMS(%(input_grad)s)[2],
CudaNdarray_HOST_DIMS(%(input_grad)s)[3],
CudaNdarray_HOST_DIMS(%(output)s)[0],
CudaNdarray_HOST_DIMS(%(output)s)[1],
CudaNdarray_HOST_DIMS(%(output)s)[2],
CudaNdarray_HOST_DIMS(%(output)s)[3],
CudaNdarray_HOST_DIMS(%(output_grad)s)[0],
CudaNdarray_HOST_DIMS(%(output_grad)s)[1],
CudaNdarray_HOST_DIMS(%(output_grad)s)[2],
CudaNdarray_HOST_DIMS(%(output_grad)s)[3]
);
%(fail)s
}
""" % dict(output_grad=out_grad, desc=desc,
......@@ -1077,13 +1120,13 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
output_grad_desc="output_grad"+name)
def c_code_cache_version(self):
return (4, version())
return (5, version())
def infer_shape(self, node, shape):
return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max'):
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
"""
GPU pooling using cuDNN from NVIDIA.
......@@ -1094,6 +1137,9 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max'):
:param ws: subsampling window size
:param stride: subsampling stride (default: (1, 1))
:param mode: one of 'max', 'average' (default: 'max')
:param pad: (padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
:warning: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
......@@ -1101,7 +1147,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max'):
:note: This Op implements the ignore_border=True of max_pool_2d.
"""
img = gpu_contiguous(img)
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode)()
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
return GpuDnnPool()(img, desc)
......@@ -1437,6 +1483,23 @@ if True:
ds = node.op.ds
return [dnn_pool(gpu_contiguous(img), ds, ds)]
@register_opt('cudnn')
@local_optimizer([DownsampleFactorMax])
def local_pool_dnn_stride(node):
if not dnn_available():
return
if isinstance(node.op, DownsampleFactorMax):
if not node.op.ignore_border:
return
img, = node.inputs
ds = node.op.ds
stride = node.op.st
pad = node.op.padding
if (img.owner and isinstance(img.owner.op, HostFromGpu)):
ret = dnn_pool(gpu_contiguous(img.owner.inputs[0]),
ds, stride=stride, pad=pad)
return [host_from_gpu(ret)]
@register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMaxGrad])
def local_pool_dnn_grad(node):
......@@ -1454,6 +1517,30 @@ if True:
gpu_contiguous(inp_grad),
desc)]
@register_opt('cudnn')
@local_optimizer([DownsampleFactorMaxGrad])
def local_pool_dnn_grad_stride(node):
if not dnn_available():
return
if isinstance(node.op, DownsampleFactorMaxGrad):
inp, out, inp_grad = node.inputs
ds = node.op.ds
st = node.op.st
pad = node.op.padding
if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
(out.owner and isinstance(out.owner.op, HostFromGpu)) or
(inp_grad.owner and isinstance(inp_grad.owner.op, HostFromGpu))
):
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode="max", pad=pad)()
if not node.op.ignore_border:
return
ret = GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
desc)
return [host_from_gpu(ret)]
@register_opt('cudnn')
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(node):
......
import logging
import unittest
from nose.plugins.skip import SkipTest
import numpy
......@@ -10,11 +9,10 @@ from theano.compat.six import StringIO
from theano.gof.python25 import any
import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.sandbox.neighbours import images2neibs, neibs2images
from theano.sandbox.neighbours import images2neibs
from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import gpu_contiguous
# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda
......@@ -31,6 +29,7 @@ else:
def pool_2d_i2n(input, ds=(2, 2), strides=None,
pad=(0, 0),
pool_function=T.max, mode='ignore_borders'):
if strides is None:
strides = ds
......@@ -40,8 +39,19 @@ def pool_2d_i2n(input, ds=(2, 2), strides=None,
"strides should be smaller than or equal to ds,"
" strides=(%d, %d) and ds=(%d, %d)" %
(strides + ds))
shape = input.shape
if pad != (0, 0):
assert pool_function is T.max
pad_x = pad[0]
pad_y = pad[1]
a = T.alloc(-numpy.inf, shape[0], shape[1], shape[2] + pad_x*2,
shape[3] + pad_y*2)
input = T.set_subtensor(a[:, :,
pad_x:pad_x+shape[2],
pad_y:pad_y+shape[3]],
input)
shape = input.shape
neibs = images2neibs(input, ds, strides, mode=mode)
pooled_neibs = pool_function(neibs, axis=1)
......@@ -58,33 +68,41 @@ def test_pooling():
raise SkipTest(cuda.dnn.dnn_available.msg)
x = T.ftensor4()
for func, pad in product((T.max, T.mean),
((0, 0), (1, 0), (1, 0), (2, 3), (3, 2))):
if pad != (0, 0) and cuda.dnn.version() < 20:
continue
for func in (T.max, T.mean):
for ws in (2, 4, 5):
for ws in (4, 2, 5):
for stride in (2, 3):
if stride > ws:
continue
if ws == stride and func is T.max:
if func is T.max:
# We will check that the opt introduced it.
out1 = max_pool_2d(x, (ws, ws), ignore_border=True)
out1 = max_pool_2d(x, (ws, ws),
st=(stride, stride),
ignore_border=True,
padding=pad)
else:
out1 = cuda.dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
pad=pad,
mode='max' if func is T.max else "average")
out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride),
pad=pad,
pool_function=func)
f1 = theano.function([x], out1, mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
f2 = theano.function([x], out2, mode=mode_with_gpu)
f2 = theano.function([x], out2, mode=mode_without_gpu)
assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f2.maker.fgraph.apply_nodes])
for shp in [(1, 10, 100, 100),
(1, 3, 99, 99),
(32, 1, 147, 197),
]:
]:
data = numpy.random.normal(0, 1, shp).astype("float32")
a = f1(data).__array__()
......@@ -98,45 +116,50 @@ def test_pooling():
data = numpy.random.normal(0, 1, shp).astype("float32")*10
ws = 2
strides = 2
stride = 2
# This test the CPU grad + opt + GPU implemtentation
def fn(x):
return max_pool_2d(x, (ws, ws), ignore_border=True)
return max_pool_2d(x, (ws, ws), ignore_border=True,
padding=pad)
theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that the opt would have inserted it.
f = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in f.maker.fgraph.toposort()])
for node in fg.maker.fgraph.toposort()])
# Test the GPU grad + GPU implementation
def fn(x):
dnn_op = cuda.dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
pad=pad,
mode='max' if func is T.max else "average")
return dnn_op
theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
theano.tests.unittest_tools.verify_grad(
fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that we get the good op.
f = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in f.maker.fgraph.toposort()])
g_out = f(data)
for node in fg.maker.fgraph.toposort()])
g_out = fg(data)
if func is T.max:
# Compare again the CPU result
out = max_pool_2d(x, (ws, ws), ignore_border=True)
f = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu)
out = max_pool_2d(x, (ws, ws),
padding=pad,
ignore_border=True)
fc = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu)
assert any([isinstance(node.op, DownsampleFactorMaxGrad)
for node in f.maker.fgraph.toposort()])
c_out = f(data)
for node in fc.maker.fgraph.toposort()])
c_out = fc(data)
assert numpy.allclose(c_out, g_out)
......@@ -165,7 +188,7 @@ def test_pooling_opt():
def test_dnn_tag():
"""
We test that if cudnn isn't avail we crash and that if it is avail, we use it.
Test that if cudnn isn't avail we crash and that if it is avail, we use it.
"""
x = T.ftensor4()
old = theano.config.on_opt_error
......@@ -412,11 +435,11 @@ class TestDnnInferShapes(utt.InferShapeTester):
mode=params[2]
)()
pool_grad = dnn.GpuDnnPoolGrad()(
img,
out,
img_grad,
desc
)
img,
out,
img_grad,
desc
)
self._compile_and_check(
[img, img_grad, out],
[pool_grad],
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论