提交 7ddf071c authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #4256 from harmdevries89/gpupool_newbackend

Gpupool newbackend
......@@ -364,28 +364,32 @@ def test_pooling_with_tensor_vars():
cast_to_output_type=False,
mode=mode_with_gpu)
out2 = pool_2d_i2n(x, ds=(2, 2), strides=(1, 1),
pad=(0, 0),
pool_function=T.max)
mode_without_gpu2 = mode_without_gpu.including()
mode_without_gpu2.check_isfinite = False
f1 = theano.function([x], fn(x), mode=mode_with_gpu)
f_gpu = theano.function([x], fn(x), mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
f2 = theano.function([x], out2, mode=mode_without_gpu2)
assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f2.maker.fgraph.apply_nodes])
for node in f_gpu.maker.fgraph.apply_nodes])
i = 1
for shp in [(1, 10, 100, 100),
(1, 3, 99, 99),
(32, 1, 147, 197),
]:
(32, 1, 147, 197)]:
data = numpy.random.normal(0, 1, shp).astype("float32")
a = f1(data).__array__()
b = f2(data).__array__()
out = pool_2d_i2n(x, ds=(i, i), strides=(1, 1),
pad=(0, 0),
pool_function=T.max)
f_cpu = theano.function([x], out, mode=mode_without_gpu2)
assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f_cpu.maker.fgraph.apply_nodes])
# Change the window size dynamically for gpu op
ws.set_value(numpy.array([i, i]).astype('int32'))
a = f_gpu(data).__array__()
b = f_cpu(data).__array__()
utt.assert_allclose(a, b)
i += 1
def test_old_pool_interface():
......@@ -745,6 +749,7 @@ def test_dnn_tag():
class TestDnnInferShapes(utt.InferShapeTester):
def setUp(self):
super(TestDnnInferShapes, self).setUp()
self.mode = mode_with_gpu
......
......@@ -142,6 +142,7 @@ dnn_available.msg = None
class DnnBase(COp):
"""
Creates a handle for cudnn and pulls in the cudnn libraries and headers.
......@@ -255,6 +256,7 @@ version.v = None
class GpuDnnConvDesc(COp):
"""
This Op builds a convolution descriptor for use in the other convolution
operations.
......@@ -388,6 +390,7 @@ def ensure_dt(val, default, name, dtype):
class GpuDnnConv(DnnBase):
"""
The forward convolution.
......@@ -555,6 +558,7 @@ class GpuDnnConv(DnnBase):
class GpuDnnConvGradW(DnnBase):
"""
The convolution gradient with respect to the weights.
......@@ -675,6 +679,7 @@ class GpuDnnConvGradW(DnnBase):
class GpuDnnConvGradI(DnnBase):
"""
The convolution gradient with respect to the inputs.
......@@ -943,6 +948,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
class GpuDnnPoolDesc(Op):
"""
This Op builds a pooling descriptor for use in the other
pooling operations.
......@@ -1061,69 +1067,87 @@ class GpuDnnPoolDesc(Op):
class GpuDnnPool(DnnBase):
"""
Pooling.
"""
Parameters
----------
img
The image 4d tensor.
desc
The pooling descriptor.
The image 4d or 5d tensor.
Parameters
----------
ws : tensor variable
Window size.
stride : tensor variable
(dx, dy) or (dx, dy, dz).
mode : {'max', 'average_inc_pad', 'average_exc_pad'}
The old deprecated name 'average' corresponds to 'average_inc_pad'.
pad : tensor
(padX, padY) or (padX, padY, padZ)
"""
__props__ = ()
__props__ = ('mode',)
def __init__(self):
def __init__(self, mode='max'):
DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
def make_node(self, img, desc):
img = as_gpuarray_variable(img, infer_context_name(img))
def get_op_params(self):
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if desc.owner is not None:
e_ndim = desc.owner.op.get_ndim() + 2
return [('MODE_FLAG', mode_flag)]
if img.type.ndim != e_ndim:
raise TypeError('img must be %dD tensor' % (e_ndim,))
def make_node(self, img, ws, stride, pad):
ctx_name = infer_context_name(img)
img = as_gpuarray_variable(img, ctx_name)
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnPoolingDescriptor_t'):
raise TypeError('desc must be cudnnPoolingDescriptor_t')
ws = tensor.as_tensor_variable(ws)
stride = tensor.as_tensor_variable(stride)
pad = tensor.as_tensor_variable(pad)
assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
assert ws.type.ndim == 1
return Apply(self, [img, desc], [img.type()])
return Apply(self, [img, ws, stride, pad], [img.type()])
def infer_shape(self, node, shape):
desc = node.inputs[1].owner.op
w = desc.ws
s = desc.stride
p = desc.pad
w = node.inputs[1]
s = node.inputs[2]
p = node.inputs[3]
res = [shape[0][0], shape[0][1],
(shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1,
(shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1
]
if len(w) > 2:
if node.inputs[0].ndim == 5:
res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
return [res]
def grad(self, inp, grads):
img, desc = inp
img, ws, stride, pad = inp
grad, = grads
grad = gpu_contiguous(grad)
out = self(img, desc)
out = self(img, ws, stride, pad)
g_out = GpuDnnPoolGrad()(img, out, grad, desc)
g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
return g_out, theano.gradient.DisconnectedType()()
return g_out, theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()()
def connection_pattern(self, node):
# not connected to desc
return [[1], [0]]
# not connected to parameters
return [[1], [0], [0], [0]]
class GpuDnnPoolGrad(DnnBase):
"""
The pooling gradient.
......@@ -1135,40 +1159,56 @@ class GpuDnnPoolGrad(DnnBase):
The output of the pooling in the forward.
out_grad
Same size as out, but is the corresponding gradient information.
desc
The pooling descriptor.
ws : tensor variable
Window size.
stride : tensor variable
(dx, dy) or (dx, dy, dz).
mode : {'max', 'average_inc_pad', 'average_exc_pad'}
The old deprecated name 'average' corresponds to 'average_inc_pad'.
pad : tensor
(padX, padY) or (padX, padY, padZ)
"""
__props__ = ()
__props__ = ('mode',)
def __init__(self):
def __init__(self, mode='max'):
DnnBase.__init__(self, ["dnn_pool_grad.c"],
"APPLY_SPECIFIC(dnn_pool_grad)")
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
def get_op_params(self):
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
def make_node(self, inp, out, out_grad, desc):
return [('MODE_FLAG', mode_flag)]
def make_node(self, inp, out, out_grad, ws, stride, pad):
ctx_name = infer_context_name(inp, out, out_grad)
inp = as_gpuarray_variable(inp, ctx_name)
assert (inp.ndim in [4, 5])
out_grad = as_gpuarray_variable(out_grad, ctx_name)
assert (out_grad.ndim in [4, 5])
out = as_gpuarray_variable(out, ctx_name)
assert(out.ndim in [4, 5])
if desc.owner is not None:
nd = desc.owner.op.get_ndim() + 2
if inp.type.ndim != nd:
raise TypeError('inp must be %dD tensor' % (nd,))
assert (out_grad.ndim == inp.ndim)
assert (inp.ndim == out.ndim)
if out_grad.type.ndim != nd:
raise TypeError('out_grad must be %dD tensor' % (nd,))
ws = tensor.as_tensor_variable(ws)
stride = tensor.as_tensor_variable(stride)
pad = tensor.as_tensor_variable(pad)
assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
assert ws.type.ndim == 1
if out.type.ndim != nd:
raise TypeError('out must be %dD tensor' % (nd,))
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnPoolingDescriptor_t'):
raise TypeError('desc must be cudnnPoolingDescriptor_t')
return Apply(self, [inp, out, out_grad, desc], [inp.type()])
return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])
def infer_shape(self, node, shape):
return [shape[0]]
......@@ -1206,11 +1246,11 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
"""
img = gpu_contiguous(img)
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
return GpuDnnPool()(img, desc)
return GpuDnnPool(mode=mode)(img, ws, stride, pad)
class GpuDnnSoftmaxBase(DnnBase):
"""
Op for the cuDNN Softmax.
......@@ -1263,6 +1303,7 @@ class GpuDnnSoftmaxBase(DnnBase):
class GpuDnnSoftmax(GpuDnnSoftmaxBase):
"""
Op for the cuDNN Softmax.
......@@ -1296,6 +1337,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
"""
Op for the cuDNN SoftmaxGrad.
......@@ -1467,11 +1509,12 @@ def local_pool_dnn_grad_stride(node, ctx_name):
pad = node.op.padding
mode = node.op.mode
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
return GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(out_grad),
desc)
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(out_grad),
ds,
st,
pad)
@register_opt('cudnn')
......@@ -1491,11 +1534,10 @@ def local_avg_pool_dnn_grad_stride(node, ctx_name):
cg = gpu_contiguous(out_grad)
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
# We reuse cg because CuDNN does not use the value of the `out`
# argument but still checks its shape for average pooling. This
# has been observed in v2 and v3 as far as I know.
return GpuDnnPoolGrad()(gpu_contiguous(inp), cg, cg, desc)
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), cg, cg, ds, st, pad)
@register_opt('cudnn')
......@@ -1548,6 +1590,7 @@ def local_logsoftmax_to_dnn(node, ctx_name):
class NoCuDNNRaise(Optimizer):
def apply(self, fgraph):
"""
Raise a error if cudnn can't be used.
......
......@@ -2,12 +2,15 @@
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(pool) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
......@@ -19,16 +22,25 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
"(pool): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
#section support_code_struct
int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cudnnPoolingDescriptor_t desc,
PyArrayObject *ws,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnStatus_t err;
......@@ -46,14 +58,21 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
int w[3];
int p[3];
int s[3];
int ndims;
int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
for(int i = 0; i < ndims; i++) {
w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
}
for(int i = 0; i < ndims; i++) {
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
}
for(int i = 0; i < ndims; i++) {
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
err = cudnnGetPoolingNdDescriptor(desc, 3, &mode, &ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error doing cudnnGetPoolingDescriptor operation: %s",
cudnnGetErrorString(err));
return 1;
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
}
dims[0] = PyGpuArray_DIM(img, 0);
......@@ -98,7 +117,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward(
APPLY_SPECIFIC(_handle), desc,
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta,
......
......@@ -4,6 +4,7 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);
cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
#section init_code_struct
......@@ -11,6 +12,7 @@ APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(input_grad) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(output_grad) = NULL;
APPLY_SPECIFIC(pool) = NULL;
{
cudnnStatus_t err;
......@@ -38,6 +40,11 @@ APPLY_SPECIFIC(output_grad) = NULL;
cudnnGetErrorString(err));
FAIL;
}
if ((err = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
"(pool): %s", cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
......@@ -46,13 +53,16 @@ if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC
if (APPLY_SPECIFIC(input_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(output_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad)); }
if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
#section support_code_struct
int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyGpuArrayObject *out,
PyGpuArrayObject *out_grad,
cudnnPoolingDescriptor_t desc,
PyArrayObject *ws,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **inp_grad,
PyGpuContextObject *c) {
cudnnStatus_t err;
......@@ -85,6 +95,26 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
return 1;
}
int w[3];
int p[3];
int s[3];
int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
for(int i = 0; i < ndims; i++) {
w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
}
for(int i = 0; i < ndims; i++) {
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
}
for(int i = 0; i < ndims; i++) {
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
}
if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
return 1;
......@@ -118,7 +148,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward(
APPLY_SPECIFIC(_handle), desc,
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
......
......@@ -275,6 +275,55 @@ def test_pooling():
utt.assert_allclose(c_out, g_out)
def test_pooling_with_tensor_vars():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
x = T.ftensor4()
ws = theano.shared(numpy.array([2, 2], dtype='int32'))
st = theano.shared(numpy.array([1, 1], dtype='int32'))
pad = theano.shared(numpy.array([0, 0], dtype='int32'))
mode = 'max'
def fn(x):
dnn_op = dnn.dnn_pool(x,
ws=ws,
stride=st,
pad=pad,
mode=mode)
return dnn_op
for shp in [(1, 1, 2, 2),
(1, 1, 3, 3)]:
data = numpy.random.normal(0, 1, shp).astype("float32") * 10
theano.tests.unittest_tools.verify_grad(
fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
out2 = pool_2d_i2n(x, ds=(2, 2), strides=(1, 1),
pad=(0, 0),
pool_function=T.max)
mode_without_gpu2 = mode_without_gpu.including()
mode_without_gpu2.check_isfinite = False
f1 = theano.function([x], fn(x), mode=mode_with_gpu)
assert any([isinstance(node.op, dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
f2 = theano.function([x], out2, mode=mode_without_gpu2)
assert not any([isinstance(node.op, dnn.GpuDnnPool)
for node in f2.maker.fgraph.apply_nodes])
for shp in [(1, 10, 100, 100),
(1, 3, 99, 99),
(32, 1, 147, 197),
]:
data = numpy.random.normal(0, 1, shp).astype("float32")
a = f1(data).__array__()
b = f2(data).__array__()
utt.assert_allclose(a, b)
def test_pooling_opt():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
......@@ -340,6 +389,7 @@ def test_dnn_tag():
class TestDnnInferShapes(utt.InferShapeTester):
def setUp(self):
super(TestDnnInferShapes, self).setUp()
self.mode = mode_with_gpu
......@@ -525,14 +575,9 @@ class TestDnnInferShapes(utt.InferShapeTester):
[(1, 1), (2, 2), (3, 3)],
modes
):
desc = dnn.GpuDnnPoolDesc(
ws=params[0],
stride=params[1],
mode=params[2]
)()
self._compile_and_check(
[img],
[dnn.GpuDnnPool()(img, desc)],
[dnn.GpuDnnPool(mode=params[2])(img, params[0], params[1], (0, 0))],
[img_val],
dnn.GpuDnnPool
)
......@@ -561,16 +606,13 @@ class TestDnnInferShapes(utt.InferShapeTester):
[(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad']
):
desc = dnn.GpuDnnPoolDesc(
ws=params[0],
stride=params[1],
mode=params[2]
)()
pool_grad = dnn.GpuDnnPoolGrad()(
pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
img,
out,
img_grad,
desc
params[0],
params[1],
(0, 0)
)
self._compile_and_check(
[img, img_grad, out],
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论