提交 268bc917 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron 提交者: --global

Extend the current pooling op to support 3d pooling.

This does not have any tests or optimizations for 3d pooling, but the 2d support still works at least.
上级 525c9c84
......@@ -197,6 +197,85 @@ cudnnConvolutionBackwardData_v2(
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
/*
 * Compatibility shim: emulate the newer cudnnSetPoolingNdDescriptor API on
 * top of the legacy 2d-only cudnnSetPoolingDescriptor.
 *
 * Only nbDims == 2 with zero padding is representable with the old call;
 * any other configuration returns CUDNN_STATUS_NOT_SUPPORTED.
 */
static inline cudnnStatus_t
cudnnSetPoolingNdDescriptor(
cudnnPoolingDescriptor_t poolingDesc,
const cudnnPoolingMode_t mode,
int nbDims,
const int windowDimA[],
const int paddingA[],
const int strideA[]) {  /* fixed: was `const in strideA[]` (typo, would not compile) */
  if (nbDims != 2) return CUDNN_STATUS_NOT_SUPPORTED;
  /* The legacy API has no padding parameters at all. */
  if (paddingA[0] != 0 || paddingA[1] != 0) return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnSetPoolingDescriptor(poolingDesc, mode,
                                   windowDimA[0], windowDimA[1],
                                   strideA[0], strideA[1]);
}
/*
 * Compatibility shim: emulate the newer cudnnGetPoolingNdDescriptor API on
 * top of the legacy 2d-only cudnnGetPoolingDescriptor.
 *
 * Always reports 2 dimensions with zero padding (the legacy API has no
 * padding).  Requires room for at least 2 entries in each output array.
 */
static inline cudnnStatus_t
cudnnGetPoolingNdDescriptor(
const cudnnPoolingDescriptor_t poolingDesc,
const int nbDimsRequested,
cudnnPoolingMode_t *mode,
int *nbDims,
int windowA[],
int paddingA[],
int strideA[]) {
  int win0, win1, str0, str1;
  cudnnStatus_t err;
  /* fixed: body referenced `ndDimsRequested`, but the parameter is
     named `nbDimsRequested` (would not compile) */
  if (nbDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
  err = cudnnGetPoolingDescriptor(poolingDesc, mode, &win0, &win1,
                                  &str0, &str1);
  if (err != CUDNN_STATUS_SUCCESS) return err;
  *nbDims = 2;
  /* The legacy descriptor cannot carry padding, so it is always zero. */
  paddingA[0] = 0;
  paddingA[1] = 0;
  windowA[0] = win0;
  windowA[1] = win1;
  strideA[0] = str0;
  strideA[1] = str1;
  return CUDNN_STATUS_SUCCESS;
}
/*
 * Compatibility shim: emulate the alpha/beta-scaled cudnnPoolingForward of
 * newer cuDNN releases on top of the legacy unscaled entry point.
 *
 * Only alpha == 1 and beta == 0 (plain overwrite of the destination) can be
 * emulated; anything else returns CUDNN_STATUS_NOT_SUPPORTED.
 *
 * NOTE(review): assumes alpha/beta point to floats — true for the callers
 * in this file, but confirm no caller passes double scaling factors.
 */
static inline cudnnStatus_t
cudnnPoolingForward_v2(
cudnnHandle_t handle,
const cudnnPoolingDescriptor_t poolingDesc,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const void *beta,
const cudnnTensorDescriptor_t destDesc,
void *destData) {
  /* Use const-qualified casts; the original cast away const needlessly. */
  if (*(const float *)alpha != 1.0f || *(const float *)beta != 0.0f)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnPoolingForward(handle, poolingDesc, srcDesc, srcData,
                             destDesc, destData);
}
#define cudnnPoolingForward cudnnPoolingForward_v2
/*
 * Compatibility shim: emulate the alpha/beta-scaled cudnnPoolingBackward of
 * newer cuDNN releases on top of the legacy unscaled entry point.
 *
 * Only alpha == 1 and beta == 0 (plain overwrite of the destination
 * gradient) can be emulated; anything else returns
 * CUDNN_STATUS_NOT_SUPPORTED.
 *
 * NOTE(review): assumes alpha/beta point to floats — true for the callers
 * in this file, but confirm no caller passes double scaling factors.
 */
static inline cudnnStatus_t
cudnnPoolingBackward_v2(
cudnnHandle_t handle,
const cudnnPoolingDescriptor_t poolingDesc,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnTensorDescriptor_t srcDiffDesc,
const void *srcDiffData,
const cudnnTensorDescriptor_t destDesc,
const void *destData,
const void *beta,
const cudnnTensorDescriptor_t destDiffDesc,
void *destDiffData) {
  /* Use const-qualified casts; the original cast away const needlessly. */
  if (*(const float *)alpha != 1.0f || *(const float *)beta != 0.0f)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnPoolingBackward(handle, poolingDesc,
                              srcDesc, srcData,
                              srcDiffDesc, srcDiffData,
                              destDesc, destData,
                              destDiffDesc, destDiffData);
}
#define cudnnPoolingBackward cudnnPoolingBackward_v2
//Needed for R2 rc2
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
#else
......
......@@ -1266,7 +1266,7 @@ class GpuDnnPoolDesc(GpuOp):
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
"""
__props__ = ('ws', 'stride', 'mode', 'pad')
__props__ = ('ws', 'stride', 'mode', 'pad', 'nd')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
......@@ -1283,19 +1283,23 @@ class GpuDnnPoolDesc(GpuOp):
def do_constant_folding(self, node):
return False
def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)):
def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0), nd=2):
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
assert len(ws) == 2
assert len(ws) == nd
self.ws = ws
assert len(stride) == 2
assert len(stride) == nd
self.stride = stride
assert len(stride) == 2
assert len(stride) == nd
self.pad = pad
if (pad[0] != 0 or pad[1] != 0) and version() == -1:
raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
assert nd in (2, 3)
if nd == 3 and version() < (3000, 3000):
raise RuntimeError("3 pooling only supported on CuDNN v3")
self.nd = nd
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -1333,22 +1337,14 @@ class GpuDnnPoolDesc(GpuOp):
"descriptor: %%s", cudnnGetErrorString(err));
%(fail)s
}
#ifndef CUDNN_VERSION
err = cudnnSetPoolingDescriptor(
%(desc)s,
%(mode_flag)s,
%(wsX)d, %(wsY)d,
%(stridex)d, %(stridey)d
);
#else
err = cudnnSetPooling2dDescriptor(
%(desc)s,
%(mode_flag)s,
%(wsX)d, %(wsY)d,
%(padX)d, %(padY)d,
%(stridex)d, %(stridey)d
);
#endif
{
int win[%(nd)d] = {%(win)s};
int pad[%(nd)d] = {%(pad)s};
int str[%(nd)d] = {%(str)s};
err = cudnnSetPoolingNdDescriptor(
%(desc)s, %(mode_flag)s, %(nd)d,
win, pad, str);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err));
......@@ -1356,46 +1352,49 @@ class GpuDnnPoolDesc(GpuOp):
}
}
""" % dict(name=name, desc=desc, mode_flag=mode_flag, fail=sub['fail'],
wsX=self.ws[0], wsY=self.ws[1],
stridex=self.stride[0], stridey=self.stride[1],
padX=self.pad[0], padY=self.pad[1])
nd=self.nd, win=', '.join(str(w) for w in self.ws),
pad=', '.join(str(p) for p in self.pad),
str=', '.join(str(s) for s in self.stride))
def c_code_cache_version(self):
return (2, version())
return (3, version())
class GpuDnnPool(DnnBase):
"""
Pooling.
:param img: the image 4d tensor.
:param img: the image 4d or 5d tensor.
:param desc: the pooling descriptor.
"""
__props__ = ()
def make_node(self, img, desc):
img = as_cuda_ndarray_variable(img)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
return Apply(self, [img, desc],
[img.type()])
dop = desc.owner.op
e_ndim = dop.nd + 2 # 4 or 5
if img.type.ndim != e_ndim:
raise TypeError('img must be %dD tensor' % e_ndim)
return Apply(self, [img, desc], [img.type()])
def infer_shape(self, node, shape):
desc = node.inputs[1].owner.op
kh, kw = desc.ws
sh, sw = desc.stride
padh, padw = desc.pad
return [(
shape[0][0],
shape[0][1],
(shape[0][2] + 2*padh - kh)//sh + 1,
(shape[0][3] + 2*padw - kw)//sw + 1
)]
nd = desc.nd
w = desc.ws
s = desc.stride
p = desc.pad
ret = [shape[0][0], shape[0][1],
(shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1,
(shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1]
if nd == 3:
ret.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
return [ret]
def c_support_code_struct(self, node, name):
return """
......@@ -1409,12 +1408,12 @@ cudnnStatus_t err%(name)s;
input%(name)s = NULL;
output%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
......@@ -1430,65 +1429,51 @@ if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
desc = inputs[1]
out, = outputs
set_in = c_set_tensor4d(inputs[0], "input" + str(name),
'err' + name, sub['fail'])
set_out = c_set_tensor4d(out, "output" + str(name),
'err' + name, sub['fail'])
return """
cudnnStatus_t err%(name)s;
int %(out)s_dims[4];
int %(out)s_dims[5];
if (!CudaNdarray_is_c_contiguous(%(input)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
%(set_in)s
if (c_set_tensorNd(%(input)s, %(input)s->nd, %(input_desc)s) != 0)
%(fail)s
cudnnPoolingMode_t mode;
int wsX, wsY, vpad, hpad, strideX, strideY;
#ifndef CUDNN_VERSION
err%(name)s = cudnnGetPoolingDescriptor(
%(desc)s, &mode,
&wsX, &wsY,
&strideX, &strideY);
#else
err%(name)s = cudnnGetPooling2dDescriptor(
%(desc)s, &mode,
&wsX, &wsY,
&vpad, &hpad,
&strideX, &strideY);
#endif
int win[3];
int pad[3];
int str[3];
int ndims;
err%(name)s = cudnnGetPoolingNdDescriptor(
%(desc)s, 3,
&mode, &ndims,
win, pad, str);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnGetPoolingDescriptor operation: %%s",
"GpuDnnPool: error doing cudnnGetPoolingNdDescriptor operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
%(out)s_dims[0] = CudaNdarray_HOST_DIMS(%(input)s)[0];
%(out)s_dims[1] = CudaNdarray_HOST_DIMS(%(input)s)[1];
%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] + (vpad*2) - wsX) / strideX + 1;
%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] + (hpad*2) - wsY) / strideY + 1;
%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] + (pad[0]*2) - win[0]) / str[0] + 1;
%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] + (pad[1]*2) - win[1]) / str[1] + 1;
if (ndims == 3)
%(out)s_dims[4] = (CudaNdarray_HOST_DIMS(%(input)s)[4] + (pad[2]*2) - win[2]) / str[2] + 1;
if (CudaNdarray_prep_output(&%(out)s, 4, %(out)s_dims) != 0)
if (CudaNdarray_prep_output(&%(out)s, ndims+2, %(out)s_dims) != 0)
{
%(fail)s
}
%(set_out)s
#ifndef CUDNN_VERSION
err%(name)s = cudnnPoolingForward(
_handle,
%(desc)s,
%(input_desc)s, CudaNdarray_DEV_DATA(%(input)s),
%(output_desc)s, CudaNdarray_DEV_DATA(%(out)s)
);
#else
if (c_set_tensorNd(%(out)s, %(out)s->nd, %(output_desc)s) != 0)
%(fail)s
{
const float alpha = 1;
const float beta = 0;
......@@ -1501,7 +1486,6 @@ _handle,
%(output_desc)s, CudaNdarray_DEV_DATA(%(out)s)
);
}
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnPoolingForward operation: %%s",
......@@ -1509,8 +1493,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
%(fail)s
}
""" % dict(out=out, desc=desc, fail=sub['fail'],
name=name, set_in=set_in,
set_out=set_out, input=inputs[0],
name=name, input=inputs[0],
input_desc="input"+name,
output_desc="output"+name)
......@@ -1531,7 +1514,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
return [[1], [0]]
def c_code_cache_version(self):
return (6, version())
return (7, version())
class GpuDnnPoolGrad(DnnBase):
......@@ -1546,21 +1529,23 @@ class GpuDnnPoolGrad(DnnBase):
__props__ = ()
def make_node(self, inp, out, inp_grad, desc):
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
nd = desc.owner.op.nd + 2 # 4 or 5
inp = as_cuda_ndarray_variable(inp)
if inp.type.ndim != 4:
raise TypeError('inp must be 4D tensor')
if inp.type.ndim != nd:
raise TypeError('inp must be %dD tensor' % (nd,))
inp_grad = as_cuda_ndarray_variable(inp_grad)
if inp_grad.type.ndim != 4:
raise TypeError('inp_grad must be 4D tensor')
if inp_grad.type.ndim != nd:
raise TypeError('inp_grad must be %dD tensor' % (nd,))
out = as_cuda_ndarray_variable(out)
if out.type.ndim != 4:
raise TypeError('out must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
if out.type.ndim != nd:
raise TypeError('out must be %dD tensor' % (nd,))
return Apply(self, [inp, out, inp_grad, desc],
[inp.type()])
......@@ -1621,18 +1606,6 @@ if (output_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(output_grad%(nam
out, inp, inp_grad, desc = inputs
out_grad, = outputs
set_in = "\n".join([
c_set_tensor4d(inp, "input" + name,
'err' + name, sub['fail']),
c_set_tensor4d(inp_grad, "input_grad" + name,
'err' + name, sub['fail']),
c_set_tensor4d(out, "output" + name,
'err' + name, sub['fail'])
])
set_out = c_set_tensor4d(out, "output_grad" + name,
'err' + name, sub['fail'])
return """
cudnnStatus_t err%(name)s;
......@@ -1654,25 +1627,23 @@ if (!CudaNdarray_is_c_contiguous(%(output)s)) {
%(fail)s
}
%(set_in)s
if (c_set_tensorNd(%(input)s, %(input)s->nd, %(input_desc)s) != 0)
%(fail)s
if (c_set_tensorNd(%(input_grad)s, %(input_grad)s->nd, %(input_grad_desc)s) != 0)
%(fail)s
if (c_set_tensorNd(%(output)s, %(output)s->nd, %(output_desc)s) != 0)
%(fail)s
if (CudaNdarray_prep_output(&%(output_grad)s, 4,
if (CudaNdarray_prep_output(&%(output_grad)s,
%(output)s->nd,
CudaNdarray_HOST_DIMS(%(output)s)) != 0)
{
%(fail)s
}
%(set_out)s
#ifndef CUDNN_VERSION
err%(name)s = cudnnPoolingBackward(
_handle,
%(desc)s,
%(input_desc)s, CudaNdarray_DEV_DATA(%(input)s),
%(input_grad_desc)s, CudaNdarray_DEV_DATA(%(input_grad)s),
%(output_desc)s, CudaNdarray_DEV_DATA(%(output)s),
%(output_grad_desc)s, CudaNdarray_DEV_DATA(%(output_grad)s)
);
#else
if (c_set_tensorNd(%(output_grad)s, %(output_grad)s->nd, %(output_grad_desc)s) != 0)
%(fail)s
{
const float alpha = 1;
const float beta = 0;
......@@ -1687,51 +1658,28 @@ _handle,
%(output_grad_desc)s, CudaNdarray_DEV_DATA(%(output_grad)s)
);
}
#endif
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPoolGrad: error doing operation: %%s. "
"input.shape=(%%d, %%d, %%d, %%d) "
"input_grad.shape=(%%d, %%d, %%d, %%d) "
"output.shape=(%%d, %%d, %%d, %%d) "
"output_grad.shape=(%%d, %%d, %%d, %%d)",
cudnnGetErrorString(err%(name)s),
CudaNdarray_HOST_DIMS(%(input)s)[0],
CudaNdarray_HOST_DIMS(%(input)s)[1],
CudaNdarray_HOST_DIMS(%(input)s)[2],
CudaNdarray_HOST_DIMS(%(input)s)[3],
CudaNdarray_HOST_DIMS(%(input_grad)s)[0],
CudaNdarray_HOST_DIMS(%(input_grad)s)[1],
CudaNdarray_HOST_DIMS(%(input_grad)s)[2],
CudaNdarray_HOST_DIMS(%(input_grad)s)[3],
CudaNdarray_HOST_DIMS(%(output)s)[0],
CudaNdarray_HOST_DIMS(%(output)s)[1],
CudaNdarray_HOST_DIMS(%(output)s)[2],
CudaNdarray_HOST_DIMS(%(output)s)[3],
CudaNdarray_HOST_DIMS(%(output_grad)s)[0],
CudaNdarray_HOST_DIMS(%(output_grad)s)[1],
CudaNdarray_HOST_DIMS(%(output_grad)s)[2],
CudaNdarray_HOST_DIMS(%(output_grad)s)[3]
);
%(fail)s
"GpuDnnPoolGrad: error doing operation: %%s.",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(output_grad=out_grad, desc=desc,
fail=sub['fail'],
name=name, set_in=set_in,
set_out=set_out, input=inp, input_grad=inp_grad, output=out,
fail=sub['fail'], name=name,
input=inp, input_grad=inp_grad, output=out,
input_desc="input"+name,
input_grad_desc="input_grad"+name,
output_desc="output"+name,
output_grad_desc="output_grad"+name)
def c_code_cache_version(self):
return (5, version())
return (6, version())
def infer_shape(self, node, shape):
return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0), nd=2):
"""
GPU pooling using cuDNN from NVIDIA.
......@@ -1746,6 +1694,9 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
:param pad: (padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
:param nd: dimensions of pooling, can be 2 or 3 for 2d or 3d pooling
If set to 3 all other params (except mode) must have an extra
dimension to match. 3 is only available for cudnn v3
:warning: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
......@@ -1753,7 +1704,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
:note: This Op implements the ignore_border=True of max_pool_2d.
"""
img = gpu_contiguous(img)
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad, nd=nd)()
return GpuDnnPool()(img, desc)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论