Commit ef4c9e9b authored by Harm de Vries, committed by Frederic Bastien

New GPU backend: pooling ops now accept tensor variables (window size, stride, pad) instead of a precomputed pooling descriptor

Parent: fe0b1477
......@@ -141,6 +141,7 @@ dnn_available.msg = None
class DnnBase(COp):
"""
Creates a handle for cudnn and pulls in the cudnn libraries and headers.
......@@ -254,6 +255,7 @@ version.v = None
class GpuDnnConvDesc(COp):
"""
This Op builds a convolution descriptor for use in the other convolution
operations.
......@@ -387,6 +389,7 @@ def ensure_dt(val, default, name, dtype):
class GpuDnnConv(DnnBase):
"""
The forward convolution.
......@@ -554,6 +557,7 @@ class GpuDnnConv(DnnBase):
class GpuDnnConvGradW(DnnBase):
"""
The convolution gradient with respect to the weights.
......@@ -674,6 +678,7 @@ class GpuDnnConvGradW(DnnBase):
class GpuDnnConvGradI(DnnBase):
"""
The convolution gradient with respect to the inputs.
......@@ -942,6 +947,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
class GpuDnnPoolDesc(Op):
"""
This Op builds a pooling descriptor for use in the other
pooling operations.
......@@ -1060,69 +1066,89 @@ class GpuDnnPoolDesc(Op):
class GpuDnnPool(DnnBase):
"""
Pooling.
"""
Parameters
----------
img
The image 4d tensor.
desc
The pooling descriptor.
The image 4d or 5d tensor.
Parameters
----------
ws : tensor variable
Window size.
stride : tensor variable
(dx, dy) or (dx, dy, dz).
mode : {'max', 'average_inc_pad', 'average_exc_pad'}
The old deprecated name 'average' corresponds to 'average_inc_pad'.
pad : tensor
(padX, padY) or (padX, padY, padZ)
"""
__props__ = ()
__props__ = ('mode',)
def __init__(self):
def __init__(self, mode='max'):
DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
self.mode = mode
def make_node(self, img, desc):
img = as_gpuarray_variable(img, infer_context_name(img))
def get_op_params(self):
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if desc.owner is not None:
e_ndim = desc.owner.op.get_ndim() + 2
return [('MODE_FLAG', mode_flag)]
if img.type.ndim != e_ndim:
raise TypeError('img must be %dD tensor' % (e_ndim,))
def make_node(self, img, ws, stride, pad):
ctx_name = infer_context_name(img, ws, stride, pad)
img = as_gpuarray_variable(img, ctx_name)
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnPoolingDescriptor_t'):
raise TypeError('desc must be cudnnPoolingDescriptor_t')
ws = tensor.as_tensor_variable(ws)
stride = tensor.as_tensor_variable(stride)
pad = tensor.as_tensor_variable(pad)
assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
assert ws.type.ndim == 1
return Apply(self, [img, desc], [img.type()])
return Apply(self, [img, ws, stride, pad], [img.type()])
def infer_shape(self, node, shape):
desc = node.inputs[1].owner.op
w = desc.ws
s = desc.stride
p = desc.pad
w = node.inputs[1]
s = node.inputs[2]
p = node.inputs[3]
res = [shape[0][0], shape[0][1],
(shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1,
(shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1
]
if len(w) > 2:
if node.inputs[0].ndim == 5:
res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
return [res]
def grad(self, inp, grads):
img, desc = inp
img, ws, stride, pad = inp
grad, = grads
grad = gpu_contiguous(grad)
out = self(img, desc)
out = self(img, ws, stride, pad)
g_out = GpuDnnPoolGrad()(img, out, grad, desc)
g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
return g_out, theano.gradient.DisconnectedType()()
return g_out, theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()()
def connection_pattern(self, node):
# not connected to desc
return [[1], [0]]
# not connected to parameters
return [[1], [0], [0], [0]]
class GpuDnnPoolGrad(DnnBase):
"""
The pooling gradient.
......@@ -1134,40 +1160,56 @@ class GpuDnnPoolGrad(DnnBase):
The output of the pooling in the forward.
out_grad
Same size as out, but is the corresponding gradient information.
desc
The pooling descriptor.
ws : tensor variable
Window size.
stride : tensor variable
(dx, dy) or (dx, dy, dz).
mode : {'max', 'average_inc_pad', 'average_exc_pad'}
The old deprecated name 'average' corresponds to 'average_inc_pad'.
pad : tensor
(padX, padY) or (padX, padY, padZ)
"""
__props__ = ()
__props__ = ('mode',)
def __init__(self):
def __init__(self, mode='max'):
DnnBase.__init__(self, ["dnn_pool_grad.c"],
"APPLY_SPECIFIC(dnn_pool_grad)")
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
def get_op_params(self):
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
def make_node(self, inp, out, out_grad, desc):
return [('MODE_FLAG', mode_flag)]
def make_node(self, inp, out, out_grad, ws, stride, pad):
ctx_name = infer_context_name(inp, out, out_grad)
inp = as_gpuarray_variable(inp, ctx_name)
assert (inp.ndim in [4, 5])
out_grad = as_gpuarray_variable(out_grad, ctx_name)
assert (out_grad.ndim in [4, 5])
out = as_gpuarray_variable(out, ctx_name)
assert(out.ndim in [4, 5])
if desc.owner is not None:
nd = desc.owner.op.get_ndim() + 2
if inp.type.ndim != nd:
raise TypeError('inp must be %dD tensor' % (nd,))
assert (out_grad.ndim == inp.ndim)
assert (inp.ndim == out.ndim)
if out_grad.type.ndim != nd:
raise TypeError('out_grad must be %dD tensor' % (nd,))
ws = tensor.as_tensor_variable(ws)
stride = tensor.as_tensor_variable(stride)
pad = tensor.as_tensor_variable(pad)
assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
assert ws.type.ndim == 1
if out.type.ndim != nd:
raise TypeError('out must be %dD tensor' % (nd,))
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnPoolingDescriptor_t'):
raise TypeError('desc must be cudnnPoolingDescriptor_t')
return Apply(self, [inp, out, out_grad, desc], [inp.type()])
return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])
def infer_shape(self, node, shape):
return [shape[0]]
......@@ -1205,11 +1247,11 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
"""
img = gpu_contiguous(img)
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
return GpuDnnPool()(img, desc)
return GpuDnnPool(mode=mode)(img, ws, stride, pad)
class GpuDnnSoftmaxBase(DnnBase):
"""
Op for the cuDNN Softmax.
......@@ -1262,6 +1304,7 @@ class GpuDnnSoftmaxBase(DnnBase):
class GpuDnnSoftmax(GpuDnnSoftmaxBase):
"""
Op for the cuDNN Softmax.
......@@ -1295,6 +1338,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
"""
Op for the cuDNN SoftmaxGrad.
......@@ -1466,11 +1510,12 @@ def local_pool_dnn_grad_stride(node, ctx_name):
pad = node.op.padding
mode = node.op.mode
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
return GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(out_grad),
desc)
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(out_grad),
ds,
st,
pad)
@register_opt('cudnn')
......@@ -1490,11 +1535,10 @@ def local_avg_pool_dnn_grad_stride(node, ctx_name):
cg = gpu_contiguous(out_grad)
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
# We reuse cg because CuDNN does not use the value of the `out`
# argument but still checks its shape for average pooling. This
# has been observed in v2 and v3 as far as I know.
return GpuDnnPoolGrad()(gpu_contiguous(inp), cg, cg, desc)
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), cg, cg, ds, st, pad)
@register_opt('cudnn')
......@@ -1547,6 +1591,7 @@ def local_logsoftmax_to_dnn(node, ctx_name):
class NoCuDNNRaise(Optimizer):
def apply(self, fgraph):
"""
Raise a error if cudnn can't be used.
......
......@@ -2,12 +2,15 @@
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(pool) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
......@@ -19,16 +22,25 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
"(pool): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
#section support_code_struct
int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cudnnPoolingDescriptor_t desc,
PyArrayObject *ws,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnStatus_t err;
......@@ -46,14 +58,21 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
int w[3];
int p[3];
int s[3];
int ndims;
int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
for(int i = 0; i < ndims; i++) {
w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
}
for(int i = 0; i < ndims; i++) {
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
}
for(int i = 0; i < ndims; i++) {
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
err = cudnnGetPoolingNdDescriptor(desc, 3, &mode, &ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error doing cudnnGetPoolingDescriptor operation: %s",
cudnnGetErrorString(err));
return 1;
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
}
dims[0] = PyGpuArray_DIM(img, 0);
......@@ -98,7 +117,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward(
APPLY_SPECIFIC(_handle), desc,
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta,
......
......@@ -4,6 +4,7 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);
cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
#section init_code_struct
......@@ -11,6 +12,7 @@ APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(input_grad) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(output_grad) = NULL;
APPLY_SPECIFIC(pool) = NULL;
{
cudnnStatus_t err;
......@@ -38,6 +40,11 @@ APPLY_SPECIFIC(output_grad) = NULL;
cudnnGetErrorString(err));
FAIL;
}
if ((err = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
"(pool): %s", cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
......@@ -46,13 +53,16 @@ if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC
if (APPLY_SPECIFIC(input_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(output_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad)); }
if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
#section support_code_struct
int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyGpuArrayObject *out,
PyGpuArrayObject *out_grad,
cudnnPoolingDescriptor_t desc,
PyArrayObject *ws,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **inp_grad,
PyGpuContextObject *c) {
cudnnStatus_t err;
......@@ -85,6 +95,26 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
return 1;
}
int w[3];
int p[3];
int s[3];
int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
for(int i = 0; i < ndims; i++) {
w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
}
for(int i = 0; i < ndims; i++) {
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
}
for(int i = 0; i < ndims; i++) {
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
}
if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
return 1;
......@@ -118,7 +148,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward(
APPLY_SPECIFIC(_handle), desc,
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
......
......@@ -339,6 +339,7 @@ def test_dnn_tag():
class TestDnnInferShapes(utt.InferShapeTester):
def setUp(self):
super(TestDnnInferShapes, self).setUp()
self.mode = mode_with_gpu
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment