提交 5c172018 authored 作者: Harm de Vries's avatar Harm de Vries

gpu dnn pool takes tensor variables

上级 50e06772
...@@ -1351,47 +1351,55 @@ class GpuDnnPoolDesc(GpuOp): ...@@ -1351,47 +1351,55 @@ class GpuDnnPoolDesc(GpuOp):
class GpuDnnPool(DnnBase): class GpuDnnPool(DnnBase):
""" """
Pooling. Pooling.
Parameters Parameters
---------- ----------
img img
The image 4d or 5d tensor. The image 4d or 5d tensor.
desc ws
The pooling descriptor. Windows size.
stride
(dx, dy).
mode : {'max', 'average_inc_pad', 'average_exc_pad'}
The old deprecated name 'average' correspond to 'average_inc_pad'.
pad
(padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
""" """
__props__ = () __props__ = ("mode",)
def __init__(self, mode='max'):
super(GpuDnnPool, self).__init__()
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
def make_node(self, img, desc): def make_node(self, img, ws, stride, pad):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
if not isinstance(desc.type, CDataType) \ assert (img.ndim in [4, 5])
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t') ws = tensor.as_tensor_variable(ws)
stride = tensor.as_tensor_variable(stride)
if desc.owner is not None: pad = tensor.as_tensor_variable(pad)
dop = desc.owner.op assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
e_ndim = dop.get_ndim() + 2 # 4 or 5 assert ws.type.ndim == 1
if img.type.ndim != e_ndim: return Apply(self, [img, ws, stride, pad], [img.type()])
raise TypeError('img must be %dD tensor' % e_ndim)
return Apply(self, [img, desc], [img.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
if not node.inputs[1].owner: w = node.inputs[1]
raise theano.tensor.ShapeError() s = node.inputs[2]
desc = node.inputs[1].owner.op p = node.inputs[3]
nd = desc.get_ndim()
w = desc.ws
s = desc.stride
p = desc.pad
ret = [shape[0][0], shape[0][1], ret = [shape[0][0], shape[0][1],
(shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1, (shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1,
(shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1] (shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1]
if nd == 3: if node.inputs[0].ndim == 5:
ret.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1) ret.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
return [ret] return [ret]
...@@ -1399,6 +1407,7 @@ class GpuDnnPool(DnnBase): ...@@ -1399,6 +1407,7 @@ class GpuDnnPool(DnnBase):
return """ return """
cudnnTensorDescriptor_t input%(name)s; cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t output%(name)s; cudnnTensorDescriptor_t output%(name)s;
cudnnPoolingDescriptor_t pool%(name)s;
""" % dict(name=name) """ % dict(name=name)
def c_init_code_struct(self, node, name, sub): def c_init_code_struct(self, node, name, sub):
...@@ -1406,6 +1415,7 @@ cudnnTensorDescriptor_t output%(name)s; ...@@ -1406,6 +1415,7 @@ cudnnTensorDescriptor_t output%(name)s;
cudnnStatus_t err%(name)s; cudnnStatus_t err%(name)s;
input%(name)s = NULL; input%(name)s = NULL;
output%(name)s = NULL; output%(name)s = NULL;
pool%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) { if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %%s", cudnnGetErrorString(err%(name)s)); "(inp): %%s", cudnnGetErrorString(err%(name)s));
...@@ -1416,20 +1426,41 @@ if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS ...@@ -1416,20 +1426,41 @@ if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS
"(out): %%s", cudnnGetErrorString(err%(name)s)); "(out): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
if ((err%(name)s = cudnnCreatePoolingDescriptor(&pool%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate pooling "
"descriptor: %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail']) """ % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name): def c_cleanup_code_struct(self, node, name):
return """ return """
if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); } if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); } if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); }
""" % dict(name=name) """ % dict(name=name)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
desc = inputs[1] ws = inputs[1]
stride = inputs[2]
pad = inputs[3]
out, = outputs out, = outputs
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else:
raise NotImplementedError("Unsupported pooling model.")
return """ return """
cudnnStatus_t err%(name)s; fprintf(stderr, "test_forward\\n");
cudnnStatus_t err;
int %(out)s_dims[5]; int %(out)s_dims[5];
...@@ -1441,31 +1472,36 @@ if (!CudaNdarray_is_c_contiguous(%(input)s)) { ...@@ -1441,31 +1472,36 @@ if (!CudaNdarray_is_c_contiguous(%(input)s)) {
if (c_set_tensorNd(%(input)s, %(input_desc)s) != 0) if (c_set_tensorNd(%(input)s, %(input_desc)s) != 0)
%(fail)s %(fail)s
cudnnPoolingMode_t mode; int win[%(nd)d];
int win[3]; int pad[%(nd)d];
int pad[3]; int str[%(nd)d];
int str[3]; for(int i = 0; i < %(nd)d; i++) {
int ndims; win[i] = *((npy_intp*)PyArray_GETPTR1(%(ws)s, i));
err%(name)s = cudnnGetPoolingNdDescriptor( }
%(desc)s, 3, for(int i = 0; i < %(nd)d; i++) {
&mode, &ndims, pad[i] = *((npy_intp*)PyArray_GETPTR1(%(pad)s, i));
win, pad, str); }
for(int i = 0; i < %(nd)d; i++) {
if (err%(name)s != CUDNN_STATUS_SUCCESS) { str[i] = *((npy_intp*)PyArray_GETPTR1(%(str)s, i));
PyErr_Format(PyExc_RuntimeError, }
"GpuDnnPool: error doing cudnnGetPoolingNdDescriptor operation: %%s", err = cudnnSetPoolingNdDescriptor(
cudnnGetErrorString(err%(name)s)); pool%(name)s, %(mode_flag)s, %(nd)d,
%(fail)s win, pad, str);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err));
%(fail)s
} }
%(out)s_dims[0] = CudaNdarray_HOST_DIMS(%(input)s)[0]; %(out)s_dims[0] = CudaNdarray_HOST_DIMS(%(input)s)[0];
%(out)s_dims[1] = CudaNdarray_HOST_DIMS(%(input)s)[1]; %(out)s_dims[1] = CudaNdarray_HOST_DIMS(%(input)s)[1];
%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] + (pad[0]*2) - win[0]) / str[0] + 1; %(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] + (pad[0]*2) - win[0]) / str[0] + 1;
%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] + (pad[1]*2) - win[1]) / str[1] + 1; %(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] + (pad[1]*2) - win[1]) / str[1] + 1;
if (ndims == 3) if (%(nd)s == 3)
%(out)s_dims[4] = (CudaNdarray_HOST_DIMS(%(input)s)[4] + (pad[2]*2) - win[2]) / str[2] + 1; %(out)s_dims[4] = (CudaNdarray_HOST_DIMS(%(input)s)[4] + (pad[2]*2) - win[2]) / str[2] + 1;
if (CudaNdarray_prep_output(&%(out)s, ndims+2, %(out)s_dims) != 0) if (CudaNdarray_prep_output(&%(out)s, %(nd)s+2, %(out)s_dims) != 0)
{ {
%(fail)s %(fail)s
} }
...@@ -1476,44 +1512,46 @@ if (c_set_tensorNd(%(out)s, %(output_desc)s) != 0) ...@@ -1476,44 +1512,46 @@ if (c_set_tensorNd(%(out)s, %(output_desc)s) != 0)
{ {
const float alpha = 1; const float alpha = 1;
const float beta = 0; const float beta = 0;
err%(name)s = cudnnPoolingForward( err = cudnnPoolingForward(
_handle, _handle,
%(desc)s, pool%(name)s,
&alpha, &alpha,
%(input_desc)s, CudaNdarray_DEV_DATA(%(input)s), %(input_desc)s, CudaNdarray_DEV_DATA(%(input)s),
&beta, &beta,
%(output_desc)s, CudaNdarray_DEV_DATA(%(out)s) %(output_desc)s, CudaNdarray_DEV_DATA(%(out)s)
); );
} }
if (err%(name)s != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnPoolingForward operation: %%s", "GpuDnnPool: error doing cudnnPoolingForward operation: %%s",
cudnnGetErrorString(err%(name)s)); cudnnGetErrorString(err));
%(fail)s %(fail)s
} }
""" % dict(out=out, desc=desc, fail=sub['fail'], """ % dict(out=out, fail=sub['fail'],
name=name, input=inputs[0], name=name, input=inputs[0],
input_desc="input" + name, ws=ws, pad=pad, str=stride,
output_desc="output" + name) nd=node.inputs[0].ndim-2, input_desc="input"+name,
output_desc="output"+name,
mode_flag=mode_flag)
def grad(self, inp, grads): def grad(self, inp, grads):
img, desc = inp img, ws, stride, pad = inp
grad, = grads grad, = grads
grad = gpu_contiguous(grad) grad = gpu_contiguous(grad)
out = self(img, desc) out = self(img, ws, stride, pad)
g_out = GpuDnnPoolGrad()(img, out, grad, desc) g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
return g_out, theano.gradient.DisconnectedType()() return g_out, theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()()
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [0]] return [[1], [0], [0], [0]]
def c_code_cache_version(self): #def c_code_cache_version(self):
return (7, version()) # return (8, version())
class GpuDnnPoolGrad(DnnBase): class GpuDnnPoolGrad(DnnBase):
...@@ -1528,35 +1566,42 @@ class GpuDnnPoolGrad(DnnBase): ...@@ -1528,35 +1566,42 @@ class GpuDnnPoolGrad(DnnBase):
The output of the pooling in the forward. The output of the pooling in the forward.
inp_grad inp_grad
Same size as out, but is the corresponding gradient information. Same size as out, but is the corresponding gradient information.
desc ws
The pooling descriptor. Windows size.
stride
(dx, dy).
mode : {'max', 'average_inc_pad', 'average_exc_pad'}
The old deprecated name 'average' correspond to 'average_inc_pad'.
pad
(padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
""" """
__props__ = () __props__ = ('mode',)
def make_node(self, inp, out, inp_grad, desc): def __init__(self, mode='max'):
if not isinstance(desc.type, CDataType) \ super(GpuDnnPoolGrad, self).__init__()
or desc.type.ctype != 'cudnnPoolingDescriptor_t': if mode == 'average':
raise TypeError('desc must be cudnnPoolingDescriptor_t') mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
def make_node(self, inp, out, inp_grad, ws, stride, pad):
inp = as_cuda_ndarray_variable(inp) inp = as_cuda_ndarray_variable(inp)
assert (inp.ndim in [4, 5])
inp_grad = as_cuda_ndarray_variable(inp_grad) inp_grad = as_cuda_ndarray_variable(inp_grad)
assert (inp_grad.ndim in [4, 5])
out = as_cuda_ndarray_variable(out) out = as_cuda_ndarray_variable(out)
assert(out.ndim in [4, 5])
if desc.owner is not None:
nd = desc.owner.op.get_ndim() + 2 # 4 or 5 ws = tensor.as_tensor_variable(ws)
stride = tensor.as_tensor_variable(stride)
if inp.type.ndim != nd: pad = tensor.as_tensor_variable(pad)
raise TypeError('inp must be %dD tensor' % (nd,)) assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
assert ws.type.ndim == 1
if inp_grad.type.ndim != nd:
raise TypeError('inp_grad must be %dD tensor' % (nd,)) return Apply(self, [inp, out, inp_grad, ws, stride, pad],
if out.type.ndim != nd:
raise TypeError('out must be %dD tensor' % (nd,))
return Apply(self, [inp, out, inp_grad, desc],
[inp.type()]) [inp.type()])
def c_support_code_struct(self, node, name): def c_support_code_struct(self, node, name):
...@@ -1565,6 +1610,7 @@ cudnnTensorDescriptor_t input%(name)s; ...@@ -1565,6 +1610,7 @@ cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t input_grad%(name)s; cudnnTensorDescriptor_t input_grad%(name)s;
cudnnTensorDescriptor_t output%(name)s; cudnnTensorDescriptor_t output%(name)s;
cudnnTensorDescriptor_t output_grad%(name)s; cudnnTensorDescriptor_t output_grad%(name)s;
cudnnPoolingDescriptor_t pool%(name)s;
""" % dict(name=name) """ % dict(name=name)
def c_init_code_struct(self, node, name, sub): def c_init_code_struct(self, node, name, sub):
...@@ -1574,6 +1620,7 @@ input%(name)s = NULL; ...@@ -1574,6 +1620,7 @@ input%(name)s = NULL;
input_grad%(name)s = NULL; input_grad%(name)s = NULL;
output%(name)s = NULL; output%(name)s = NULL;
output_grad%(name)s = NULL; output_grad%(name)s = NULL;
pool%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) { if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor " "GpuDnnPoolGrad: could not allocate tensor4d descriptor "
...@@ -1598,6 +1645,12 @@ if ((err%(name)s = cudnnCreateTensorDescriptor(&output_grad%(name)s)) != CUDNN_S ...@@ -1598,6 +1645,12 @@ if ((err%(name)s = cudnnCreateTensorDescriptor(&output_grad%(name)s)) != CUDNN_S
"(output_grad): %%s", cudnnGetErrorString(err%(name)s)); "(output_grad): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
if ((err%(name)s = cudnnCreatePoolingDescriptor(&pool%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate pooling descriptor "
"(pool): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail']) """ % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name): def c_cleanup_code_struct(self, node, name):
...@@ -1606,17 +1659,35 @@ if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); } ...@@ -1606,17 +1659,35 @@ if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
if (input_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(input_grad%(name)s); } if (input_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(input_grad%(name)s); }
if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); } if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
if (output_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(output_grad%(name)s); } if (output_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(output_grad%(name)s); }
if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); }
""" % dict(name=name) """ % dict(name=name)
# def perform(self, node, inputs_storage, output_storage):
# output_storage[0][0] = inputs_storage[0].copy()
# return
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
# raise NotImplementedError()
# Here the name out and inp are based on the cudnn definition. # Here the name out and inp are based on the cudnn definition.
# Not the definition of this class. # Not the definition of this class.
# This make it complicated. # This make it complicated.
out, inp, inp_grad, desc = inputs out, inp, inp_grad, ws, stride, pad = inputs
out_grad, = outputs out_grad, = outputs
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else:
raise NotImplementedError("Unsupported pooling model.")
print mode_flag
return """ return """
cudnnStatus_t err%(name)s; cudnnStatus_t err%(name)s;
//raise(SIGINT);
if (!CudaNdarray_is_c_contiguous(%(input)s)) { if (!CudaNdarray_is_c_contiguous(%(input)s)) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -1650,16 +1721,27 @@ if (CudaNdarray_prep_output(&%(output_grad)s, ...@@ -1650,16 +1721,27 @@ if (CudaNdarray_prep_output(&%(output_grad)s,
%(fail)s %(fail)s
} }
// Get the pooling_mode to be used. Variable 'tmp' is used because we don't
// care about the other outputs of the function int win[%(nd)d];
cudnnPoolingMode_t pooling_mode; int pad[%(nd)d];
int tmp; int str[%(nd)d];
err%(name)s = cudnnGetPoolingNdDescriptor(%(desc)s, 0, &pooling_mode, &tmp, for(int i = 0; i < %(nd)d; i++) {
&tmp, &tmp, &tmp); win[i] = *((npy_intp*)PyArray_GETPTR1(%(ws)s, i));
}
for(int i = 0; i < %(nd)d; i++) {
pad[i] = *((npy_intp*)PyArray_GETPTR1(%(pad)s, i));
}
for(int i = 0; i < %(nd)d; i++) {
str[i] = *((npy_intp*)PyArray_GETPTR1(%(str)s, i));
}
err%(name)s = cudnnSetPoolingNdDescriptor(
pool%(name)s, %(mode_flag)s, %(nd)d,
win, pad, str);
if (err%(name)s != CUDNN_STATUS_SUCCESS) { if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
"GpuDnnPoolGrad: could not obtain pooling mode"); cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
if (c_set_tensorNd(%(output_grad)s, %(output_grad_desc)s) != 0) if (c_set_tensorNd(%(output_grad)s, %(output_grad_desc)s) != 0)
...@@ -1670,7 +1752,7 @@ const float alpha = 1; ...@@ -1670,7 +1752,7 @@ const float alpha = 1;
const float beta = 0; const float beta = 0;
err%(name)s = cudnnPoolingBackward( err%(name)s = cudnnPoolingBackward(
_handle, _handle,
%(desc)s, pool%(name)s,
&alpha, &alpha,
%(input_desc)s, CudaNdarray_DEV_DATA(%(input)s), %(input_desc)s, CudaNdarray_DEV_DATA(%(input)s),
%(input_grad_desc)s, CudaNdarray_DEV_DATA(%(input_grad)s), %(input_grad_desc)s, CudaNdarray_DEV_DATA(%(input_grad)s),
...@@ -1685,16 +1767,19 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) { ...@@ -1685,16 +1767,19 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
cudnnGetErrorString(err%(name)s)); cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
""" % dict(output_grad=out_grad, desc=desc, """ % dict(output_grad=out_grad,
fail=sub['fail'], name=name, fail=sub['fail'], name=name,
input=inp, input_grad=inp_grad, output=out, input=inp, input_grad=inp_grad, output=out,
input_desc="input" + name, input_desc="input"+name,
input_grad_desc="input_grad" + name, input_grad_desc="input_grad"+name,
output_desc="output" + name, output_desc="output"+name,
output_grad_desc="output_grad" + name) output_grad_desc="output_grad"+name,
mode_flag=mode_flag, nd=node.inputs[0].ndim - 2,
ws=ws, pad=pad, str=stride)
def c_code_cache_version(self): def c_code_cache_version(self):
return (7, version()) return
#return (7, version())
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0]] return [shape[0]]
...@@ -1716,7 +1801,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)): ...@@ -1716,7 +1801,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
stride stride
Subsampling stride (default: (1, 1)). Subsampling stride (default: (1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad} mode : {'max', 'average_inc_pad', 'average_exc_pad}
pad pad :
(pad_h, pad_w) padding information. (pad_h, pad_w) padding information.
pad_h is the number of zero-valued pixels added to each of the top and pad_h is the number of zero-valued pixels added to each of the top and
bottom borders. bottom borders.
...@@ -1733,8 +1818,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)): ...@@ -1733,8 +1818,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
""" """
img = gpu_contiguous(img) img = gpu_contiguous(img)
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)() return GpuDnnPool(mode=mode)(img, ws, stride, pad)
return GpuDnnPool()(img, desc)
class GpuDnnSoftmaxBase(DnnBase): class GpuDnnSoftmaxBase(DnnBase):
...@@ -2212,12 +2296,11 @@ if True: ...@@ -2212,12 +2296,11 @@ if True:
return return
inp, out, inp_grad = node.inputs inp, out, inp_grad = node.inputs
ds = node.op.ds ds = node.op.ds
desc = GpuDnnPoolDesc(ws=ds, stride=ds, mode="max")() return [GpuDnnPoolGrad(mode='max')(gpu_contiguous(inp),
return [GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out), gpu_contiguous(out),
gpu_contiguous(inp_grad), gpu_contiguous(inp_grad),
desc)] ds, ds, (0, 0))]
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([MaxPoolGrad]) @local_optimizer([MaxPoolGrad])
...@@ -2237,11 +2320,11 @@ if True: ...@@ -2237,11 +2320,11 @@ if True:
(out.owner and isinstance(out.owner.op, HostFromGpu)) or (out.owner and isinstance(out.owner.op, HostFromGpu)) or
(inp_grad.owner and isinstance(inp_grad.owner.op, (inp_grad.owner and isinstance(inp_grad.owner.op,
HostFromGpu))): HostFromGpu))):
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
ret = GpuDnnPoolGrad()(gpu_contiguous(inp), ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out), gpu_contiguous(out),
gpu_contiguous(inp_grad), gpu_contiguous(inp_grad),
desc) ds, st, pad)
return [host_from_gpu(ret)] return [host_from_gpu(ret)]
@register_opt('cudnn') @register_opt('cudnn')
...@@ -2261,14 +2344,14 @@ if True: ...@@ -2261,14 +2344,14 @@ if True:
if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
(inp_grad.owner and isinstance(inp_grad.owner.op, (inp_grad.owner and isinstance(inp_grad.owner.op,
HostFromGpu))): HostFromGpu))):
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
contiguous_inp_grad = gpu_contiguous(inp_grad) contiguous_inp_grad = gpu_contiguous(inp_grad)
ret = GpuDnnPoolGrad()(gpu_contiguous(inp), ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
contiguous_inp_grad, contiguous_inp_grad,
contiguous_inp_grad, contiguous_inp_grad,
desc) ds, st, pad)
return [host_from_gpu(ret)] return [host_from_gpu(ret)]
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([GpuSoftmax]) @local_optimizer([GpuSoftmax])
def local_softmax_dnn(node): def local_softmax_dnn(node):
......
...@@ -240,10 +240,11 @@ def test_pooling(): ...@@ -240,10 +240,11 @@ def test_pooling():
modes = ('max', 'average_inc_pad') modes = ('max', 'average_inc_pad')
else: else:
modes = ('max', 'average_inc_pad', 'average_exc_pad') modes = ('max', 'average_inc_pad', 'average_exc_pad')
x = T.ftensor4() x = T.ftensor4()
for mode, pad in product(modes, for mode, pad in product(modes,
((0, 0), (1, 0), (1, 0), (2, 3), (3, 2))): ((0, 0), (1, 0), (1, 0), (2, 3), (3, 2))):
if mode == 'max': if mode == 'max':
func = T.max func = T.max
else: else:
...@@ -285,22 +286,23 @@ def test_pooling(): ...@@ -285,22 +286,23 @@ def test_pooling():
a = f1(data).__array__() a = f1(data).__array__()
b = f2(data).__array__() b = f2(data).__array__()
utt.assert_allclose(a, b) assert numpy.allclose(a, b,
atol=numpy.finfo(numpy.float32).eps)
# Test the grad # Test the grad
for shp in [(1, 1, 2, 2), for shp in [(1, 1, 2, 2),
(1, 1, 3, 3)]: (1, 1, 3, 3)]:
data = numpy.random.normal(0, 1, shp).astype("float32") * 10 data = numpy.random.normal(0, 1, shp).astype("float32") * 10
ws = 2 ws = theano.shared(numpy.array([2, 2]))
stride = 2 stride = theano.shared(numpy.array([1, 1]))
if pad[0] > stride or pad[1] > stride: if pad[0] > 1 or pad[1] > 1:
# Not implemented # Not implemented
continue continue
pad_ = theano.shared(numpy.array(pad))
# This test the CPU grad + opt + GPU implemtentation ## This test the CPU grad + opt + GPU implemtentation
def fn(x): def fn(x):
return pool_2d(x, (ws, ws), ignore_border=True, return pool_2d(x, (2, 2), ignore_border=True,
padding=pad, mode=mode) padding=pad, mode=mode)
theano.tests.unittest_tools.verify_grad(fn, [data], theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False, cast_to_output_type=False,
...@@ -310,15 +312,16 @@ def test_pooling(): ...@@ -310,15 +312,16 @@ def test_pooling():
mode=mode_with_gpu) mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad) assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()]) for node in fg.maker.fgraph.toposort()])
# Test the GPU grad + GPU implementation # Test the GPU grad + GPU implementation
def fn(x): def fn(x):
dnn_op = cuda.dnn.dnn_pool( dnn_op = cuda.dnn.dnn_pool(
x, ws=(ws, ws), x, ws=ws,
stride=(stride, stride), stride=stride,
pad=pad, pad=pad_,
mode=mode) mode=mode)
return dnn_op return dnn_op
theano.tests.unittest_tools.verify_grad( theano.tests.unittest_tools.verify_grad(
fn, [data], fn, [data],
cast_to_output_type=False, cast_to_output_type=False,
...@@ -331,9 +334,10 @@ def test_pooling(): ...@@ -331,9 +334,10 @@ def test_pooling():
g_out = fg(data) g_out = fg(data)
# Compare again the CPU result # Compare again the CPU result
out = pool_2d(x, (ws, ws), out = pool_2d(x, (2, 2), st=(1, 1),
padding=pad, padding=pad,
ignore_border=True, mode=mode) ignore_border=True, mode=mode)
fc = theano.function([x], theano.grad(out.sum(), x), fc = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu) mode=mode_without_gpu)
if mode == 'max': if mode == 'max':
...@@ -343,7 +347,7 @@ def test_pooling(): ...@@ -343,7 +347,7 @@ def test_pooling():
assert any([isinstance(node.op, AveragePoolGrad) assert any([isinstance(node.op, AveragePoolGrad)
for node in fc.maker.fgraph.toposort()]) for node in fc.maker.fgraph.toposort()])
c_out = fc(data) c_out = fc(data)
utt.assert_allclose(c_out, g_out) assert numpy.allclose(c_out, g_out)
def test_pooling3d(): def test_pooling3d():
...@@ -999,14 +1003,9 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -999,14 +1003,9 @@ class TestDnnInferShapes(utt.InferShapeTester):
[(1, 1), (2, 2), (3, 3)], [(1, 1), (2, 2), (3, 3)],
modes modes
): ):
desc = dnn.GpuDnnPoolDesc(
ws=params[0],
stride=params[1],
mode=params[2]
)()
self._compile_and_check( self._compile_and_check(
[img], [img],
[dnn.GpuDnnPool()(img, desc)], [dnn.GpuDnnPool(mode=params[2])(img, params[0], params[1], (0,0))],
[img_val], [img_val],
dnn.GpuDnnPool dnn.GpuDnnPool
) )
...@@ -1035,16 +1034,13 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -1035,16 +1034,13 @@ class TestDnnInferShapes(utt.InferShapeTester):
[(1, 1), (2, 2), (3, 3)], [(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad'] ['max', 'average_inc_pad']
): ):
desc = dnn.GpuDnnPoolDesc(
ws=params[0],
stride=params[1],
mode=params[2]
)()
pool_grad = dnn.GpuDnnPoolGrad()( pool_grad = dnn.GpuDnnPoolGrad()(
img, img,
out, out,
img_grad, img_grad,
desc params[0],
params[1],
(0, 0)
) )
self._compile_and_check( self._compile_and_check(
[img, img_grad, out], [img, img_grad, out],
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论