提交 8472d13a authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3965 from harmdevries89/gpudnnpool2

gpu dnn pool takes tensor variables
...@@ -1845,7 +1845,7 @@ class _Linker(gof.link.LocalLinker): ...@@ -1845,7 +1845,7 @@ class _Linker(gof.link.LocalLinker):
thunk.outputs = [storage_map[v] for v in node.outputs] thunk.outputs = [storage_map[v] for v in node.outputs]
thunk_other = thunk thunk_other = thunk
else: else:
new_node = node.op.prepare_node(node) new_node = node.op.prepare_node(node, storage_map, compute_map)
if new_node is not None: if new_node is not None:
node = new_node node = new_node
......
...@@ -836,7 +836,7 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -836,7 +836,7 @@ class Op(utils.object2, PureOp, CLinkerOp):
else: else:
return NotImplemented return NotImplemented
def prepare_node(self, node): def prepare_node(self, node, storage_map, compute_map):
""" """
Make any special modifications that the Op needs before doing Make any special modifications that the Op needs before doing
make_thunk(). make_thunk().
...@@ -959,7 +959,8 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -959,7 +959,8 @@ class Op(utils.object2, PureOp, CLinkerOp):
""" """
logger = logging.getLogger('theano.gof.op.Op') logger = logging.getLogger('theano.gof.op.Op')
new_node = self.prepare_node(node) new_node = self.prepare_node(node, storage_map=storage_map,
compute_map=compute_map)
if new_node is not None: if new_node is not None:
node = new_node node = new_node
...@@ -1218,7 +1219,8 @@ int main( int argc, const char* argv[] ) ...@@ -1218,7 +1219,8 @@ int main( int argc, const char* argv[] )
self.openmp = False self.openmp = False
theano.config.openmp = False theano.config.openmp = False
def prepare_node(self, node): def prepare_node(self, node, storage_map,
compute_map):
self.update_self_openmp() self.update_self_openmp()
......
...@@ -1953,7 +1953,7 @@ class GpuConv(GpuOp): ...@@ -1953,7 +1953,7 @@ class GpuConv(GpuOp):
images[2] * images[3] * 2) images[2] * images[3] * 2)
return flops return flops
def prepare_node(self, node): def prepare_node(self, node, storage_map, compute_map):
if node.op.max_threads_dim0 is None: if node.op.max_threads_dim0 is None:
cuda = theano.sandbox.cuda cuda = theano.sandbox.cuda
device_id = cuda.use.device_number device_id = cuda.use.device_number
......
...@@ -1367,40 +1367,78 @@ class GpuDnnPool(DnnBase): ...@@ -1367,40 +1367,78 @@ class GpuDnnPool(DnnBase):
---------- ----------
img img
The image 4d or 5d tensor. The image 4d or 5d tensor.
desc ws
The pooling descriptor. Windows size.
stride
(dx, dy).
mode : {'max', 'average_inc_pad', 'average_exc_pad'}
The old deprecated name 'average' correspond to 'average_inc_pad'.
pad
(padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
""" """
__props__ = () __props__ = ("mode",)
def make_node(self, img, desc): def __init__(self, mode='max'):
img = as_cuda_ndarray_variable(img) super(GpuDnnPool, self).__init__()
if not isinstance(desc.type, CDataType) \ if mode == 'average':
or desc.type.ctype != 'cudnnPoolingDescriptor_t': mode = 'average_inc_pad'
raise TypeError('desc must be cudnnPoolingDescriptor_t') assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
if desc.owner is not None: def prepare_node(self, node, storage_map, compute_map):
dop = desc.owner.op if len(node.inputs) == 2:
e_ndim = dop.get_ndim() + 2 # 4 or 5 warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3)
# Old interface
self.mode = node.inputs[1].owner.op.mode
ws = theano.tensor.constant(node.inputs[1].owner.op.ws)
st = theano.tensor.constant(node.inputs[1].owner.op.stride)
pad = theano.tensor.constant(node.inputs[1].owner.op.pad)
node.inputs[1] = ws
node.inputs.append(st)
node.inputs.append(pad)
if isinstance(ws, theano.Constant):
storage_map[ws] = [ws.data]
compute_map[ws] = [True]
else:
storage_map[ws] = [None]
compute_map[ws] = [False]
if isinstance(st, theano.Constant):
storage_map[st] = [st.data]
compute_map[st] = [True]
else:
storage_map[st] = [None]
compute_map[st] = [False]
if isinstance(pad, theano.Constant):
storage_map[pad] = [pad.data]
compute_map[pad] = [True]
else:
storage_map[pad] = [None]
compute_map[pad] = [False]
if img.type.ndim != e_ndim: def make_node(self, img, ws, stride, pad):
raise TypeError('img must be %dD tensor' % e_ndim) img = as_cuda_ndarray_variable(img)
assert (img.ndim in [4, 5])
return Apply(self, [img, desc], [img.type()]) ws = tensor.as_tensor_variable(ws)
stride = tensor.as_tensor_variable(stride)
pad = tensor.as_tensor_variable(pad)
assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
assert ws.type.ndim == 1
return Apply(self, [img, ws, stride, pad], [img.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
if not node.inputs[1].owner: w = node.inputs[1]
raise theano.tensor.ShapeError() s = node.inputs[2]
desc = node.inputs[1].owner.op p = node.inputs[3]
nd = desc.get_ndim()
w = desc.ws
s = desc.stride
p = desc.pad
ret = [shape[0][0], shape[0][1], ret = [shape[0][0], shape[0][1],
(shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1, (shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1,
(shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1] (shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1]
if nd == 3: if node.inputs[0].ndim == 5:
ret.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1) ret.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
return [ret] return [ret]
...@@ -1408,6 +1446,7 @@ class GpuDnnPool(DnnBase): ...@@ -1408,6 +1446,7 @@ class GpuDnnPool(DnnBase):
return """ return """
cudnnTensorDescriptor_t input%(name)s; cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t output%(name)s; cudnnTensorDescriptor_t output%(name)s;
cudnnPoolingDescriptor_t pool%(name)s;
""" % dict(name=name) """ % dict(name=name)
def c_init_code_struct(self, node, name, sub): def c_init_code_struct(self, node, name, sub):
...@@ -1415,6 +1454,7 @@ cudnnTensorDescriptor_t output%(name)s; ...@@ -1415,6 +1454,7 @@ cudnnTensorDescriptor_t output%(name)s;
cudnnStatus_t err%(name)s; cudnnStatus_t err%(name)s;
input%(name)s = NULL; input%(name)s = NULL;
output%(name)s = NULL; output%(name)s = NULL;
pool%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) { if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %%s", cudnnGetErrorString(err%(name)s)); "(inp): %%s", cudnnGetErrorString(err%(name)s));
...@@ -1425,20 +1465,40 @@ if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS ...@@ -1425,20 +1465,40 @@ if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS
"(out): %%s", cudnnGetErrorString(err%(name)s)); "(out): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
if ((err%(name)s = cudnnCreatePoolingDescriptor(&pool%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate pooling "
"descriptor: %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail']) """ % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name): def c_cleanup_code_struct(self, node, name):
return """ return """
if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); } if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); } if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); }
""" % dict(name=name) """ % dict(name=name)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
desc = inputs[1] ws = inputs[1]
stride = inputs[2]
pad = inputs[3]
out, = outputs out, = outputs
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else:
raise NotImplementedError("Unsupported pooling model.")
return """ return """
cudnnStatus_t err%(name)s; cudnnStatus_t err;
int %(out)s_dims[5]; int %(out)s_dims[5];
...@@ -1450,31 +1510,36 @@ if (!CudaNdarray_is_c_contiguous(%(input)s)) { ...@@ -1450,31 +1510,36 @@ if (!CudaNdarray_is_c_contiguous(%(input)s)) {
if (c_set_tensorNd(%(input)s, %(input_desc)s) != 0) if (c_set_tensorNd(%(input)s, %(input_desc)s) != 0)
%(fail)s %(fail)s
cudnnPoolingMode_t mode; int win[%(nd)d];
int win[3]; int pad[%(nd)d];
int pad[3]; int str[%(nd)d];
int str[3]; for(int i = 0; i < %(nd)d; i++) {
int ndims; win[i] = *((npy_intp*)PyArray_GETPTR1(%(ws)s, i));
err%(name)s = cudnnGetPoolingNdDescriptor( }
%(desc)s, 3, for(int i = 0; i < %(nd)d; i++) {
&mode, &ndims, pad[i] = *((npy_intp*)PyArray_GETPTR1(%(pad)s, i));
win, pad, str); }
for(int i = 0; i < %(nd)d; i++) {
if (err%(name)s != CUDNN_STATUS_SUCCESS) { str[i] = *((npy_intp*)PyArray_GETPTR1(%(str)s, i));
PyErr_Format(PyExc_RuntimeError, }
"GpuDnnPool: error doing cudnnGetPoolingNdDescriptor operation: %%s", err = cudnnSetPoolingNdDescriptor(
cudnnGetErrorString(err%(name)s)); pool%(name)s, %(mode_flag)s, %(nd)d,
%(fail)s win, pad, str);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err));
%(fail)s
} }
%(out)s_dims[0] = CudaNdarray_HOST_DIMS(%(input)s)[0]; %(out)s_dims[0] = CudaNdarray_HOST_DIMS(%(input)s)[0];
%(out)s_dims[1] = CudaNdarray_HOST_DIMS(%(input)s)[1]; %(out)s_dims[1] = CudaNdarray_HOST_DIMS(%(input)s)[1];
%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] + (pad[0]*2) - win[0]) / str[0] + 1; %(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] + (pad[0]*2) - win[0]) / str[0] + 1;
%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] + (pad[1]*2) - win[1]) / str[1] + 1; %(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] + (pad[1]*2) - win[1]) / str[1] + 1;
if (ndims == 3) if (%(nd)s == 3)
%(out)s_dims[4] = (CudaNdarray_HOST_DIMS(%(input)s)[4] + (pad[2]*2) - win[2]) / str[2] + 1; %(out)s_dims[4] = (CudaNdarray_HOST_DIMS(%(input)s)[4] + (pad[2]*2) - win[2]) / str[2] + 1;
if (CudaNdarray_prep_output(&%(out)s, ndims+2, %(out)s_dims) != 0) if (CudaNdarray_prep_output(&%(out)s, %(nd)s+2, %(out)s_dims) != 0)
{ {
%(fail)s %(fail)s
} }
...@@ -1485,44 +1550,46 @@ if (c_set_tensorNd(%(out)s, %(output_desc)s) != 0) ...@@ -1485,44 +1550,46 @@ if (c_set_tensorNd(%(out)s, %(output_desc)s) != 0)
{ {
const float alpha = 1; const float alpha = 1;
const float beta = 0; const float beta = 0;
err%(name)s = cudnnPoolingForward( err = cudnnPoolingForward(
_handle, _handle,
%(desc)s, pool%(name)s,
&alpha, &alpha,
%(input_desc)s, CudaNdarray_DEV_DATA(%(input)s), %(input_desc)s, CudaNdarray_DEV_DATA(%(input)s),
&beta, &beta,
%(output_desc)s, CudaNdarray_DEV_DATA(%(out)s) %(output_desc)s, CudaNdarray_DEV_DATA(%(out)s)
); );
} }
if (err%(name)s != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnPoolingForward operation: %%s", "GpuDnnPool: error doing cudnnPoolingForward operation: %%s",
cudnnGetErrorString(err%(name)s)); cudnnGetErrorString(err));
%(fail)s %(fail)s
} }
""" % dict(out=out, desc=desc, fail=sub['fail'], """ % dict(out=out, fail=sub['fail'],
name=name, input=inputs[0], name=name, input=inputs[0],
input_desc="input" + name, ws=ws, pad=pad, str=stride,
output_desc="output" + name) nd=node.inputs[0].ndim - 2, input_desc="input" + name,
output_desc="output" + name,
mode_flag=mode_flag)
def grad(self, inp, grads): def grad(self, inp, grads):
img, desc = inp img, ws, stride, pad = inp
grad, = grads grad, = grads
grad = gpu_contiguous(grad) grad = gpu_contiguous(grad)
out = self(img, desc) out = self(img, ws, stride, pad)
g_out = GpuDnnPoolGrad()(img, out, grad, desc) g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
return g_out, theano.gradient.DisconnectedType()() return g_out, theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()()
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [0]] return [[1], [0], [0], [0]]
def c_code_cache_version(self): def c_code_cache_version(self):
return (7, version()) return (8, version())
class GpuDnnPoolGrad(DnnBase): class GpuDnnPoolGrad(DnnBase):
...@@ -1537,35 +1604,75 @@ class GpuDnnPoolGrad(DnnBase): ...@@ -1537,35 +1604,75 @@ class GpuDnnPoolGrad(DnnBase):
The output of the pooling in the forward. The output of the pooling in the forward.
inp_grad inp_grad
Same size as out, but is the corresponding gradient information. Same size as out, but is the corresponding gradient information.
desc ws
The pooling descriptor. Windows size.
stride
(dx, dy).
mode : {'max', 'average_inc_pad', 'average_exc_pad'}
The old deprecated name 'average' correspond to 'average_inc_pad'.
pad
(padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
""" """
__props__ = () __props__ = ('mode',)
def make_node(self, inp, out, inp_grad, desc): def __init__(self, mode='max'):
if not isinstance(desc.type, CDataType) \ super(GpuDnnPoolGrad, self).__init__()
or desc.type.ctype != 'cudnnPoolingDescriptor_t': if mode == 'average':
raise TypeError('desc must be cudnnPoolingDescriptor_t') mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
def prepare_node(self, node, storage_map, compute_map):
if len(node.inputs) == 4:
warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3)
# Old interface
self.mode = node.inputs[3].owner.op.mode
ws = theano.tensor.constant(node.inputs[3].owner.op.ws)
st = theano.tensor.constant(node.inputs[3].owner.op.stride)
pad = theano.tensor.constant(node.inputs[3].owner.op.pad)
node.inputs[3] = ws
node.inputs.append(st)
node.inputs.append(pad)
if isinstance(ws, theano.Constant):
storage_map[ws] = [ws.data]
compute_map[ws] = [True]
else:
storage_map[ws] = [None]
compute_map[ws] = [False]
if isinstance(st, theano.Constant):
storage_map[st] = [st.data]
compute_map[st] = [True]
else:
storage_map[st] = [None]
compute_map[st] = [False]
if isinstance(pad, theano.Constant):
storage_map[pad] = [pad.data]
compute_map[pad] = [True]
else:
storage_map[pad] = [None]
compute_map[pad] = [False]
def make_node(self, inp, out, inp_grad, ws, stride, pad):
inp = as_cuda_ndarray_variable(inp) inp = as_cuda_ndarray_variable(inp)
assert (inp.ndim in [4, 5])
inp_grad = as_cuda_ndarray_variable(inp_grad) inp_grad = as_cuda_ndarray_variable(inp_grad)
assert (inp_grad.ndim in [4, 5])
out = as_cuda_ndarray_variable(out) out = as_cuda_ndarray_variable(out)
assert(out.ndim in [4, 5])
if desc.owner is not None: assert (inp_grad.ndim == inp.ndim)
nd = desc.owner.op.get_ndim() + 2 # 4 or 5 assert (inp.ndim == out.ndim)
if inp.type.ndim != nd:
raise TypeError('inp must be %dD tensor' % (nd,))
if inp_grad.type.ndim != nd:
raise TypeError('inp_grad must be %dD tensor' % (nd,))
if out.type.ndim != nd: ws = tensor.as_tensor_variable(ws)
raise TypeError('out must be %dD tensor' % (nd,)) stride = tensor.as_tensor_variable(stride)
pad = tensor.as_tensor_variable(pad)
assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
assert ws.type.ndim == 1
return Apply(self, [inp, out, inp_grad, desc], return Apply(self, [inp, out, inp_grad, ws, stride, pad],
[inp.type()]) [inp.type()])
def c_support_code_struct(self, node, name): def c_support_code_struct(self, node, name):
...@@ -1574,6 +1681,7 @@ cudnnTensorDescriptor_t input%(name)s; ...@@ -1574,6 +1681,7 @@ cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t input_grad%(name)s; cudnnTensorDescriptor_t input_grad%(name)s;
cudnnTensorDescriptor_t output%(name)s; cudnnTensorDescriptor_t output%(name)s;
cudnnTensorDescriptor_t output_grad%(name)s; cudnnTensorDescriptor_t output_grad%(name)s;
cudnnPoolingDescriptor_t pool%(name)s;
""" % dict(name=name) """ % dict(name=name)
def c_init_code_struct(self, node, name, sub): def c_init_code_struct(self, node, name, sub):
...@@ -1583,6 +1691,7 @@ input%(name)s = NULL; ...@@ -1583,6 +1691,7 @@ input%(name)s = NULL;
input_grad%(name)s = NULL; input_grad%(name)s = NULL;
output%(name)s = NULL; output%(name)s = NULL;
output_grad%(name)s = NULL; output_grad%(name)s = NULL;
pool%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) { if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor " "GpuDnnPoolGrad: could not allocate tensor4d descriptor "
...@@ -1607,6 +1716,12 @@ if ((err%(name)s = cudnnCreateTensorDescriptor(&output_grad%(name)s)) != CUDNN_S ...@@ -1607,6 +1716,12 @@ if ((err%(name)s = cudnnCreateTensorDescriptor(&output_grad%(name)s)) != CUDNN_S
"(output_grad): %%s", cudnnGetErrorString(err%(name)s)); "(output_grad): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
if ((err%(name)s = cudnnCreatePoolingDescriptor(&pool%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate pooling descriptor "
"(pool): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail']) """ % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name): def c_cleanup_code_struct(self, node, name):
...@@ -1615,15 +1730,28 @@ if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); } ...@@ -1615,15 +1730,28 @@ if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
if (input_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(input_grad%(name)s); } if (input_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(input_grad%(name)s); }
if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); } if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
if (output_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(output_grad%(name)s); } if (output_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(output_grad%(name)s); }
if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); }
""" % dict(name=name) """ % dict(name=name)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
# Here the name out and inp are based on the cudnn definition. # Here the name out and inp are based on the cudnn definition.
# Not the definition of this class. # Not the definition of this class.
# This make it complicated. # This make it complicated.
out, inp, inp_grad, desc = inputs out, inp, inp_grad, ws, stride, pad = inputs
out_grad, = outputs out_grad, = outputs
if self.mode == 'max':
mode_flag = 'CUDNN_POOLING_MAX'
elif self.mode == "average_inc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else:
raise NotImplementedError("Unsupported pooling model.")
return """ return """
cudnnStatus_t err%(name)s; cudnnStatus_t err%(name)s;
...@@ -1659,16 +1787,27 @@ if (CudaNdarray_prep_output(&%(output_grad)s, ...@@ -1659,16 +1787,27 @@ if (CudaNdarray_prep_output(&%(output_grad)s,
%(fail)s %(fail)s
} }
// Get the pooling_mode to be used. Variable 'tmp' is used because we don't
// care about the other outputs of the function int win[%(nd)d];
cudnnPoolingMode_t pooling_mode; int pad[%(nd)d];
int tmp; int str[%(nd)d];
err%(name)s = cudnnGetPoolingNdDescriptor(%(desc)s, 0, &pooling_mode, &tmp, for(int i = 0; i < %(nd)d; i++) {
&tmp, &tmp, &tmp); win[i] = *((npy_intp*)PyArray_GETPTR1(%(ws)s, i));
}
for(int i = 0; i < %(nd)d; i++) {
pad[i] = *((npy_intp*)PyArray_GETPTR1(%(pad)s, i));
}
for(int i = 0; i < %(nd)d; i++) {
str[i] = *((npy_intp*)PyArray_GETPTR1(%(str)s, i));
}
err%(name)s = cudnnSetPoolingNdDescriptor(
pool%(name)s, %(mode_flag)s, %(nd)d,
win, pad, str);
if (err%(name)s != CUDNN_STATUS_SUCCESS) { if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
"GpuDnnPoolGrad: could not obtain pooling mode"); cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
if (c_set_tensorNd(%(output_grad)s, %(output_grad_desc)s) != 0) if (c_set_tensorNd(%(output_grad)s, %(output_grad_desc)s) != 0)
...@@ -1679,7 +1818,7 @@ const float alpha = 1; ...@@ -1679,7 +1818,7 @@ const float alpha = 1;
const float beta = 0; const float beta = 0;
err%(name)s = cudnnPoolingBackward( err%(name)s = cudnnPoolingBackward(
_handle, _handle,
%(desc)s, pool%(name)s,
&alpha, &alpha,
%(input_desc)s, CudaNdarray_DEV_DATA(%(input)s), %(input_desc)s, CudaNdarray_DEV_DATA(%(input)s),
%(input_grad_desc)s, CudaNdarray_DEV_DATA(%(input_grad)s), %(input_grad_desc)s, CudaNdarray_DEV_DATA(%(input_grad)s),
...@@ -1694,16 +1833,18 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) { ...@@ -1694,16 +1833,18 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
cudnnGetErrorString(err%(name)s)); cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
""" % dict(output_grad=out_grad, desc=desc, """ % dict(output_grad=out_grad,
fail=sub['fail'], name=name, fail=sub['fail'], name=name,
input=inp, input_grad=inp_grad, output=out, input=inp, input_grad=inp_grad, output=out,
input_desc="input" + name, input_desc="input" + name,
input_grad_desc="input_grad" + name, input_grad_desc="input_grad" + name,
output_desc="output" + name, output_desc="output" + name,
output_grad_desc="output_grad" + name) output_grad_desc="output_grad" + name,
mode_flag=mode_flag, nd=node.inputs[0].ndim - 2,
ws=ws, pad=pad, str=stride)
def c_code_cache_version(self): def c_code_cache_version(self):
return (7, version()) return (8, version())
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0]] return [shape[0]]
...@@ -1725,7 +1866,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)): ...@@ -1725,7 +1866,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
stride stride
Subsampling stride (default: (1, 1)). Subsampling stride (default: (1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad} mode : {'max', 'average_inc_pad', 'average_exc_pad}
pad pad :
(pad_h, pad_w) padding information. (pad_h, pad_w) padding information.
pad_h is the number of zero-valued pixels added to each of the top and pad_h is the number of zero-valued pixels added to each of the top and
bottom borders. bottom borders.
...@@ -1742,8 +1883,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)): ...@@ -1742,8 +1883,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
""" """
img = gpu_contiguous(img) img = gpu_contiguous(img)
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)() return GpuDnnPool(mode=mode)(img, ws, stride, pad)
return GpuDnnPool()(img, desc)
class GpuDnnSoftmaxBase(DnnBase): class GpuDnnSoftmaxBase(DnnBase):
...@@ -2222,11 +2362,10 @@ if True: ...@@ -2222,11 +2362,10 @@ if True:
inp, out, inp_grad = node.inputs inp, out, inp_grad = node.inputs
ds = node.op.ds ds = node.op.ds
desc = GpuDnnPoolDesc(ws=ds, stride=ds, mode="max")() return [GpuDnnPoolGrad(mode='max')(gpu_contiguous(inp),
return [GpuDnnPoolGrad()(gpu_contiguous(inp), gpu_contiguous(out),
gpu_contiguous(out), gpu_contiguous(inp_grad),
gpu_contiguous(inp_grad), ds, ds, (0, 0))]
desc)]
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([MaxPoolGrad]) @local_optimizer([MaxPoolGrad])
...@@ -2246,11 +2385,11 @@ if True: ...@@ -2246,11 +2385,11 @@ if True:
(out.owner and isinstance(out.owner.op, HostFromGpu)) or (out.owner and isinstance(out.owner.op, HostFromGpu)) or
(inp_grad.owner and isinstance(inp_grad.owner.op, (inp_grad.owner and isinstance(inp_grad.owner.op,
HostFromGpu))): HostFromGpu))):
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
ret = GpuDnnPoolGrad()(gpu_contiguous(inp), ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out), gpu_contiguous(out),
gpu_contiguous(inp_grad), gpu_contiguous(inp_grad),
desc) ds, st, pad)
return [host_from_gpu(ret)] return [host_from_gpu(ret)]
@register_opt('cudnn') @register_opt('cudnn')
...@@ -2270,12 +2409,11 @@ if True: ...@@ -2270,12 +2409,11 @@ if True:
if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
(inp_grad.owner and isinstance(inp_grad.owner.op, (inp_grad.owner and isinstance(inp_grad.owner.op,
HostFromGpu))): HostFromGpu))):
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
contiguous_inp_grad = gpu_contiguous(inp_grad) contiguous_inp_grad = gpu_contiguous(inp_grad)
ret = GpuDnnPoolGrad()(gpu_contiguous(inp), ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
contiguous_inp_grad, contiguous_inp_grad,
contiguous_inp_grad, contiguous_inp_grad,
desc) ds, st, pad)
return [host_from_gpu(ret)] return [host_from_gpu(ret)]
@register_opt('cudnn') @register_opt('cudnn')
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -3,6 +3,8 @@ import logging ...@@ -3,6 +3,8 @@ import logging
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import numpy import numpy
from itertools import chain, product from itertools import chain, product
import six.moves.cPickle as pickle
import os
import theano import theano
from six import StringIO from six import StringIO
...@@ -70,19 +72,6 @@ def test_dnn_conv_desc_merge(): ...@@ -70,19 +72,6 @@ def test_dnn_conv_desc_merge():
assert d1 == d2 assert d1 == d2
def test_dnn_pool_desc_merge():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
x = theano.tensor.ftensor4('x')
y = dnn.dnn_pool(x, (2, 2))
z = dnn.dnn_pool(x, (2, 2))
f = theano.function([x], [y, z])
descs = [n for n in f.maker.fgraph.apply_nodes
if isinstance(n.op, dnn.GpuDnnPoolDesc)]
assert len(descs) == 1, f.maker.fgraph
def test_dnn_conv_merge(): def test_dnn_conv_merge():
"""This test that we merge correctly multiple dnn_conv. """This test that we merge correctly multiple dnn_conv.
...@@ -346,6 +335,64 @@ def test_pooling(): ...@@ -346,6 +335,64 @@ def test_pooling():
utt.assert_allclose(c_out, g_out) utt.assert_allclose(c_out, g_out)
def test_pooling_with_tensor_vars():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
x = T.ftensor4()
ws = theano.shared(numpy.array([2, 2], dtype='int32'))
st = theano.shared(numpy.array([1, 1], dtype='int32'))
pad = theano.shared(numpy.array([0, 0], dtype='int32'))
mode = 'max'
def fn(x):
dnn_op = cuda.dnn.dnn_pool(
x, ws=ws,
stride=st,
pad=pad,
mode=mode)
return dnn_op
for shp in [(1, 1, 2, 2),
(1, 1, 3, 3)]:
data = numpy.random.normal(0, 1, shp).astype("float32") * 10
theano.tests.unittest_tools.verify_grad(
fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
out2 = pool_2d_i2n(x, ds=(2, 2), strides=(1, 1),
pad=(0, 0),
pool_function=T.max)
mode_without_gpu2 = mode_without_gpu.including()
mode_without_gpu2.check_isfinite = False
f1 = theano.function([x], fn(x), mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
f2 = theano.function([x], out2, mode=mode_without_gpu2)
assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f2.maker.fgraph.apply_nodes])
for shp in [(1, 10, 100, 100),
(1, 3, 99, 99),
(32, 1, 147, 197),
]:
data = numpy.random.normal(0, 1, shp).astype("float32")
a = f1(data).__array__()
b = f2(data).__array__()
utt.assert_allclose(a, b)
def test_old_pool_interface():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
testfile_dir = os.path.dirname(os.path.realpath(__file__))
fname = 'old_pool_interface.pkl'
with open(os.path.join(testfile_dir, fname), 'rb') as fp:
pickle.load(fp)
def test_pooling3d(): def test_pooling3d():
# CuDNN 3d pooling requires CuDNN v3. Don't test if the CuDNN version is # CuDNN 3d pooling requires CuDNN v3. Don't test if the CuDNN version is
# too old. # too old.
...@@ -607,8 +654,9 @@ class test_DnnSoftMax(test_nnet.test_SoftMax): ...@@ -607,8 +654,9 @@ class test_DnnSoftMax(test_nnet.test_SoftMax):
input_val = numpy.random.normal(0, 1, inp_shape).astype("float32") input_val = numpy.random.normal(0, 1, inp_shape).astype("float32")
out = f(input_val) out = f(input_val)
expected_out = numpy.log(numpy.exp(input_val) / expected_out = numpy.log(
numpy.exp(input_val).sum(1)[:, None, :, :]) numpy.exp(input_val) /
numpy.exp(input_val).sum(1)[:, None, :, :])
utt.assert_allclose(out, expected_out) utt.assert_allclose(out, expected_out)
...@@ -999,14 +1047,10 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -999,14 +1047,10 @@ class TestDnnInferShapes(utt.InferShapeTester):
[(1, 1), (2, 2), (3, 3)], [(1, 1), (2, 2), (3, 3)],
modes modes
): ):
desc = dnn.GpuDnnPoolDesc(
ws=params[0],
stride=params[1],
mode=params[2]
)()
self._compile_and_check( self._compile_and_check(
[img], [img],
[dnn.GpuDnnPool()(img, desc)], [dnn.GpuDnnPool(mode=params[2])
(img, params[0], params[1], (0, 0))],
[img_val], [img_val],
dnn.GpuDnnPool dnn.GpuDnnPool
) )
...@@ -1035,16 +1079,13 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -1035,16 +1079,13 @@ class TestDnnInferShapes(utt.InferShapeTester):
[(1, 1), (2, 2), (3, 3)], [(1, 1), (2, 2), (3, 3)],
['max', 'average_inc_pad'] ['max', 'average_inc_pad']
): ):
desc = dnn.GpuDnnPoolDesc(
ws=params[0],
stride=params[1],
mode=params[2]
)()
pool_grad = dnn.GpuDnnPoolGrad()( pool_grad = dnn.GpuDnnPoolGrad()(
img, img,
out, out,
img_grad, img_grad,
desc params[0],
params[1],
(0, 0)
) )
self._compile_and_check( self._compile_and_check(
[img, img_grad, out], [img, img_grad, out],
......
...@@ -792,7 +792,7 @@ class Elemwise(OpenMPOp): ...@@ -792,7 +792,7 @@ class Elemwise(OpenMPOp):
return ret return ret
def prepare_node(self, node): def prepare_node(self, node, storage_map, compute_map):
# Postpone the ufunc building to the last minutes # Postpone the ufunc building to the last minutes
# NumPy ufunc support only up to 31 inputs. # NumPy ufunc support only up to 31 inputs.
# But our c code support more. # But our c code support more.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论