Commit 6f9b538b authored by Frederic, committed by Pascal Lamblin

tmp

Parent ee75577d
@@ -88,16 +88,22 @@ dnn_available.msg = None
def c_set_tensor4d(var, desc, err, fail):
return """
{
int str0, str1, str2, str3;
str3 = CudaNdarray_HOST_STRIDES(%(var)s)[3]?CudaNdarray_HOST_STRIDES(%(var)s)[3]:1;
str2 = CudaNdarray_HOST_STRIDES(%(var)s)[2]?CudaNdarray_HOST_STRIDES(%(var)s)[2]:CudaNdarray_HOST_DIMS(%(var)s)[3];
str1 = CudaNdarray_HOST_STRIDES(%(var)s)[1]?CudaNdarray_HOST_STRIDES(%(var)s)[1]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3];
str0 = CudaNdarray_HOST_STRIDES(%(var)s)[0]?CudaNdarray_HOST_STRIDES(%(var)s)[0]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3]*CudaNdarray_HOST_DIMS(%(var)s)[1];
printf("str0=%%d str1=%%d str2=%%d str3=%%d\\n",
str0, str1, str2, str3
);
%(err)s = cudnnSetTensor4dDescriptorEx(
%(desc)s, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(var)s)[0],
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[0]?CudaNdarray_HOST_STRIDES(%(var)s)[0]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3]*CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_STRIDES(%(var)s)[1]?CudaNdarray_HOST_STRIDES(%(var)s)[1]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[2]?CudaNdarray_HOST_STRIDES(%(var)s)[2]:CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[3]?CudaNdarray_HOST_STRIDES(%(var)s)[3]:1
str0, str1, str2, str3
);
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
@@ -108,13 +114,12 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[0]?CudaNdarray_HOST_STRIDES(%(var)s)[0]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3]*CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_STRIDES(%(var)s)[1]?CudaNdarray_HOST_STRIDES(%(var)s)[1]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[2]?CudaNdarray_HOST_STRIDES(%(var)s)[2]:CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[3]?CudaNdarray_HOST_STRIDES(%(var)s)[3]:1
str0, str1, str2, str3
);
%(fail)s
}
}
""" % dict(var=var, err=err, desc=desc, fail=fail)
@@ -679,15 +684,20 @@ class GpuDnnPoolDesc(GpuOp):
self.stride = stride
assert len(stride) == 2
self.pad = pad
if (pad[0] != 0 or pad[1] != 0) and dnn_version() < 20:
raise RuntimeError("CUDNN pooling need version v2 to support")
if (pad[0] != 0 or pad[1] != 0) and version() < 20:
raise RuntimeError("Pooling with padding need CUDNN v2 or"
" more recent.")
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(d, 'pad'):
if not hasattr(self, 'pad'):
import pdb;pdb.set_trace()
self.pad = (0, 0)
def make_node(self):
if self.pad != (0, 0) and version() < 20:
raise RuntimeError(
"CUDNNpooling with padding request CUDNN v2 or more recent.")
return Apply(self, [],
[CDataType("cudnnPoolingDescriptor_t")()])
@@ -749,11 +759,7 @@ class GpuDnnPool(DnnBase):
:param img: the image 4d tensor.
:param desc: the pooling descriptor.
"""
__props__ = ('ignore_border', )
def __init__(self, ignore_border):
self.ignore_border = ignore_border
DnnBase.__init__(self)
__props__ = ()
def make_node(self, img, desc):
img = as_cuda_ndarray_variable(img)
@@ -853,23 +859,8 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
%(out)s_dims[0] = CudaNdarray_HOST_DIMS(%(input)s)[0];
%(out)s_dims[1] = CudaNdarray_HOST_DIMS(%(input)s)[1];
if (%(ignore_border)d){
%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] - wsX) / strideX + 1;
%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] - wsY) / strideY + 1;
}else{
int r = CudaNdarray_HOST_DIMS(%(input)s)[2];
int c = CudaNdarray_HOST_DIMS(%(input)s)[3];
if(strideX >= wsX){
%(out)s_dims[2] = (r - 1) / strideX + 1;
}else{
%(out)s_dims[2] = max(0, (r - 1 - wsX) / strideX + 1) + 1;
}
if(strideY >= wsY){
%(out)s_dims[3] = (c - 1) / strideY + 1;
}else{
%(out)s_dims[3] = max(0, (c - 1 - wsY) / strideY + 1) + 1;
}
}
%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] + (vpad*2) - wsX) / strideX + 1;
%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] + (hpad*2) - wsY) / strideY + 1;
if (CudaNdarray_prep_output(&%(out)s, 4, %(out)s_dims) != 0)
{
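The replaced branch hard-coded both ignore_border cases; the new code computes each output dimension with the single closed form out = (in + 2*pad - ws) // stride + 1, which reduces to the old ignore_border=True arithmetic when pad == (0, 0). For example:

```python
def pooled_dim(size, ws, stride, pad):
    # Closed form used above: (in + 2*pad - ws) // stride + 1
    return (size + 2 * pad - ws) // stride + 1

assert pooled_dim(100, ws=4, stride=2, pad=0) == 49  # matches the old ignore_border path
assert pooled_dim(99, ws=2, stride=3, pad=1) == 34   # padded case
```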
@@ -908,7 +899,6 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
name=name, set_in=set_in,
set_out=set_out, input=inputs[0],
input_desc="input"+name,
ignore_border=self.ignore_border,
output_desc="output"+name)
def grad(self, inp, grads):
@@ -919,8 +909,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
out = self(img, desc)
g_out = GpuDnnPoolGrad(ignore_border=self.ignore_border)(
img, out, grad, desc)
g_out = GpuDnnPoolGrad()(img, out, grad, desc)
return g_out, theano.gradient.DisconnectedType()()
@@ -929,6 +918,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
return [[1], [0]]
def c_code_cache_version(self):
return
return (5, version())
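A bare return (i.e. None) from c_code_cache_version leaves the generated C code unversioned, so Theano will not keep it in the persistent compilation cache while the kernel is in flux; restoring a key that includes version() additionally ties the compiled module to the installed cuDNN release, so a library upgrade forces recompilation. The convention, sketched:

```python
def c_code_cache_version(self):
    # Bump the leading integer whenever the generated C code changes;
    # including the cuDNN version invalidates the cache on library upgrades.
    return (5, version())
```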
@@ -941,15 +931,9 @@ class GpuDnnPoolGrad(DnnBase):
:param inp_grad: same size as out, but is the corresponding gradient information.
:param desc: The pooling descriptor.
"""
__props__ = ('ignore_border', )
def __init__(self, ignore_border):
self.ignore_border = ignore_border
DnnBase.__init__(self)
__props__ = ()
def make_node(self, inp, out, inp_grad, desc):
if self.ignore_border is False:
raise NotImplementedError()
inp = as_cuda_ndarray_variable(inp)
if inp.type.ndim != 4:
raise TypeError('inp must be 4D tensor')
@@ -1096,6 +1080,24 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPoolGrad: error doing operation: %%s",
cudnnGetErrorString(err%(name)s));
printf("input.shape=(%%d, %%d, %%d, %%d) input_grad.shape=(%%d, %%d, %%d, %%d) output.shape=(%%d, %%d, %%d, %%d) output_grad.shape=(%%d, %%d, %%d, %%d)\\n",
CudaNdarray_HOST_DIMS(%(input)s)[0],
CudaNdarray_HOST_DIMS(%(input)s)[1],
CudaNdarray_HOST_DIMS(%(input)s)[2],
CudaNdarray_HOST_DIMS(%(input)s)[3],
CudaNdarray_HOST_DIMS(%(input_grad)s)[0],
CudaNdarray_HOST_DIMS(%(input_grad)s)[1],
CudaNdarray_HOST_DIMS(%(input_grad)s)[2],
CudaNdarray_HOST_DIMS(%(input_grad)s)[3],
CudaNdarray_HOST_DIMS(%(output)s)[0],
CudaNdarray_HOST_DIMS(%(output)s)[1],
CudaNdarray_HOST_DIMS(%(output)s)[2],
CudaNdarray_HOST_DIMS(%(output)s)[3],
CudaNdarray_HOST_DIMS(%(output_grad)s)[0],
CudaNdarray_HOST_DIMS(%(output_grad)s)[1],
CudaNdarray_HOST_DIMS(%(output_grad)s)[2],
CudaNdarray_HOST_DIMS(%(output_grad)s)[3]
);
%(fail)s
}
""" % dict(output_grad=out_grad, desc=desc,
@@ -1108,14 +1110,14 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
output_grad_desc="output_grad"+name)
def c_code_cache_version(self):
return
return (4, version())
def infer_shape(self, node, shape):
return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0),
ignore_border=True):
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
"""
GPU pooling using cuDNN from NVIDIA.
@@ -1126,6 +1128,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0),
:param ws: subsampling window size
:param stride: subsampling stride (default: (1, 1))
:param mode: one of 'max', 'average' (default: 'max')
:param pad: (pad_h, pad_w) padding applied to each border of the image (default: (0, 0))
:warning: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
@@ -1134,7 +1137,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0),
"""
img = gpu_contiguous(img)
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
return GpuDnnPool(ignore_border=ignore_border)(img, desc)
return GpuDnnPool()(img, desc)
class GpuDnnSoftmaxBase(DnnBase):
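With padding now carried entirely by the pooling descriptor, a call site only changes in the dnn_pool arguments. A usage sketch (shapes are illustrative; the import path matches the `cuda.dnn` module used by the tests below):

```python
import theano
import theano.tensor as T
from theano.sandbox.cuda import dnn

x = T.ftensor4('x')
# 3x3 max pooling, stride 2, one pixel of padding on each border.
y = dnn.dnn_pool(x, ws=(3, 3), stride=(2, 2), mode='max', pad=(1, 1))
f = theano.function([x], y)
# An input of shape (8, 16, 32, 32) pools to
# (8, 16, (32 + 2 - 3) // 2 + 1, (32 + 2 - 3) // 2 + 1) == (8, 16, 16, 16).
```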
@@ -1463,10 +1466,11 @@ if True:
if not dnn_available():
return
if isinstance(node.op, GpuDownsampleFactorMax):
if not node.op.ignore_border:
return
img, = node.inputs
ds = node.op.ds
return [dnn_pool(gpu_contiguous(img), ds, ds,
ignore_border=node.op.ignore_border)]
return [dnn_pool(gpu_contiguous(img), ds, ds)]
@register_opt('cudnn')
@local_optimizer([DownsampleFactorMax])
@@ -1474,13 +1478,14 @@ if True:
if not dnn_available():
return
if isinstance(node.op, DownsampleFactorMax):
if not node.op.ignore_border:
return
img, = node.inputs
ds = node.op.ds
stride = node.op.st
if (img.owner and isinstance(img.owner.op, HostFromGpu)):
ret = dnn_pool(gpu_contiguous(img.owner.inputs[0]),
ds, stride=stride,
ignore_border=node.op.ignore_border)
ds, stride=stride)
return [host_from_gpu(ret)]
@register_opt('cudnn')
@@ -1489,17 +1494,16 @@ if True:
if not dnn_available():
return
if isinstance(node.op, GpuDownsampleFactorMaxGrad):
inp, out, inp_grad = node.inputs
ds = node.op.ds
if not node.op.ignore_border:
return
inp, out, inp_grad = node.inputs
ds = node.op.ds
desc = GpuDnnPoolDesc(ws=ds, stride=ds, mode="max")()
return [GpuDnnPoolGrad(ignore_border=node.op.ignore_border)(
gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
desc)]
return [GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
desc)]
@register_opt('cudnn')
@local_optimizer([DownsampleFactorMaxGrad])
@@ -1518,11 +1522,10 @@ if True:
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode="max")()
if not node.op.ignore_border:
return
ret = GpuDnnPoolGrad(ignore_border=node.op.ignore_border)(
gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
desc)
ret = GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
desc)
return [host_from_gpu(ret)]
@register_opt('cudnn')
@@ -31,7 +31,7 @@ else:
def pool_2d_i2n(input, ds=(2, 2), strides=None,
ignore_border=True,
pad=(0, 0),
pool_function=T.max, mode='ignore_borders'):
if strides is None:
strides = ds
@@ -41,8 +41,19 @@ def pool_2d_i2n(input, ds=(2, 2), strides=None,
"strides should be smaller than or equal to ds,"
" strides=(%d, %d) and ds=(%d, %d)" %
(strides + ds))
shape = input.shape
if pad != (0, 0):
assert pool_function is T.max
pad_x = pad[0]
pad_y = pad[1]
a = T.alloc(-numpy.inf, shape[0], shape[1], shape[2] + pad_x*2,
shape[3] + pad_y*2)
input = T.set_subtensor(a[:, :,
pad_x:pad_x+shape[2],
pad_y:pad_y+shape[3]],
input)
shape = input.shape
neibs = images2neibs(input, ds, strides, mode=mode)
pooled_neibs = pool_function(neibs, axis=1)
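Padding with -inf before images2neibs works because -inf is the identity element of max: padded positions can never win a pooling window, so the emulation matches cuDNN's implicitly padded max pooling. That is also why the assert restricts padding to T.max, since -inf entries would corrupt an average. A NumPy sketch of the idea:

```python
import numpy as np

img = np.arange(9, dtype='float32').reshape(1, 1, 3, 3)
pad_x = pad_y = 1
padded = np.full((1, 1, 3 + 2 * pad_x, 3 + 2 * pad_y), -np.inf, dtype='float32')
padded[:, :, pad_x:pad_x + 3, pad_y:pad_y + 3] = img
# The top-left 2x2 window of the padded image holds three -inf cells and
# img[0, 0, 0, 0]; max pooling still recovers the real pixel.
assert padded[0, 0, 0:2, 0:2].max() == img[0, 0, 0, 0]
```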
@@ -59,34 +70,43 @@ def test_pooling():
raise SkipTest(cuda.dnn.dnn_available.msg)
x = T.ftensor4()
for func, ignore_border in product(
(T.max, T.mean), (False, True)):
for func, pad in product(
(T.max, T.mean), ((0, 0), (1, 0), (0, 1), (2, 3), (3, 2))):
if pad != (0, 0) and cuda.dnn.version() < 20:
continue
for ws in (4, 2, 5):
for stride in (2, 3):
if stride > ws:
continue
if func is T.max:
if func is T.max and pad == (0, 0):
# We will check that the opt introduced it.
out1 = max_pool_2d(x, (ws, ws),
st=(stride, stride),
ignore_border=ignore_border)
ignore_border=True,)
# pad=pad)
else:
out1 = cuda.dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
ignore_border=ignore_border,
pad=pad,
mode='max' if func is T.max else "average")
out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride),
pad=pad,
pool_function=func)
f1 = theano.function([x], out1, mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
f2 = theano.function([x], out1, mode=mode_without_gpu)
f2 = theano.function([x], out2, mode=mode_without_gpu)
assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f2.maker.fgraph.apply_nodes])
for shp in [(1, 10, 100, 100),
(1, 3, 99, 99),
(32, 1, 147, 197),
]:
print func, pad, ws, stride, shp
data = numpy.random.normal(0, 1, shp).astype("float32")
a = f1(data).__array__()
@@ -101,49 +121,45 @@ def test_pooling():
ws = 2
strides = 2
print func, pad, ws, stride, shp
# This tests the CPU grad + opt + GPU implementation
def fn(x):
return max_pool_2d(x, (ws, ws), ignore_border=ignore_border)
return max_pool_2d(x, (ws, ws), ignore_border=True,)
# pad=pad)
theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that the opt would have inserted it.
fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
if ignore_border:
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
else:
assert not any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
# Test the GPU grad + GPU implementation
def fn(x):
dnn_op = cuda.dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
ignore_border=ignore_border,
pad=pad,
mode='max' if func is T.max else "average")
return dnn_op
try:
theano.tests.unittest_tools.verify_grad(
fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that we get the good op.
fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
g_out = fg(data)
assert ignore_border
except NotImplementedError:
assert not ignore_border
if func is T.max and ignore_border:
theano.tests.unittest_tools.verify_grad(
fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that we get the good op.
fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
g_out = fg(data)
if func is T.max and pad == (0, 0):
# Compare against the CPU result
out = max_pool_2d(x, (ws, ws), ignore_border=ignore_border)
out = max_pool_2d(x, (ws, ws),
# pad=pad,
ignore_border=True)
fc = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu)
assert any([isinstance(node.op, DownsampleFactorMaxGrad)