提交 1ed69042 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #4884 from gvtulder/f-pool-3d

theano.tensor.signal.Pool with 3D support
......@@ -13,3 +13,4 @@
.. autofunction:: theano.tensor.signal.pool.pool_2d
.. autofunction:: theano.tensor.signal.pool.max_pool_2d_same_size
.. autofunction:: theano.tensor.signal.pool.pool_3d
......@@ -38,7 +38,7 @@ from .nnet import GpuSoftmax
from .opt import (gpu_seqopt, register_opt,
op_lifter, register_opt2)
from .opt_util import alpha_merge, output_merge, inplace_allocempty
from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
......@@ -1404,7 +1404,7 @@ class GpuDnnPoolGrad(DnnBase):
return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
def dnn_pool(img, ws, stride=None, mode='max', pad=None):
"""
GPU pooling using cuDNN from NVIDIA.
......@@ -1418,13 +1418,13 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
img
Images to do the pooling over.
ws : tuple
Subsampling window size.
Subsampling window size. Should have 2 or 3 elements.
stride : tuple
Subsampling stride (default: (1, 1)).
Subsampling stride (default: (1, 1) or (1, 1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'}
pad : tuple
(padX, padY) or (padX, padY, padZ)
default: (0, 0)
default: (0, 0) or (0, 0, 0)
.. warning:: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
......@@ -1436,6 +1436,10 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
"""
img = gpu_contiguous(img)
if stride is None:
stride = (1,) * len(ws)
if pad is None:
pad = (0,) * len(ws)
if mode == "sum":
ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad)
context_name = ret.type.context_name
......@@ -2073,9 +2077,19 @@ def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not op.ignore_border:
return
img, ws, stride, pad = inputs
img = as_gpuarray_variable(img, ctx_name)
nd = op.ndim if op.ndim else (img.ndim - 2)
if nd not in (2, 3):
return
img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
mode = op.mode
return dnn_pool(gpu_contiguous(img), ws, stride=stride, pad=pad, mode=mode)
# dnn_pool expects exactly 2 non-pooling dimensions
if img.ndim == nd + 2:
return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
img_padded = pad_dims(img, 2, nd)
ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
return unpad_dims(ret_padded, img, 2, nd)
@register_opt('cudnn', 'fast_compile')
......@@ -2087,17 +2101,34 @@ def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not op.ignore_border:
return
inp, out, out_grad, ws, stride, pad = inputs
inp = as_gpuarray_variable(inp, ctx_name)
out = as_gpuarray_variable(out, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name)
nd = op.ndim if op.ndim else (inp.ndim - 2)
if nd not in (2, 3):
return
inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
mode = op.mode
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(out_grad),
ws,
stride,
pad)
# the GPU ops expect exactly 2 non-pooling dimensions
if inp.ndim == nd + 2:
return GpuDnnPoolGrad(mode=mode)(inp,
out,
out_grad,
ws,
stride,
pad)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
inp_padded = pad_dims(inp, 2, nd)
out_padded = pad_dims(out, 2, nd)
out_grad_padded = pad_dims(out_grad, 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
out_padded,
out_grad_padded,
ws,
stride,
pad)
return unpad_dims(ret_padded, inp, 2, nd)
@register_opt('cudnn', 'fast_compile')
......@@ -2109,16 +2140,30 @@ def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not op.ignore_border:
return
inp, out_grad, ws, stride, pad = inputs
inp = as_gpuarray_variable(inp, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name)
nd = op.ndim if op.ndim else (inp.ndim - 2)
if nd not in (2, 3):
return
inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
mode = op.mode
cg = gpu_contiguous(out_grad)
# We reuse cg because cuDNN does not use the value of the `out`
# argument but still checks its shape for average pooling. This
# has been observed in v2 and v3 as far as I know.
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), cg, cg, ws, stride, pad)
# the GPU ops expect exactly 2 non-pooling dimensions
if inp.ndim == nd + 2:
# We reuse out_grad because cuDNN does not use the value of the `out`
# argument but still checks its shape for average pooling. This
# has been observed in v2 and v3 as far as I know.
return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride, pad)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
inp_padded = pad_dims(inp, 2, nd)
out_grad_padded = pad_dims(out_grad, 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
out_grad_padded,
out_grad_padded,
ws,
stride,
pad)
return unpad_dims(ret_padded, inp, 2, nd)
@register_opt('cudnn', 'fast_compile')
......
......@@ -3,12 +3,12 @@ from functools import wraps
import numpy
from theano import scalar as scal, Constant
from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, gpu_alloc_empty
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, GpuReshape, gpu_alloc_empty
from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
......@@ -329,3 +329,76 @@ def inplace_allocempty(op, idx):
return maker(node, inputs)
return opt
return wrapper
def pad_dims(input, leftdims, rightdims):
"""Reshapes the input to a (leftdims + rightdims) tensor
This helper function is used to convert pooling inputs with arbitrary
non-pooling dimensions to the correct number of dimensions for the
GPU pooling ops.
This reduces or expands the number of dimensions of the input to
exactly `leftdims`, by adding extra dimensions on the left or by
combining some existing dimensions on the left of the input.
Use `unpad_dims` to reshape back to the original dimensions.
Examples
--------
Given input of shape (3, 5, 7), ``pad_dims(input, 2, 2)``
adds a singleton dimension and reshapes to (1, 3, 5, 7).
Given that output from pad_dims, ``unpad_dims(output, input, 2, 2)``
reshapes back to (3, 5, 7).
Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 2)``
does not reshape and returns output with shape (3, 5, 7, 9).
Given input of shape (3, 5, 7, 9, 11), ``pad_dims(input, 2, 2)``
combines the first two dimensions and reshapes to (15, 7, 9, 11).
Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 3)``
adds a singleton dimension and reshapes to (1, 3, 5, 7, 9).
"""
assert input.ndim >= rightdims
if input.ndim == (leftdims + rightdims):
return input
# extract image dimensions
img_shape = input.shape[-rightdims:]
non_pool_ndim = input.ndim - rightdims
if non_pool_ndim < leftdims:
# too few dimensions, pad on the left
dummy_dims = tensor.as_tensor([1] * (leftdims - non_pool_ndim))
new_shape = tensor.join(0, dummy_dims,
input.shape[:non_pool_ndim],
img_shape)
else:
# too many dimensions, combine the leading dimensions
batched_ndim = non_pool_ndim - leftdims + 1
batch_size = tensor.prod(input.shape[:batched_ndim])
# convert to a vector for tensor.join
batch_size = tensor.shape_padright(batch_size, 1)
new_shape = tensor.join(0, batch_size,
input.shape[batched_ndim:non_pool_ndim],
img_shape)
# store in the required shape
new_shape = tensor.cast(new_shape, 'int64')
input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
return input_ND
def unpad_dims(output, input, leftdims, rightdims):
"""Reshapes the output after pad_dims.
This reverts the padding by `pad_dims`.
"""
if output.ndim == input.ndim:
return output
# restore the output to the original shape
outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
return GpuReshape(input.ndim)(output, outshp)
差异被折叠。
......@@ -40,6 +40,7 @@ from theano.sandbox.cuda.basic_ops import (
GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
from theano.sandbox.cuda.opt_util import pad_dims, unpad_dims
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (
......@@ -1883,15 +1884,12 @@ def local_convtransp3d_gemm(node):
gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm)
def _check_constant_args_pool(ws, stride, pad, node):
def _check_constant_args_pool(ndim, ws, stride, pad, node):
"""Check if the args of pool are constants. Warns if not."""
try:
ws_w = tensor.get_scalar_constant_value(ws[0])
ws_h = tensor.get_scalar_constant_value(ws[1])
stride_w = tensor.get_scalar_constant_value(stride[0])
stride_h = tensor.get_scalar_constant_value(stride[1])
pad_w = tensor.get_scalar_constant_value(pad[0])
pad_h = tensor.get_scalar_constant_value(pad[1])
ws = tuple(tensor.get_scalar_constant_value(ws[i]) for i in range(ndim))
stride = tuple(tensor.get_scalar_constant_value(stride[i]) for i in range(ndim))
pad = tuple(tensor.get_scalar_constant_value(pad[i]) for i in range(ndim))
except tensor.NotScalarConstantError:
msg = ("Pool with tensor variable for the window size, stride or "
"padding is only supported in the new GPU backend, so this op "
......@@ -1901,65 +1899,96 @@ def _check_constant_args_pool(ws, stride, pad, node):
elif config.assert_no_cpu_op == "raise":
raise AssertionError(msg)
return None
ws = (ws_w, ws_h)
stride = (stride_w, stride_h)
pad = (pad_w, pad_h)
return ws, stride, pad
@register_opt()
@local_optimizer([pool.Pool])
def local_gpu_downsample_factor_max(node):
if isinstance(node.op, pool.Pool):
assert node.op.__props__ == ('ignore_border', 'mode')
if (isinstance(node.op, pool.Pool)):
assert node.op.__props__ == ('ignore_border', 'mode', 'ndim')
x, ws, stride, pad = node.inputs
ret = _check_constant_args_pool(ws, stride, pad, node)
nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
ret = _check_constant_args_pool(nd, ws, stride, pad, node)
if ret is None:
return
ws, stride, pad = ret
if (pad) != (0, 0) or node.op.mode != 'max' or stride != ws:
if (nd != 2 or
max(pad) != 0 or
node.op.mode != 'max' or
stride != ws):
return
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_ds = GpuDownsampleFactorMax(ws, node.op.ignore_border)
return [host_from_gpu(gpu_ds(x.owner.inputs[0]))]
gpu_ws = GpuDownsampleFactorMax(ws, node.op.ignore_border)
if node.inputs[0].ndim == 4:
return [host_from_gpu(gpu_ws(x.owner.inputs[0]))]
else:
input_4D = pad_dims(x.owner.inputs[0], 2, 2)
output_4D = gpu_ws(input_4D)
output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
return [host_from_gpu(output)]
@register_opt()
@local_optimizer([pool.MaxPoolGrad])
def local_gpu_downsample_factor_max_grad(node):
if isinstance(node.op, pool.MaxPoolGrad):
assert node.op.__props__ == ('ignore_border', 'mode')
if (isinstance(node.op, pool.MaxPoolGrad)):
assert node.op.__props__ == ('ignore_border', 'mode', 'ndim')
x, z, gz, ws, stride, pad = node.inputs
ret = _check_constant_args_pool(ws, stride, pad, node)
nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
ret = _check_constant_args_pool(nd, ws, stride, pad, node)
if ret is None:
return
ws, stride, pad = ret
if pad != (0, 0) or node.op.mode != 'max' or stride != ws:
if (nd != 2 or
max(pad) != 0 or
node.op.mode != 'max' or
stride != ws):
return
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_ds_grad = GpuDownsampleFactorMaxGrad(ws, node.op.ignore_border)
return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0],
as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(gz)))]
gpu_ws_grad = GpuDownsampleFactorMaxGrad(ws, node.op.ignore_border)
if node.inputs[0].ndim == 4:
return [host_from_gpu(gpu_ws_grad(x.owner.inputs[0],
as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(gz)))]
else:
x_4D = pad_dims(x.owner.inputs[0], 2, 2)
z_4D = pad_dims(as_cuda_ndarray_variable(z), 2, 2)
gz_4D = pad_dims(as_cuda_ndarray_variable(gz), 2, 2)
output_4D = gpu_ws_grad(x_4D, z_4D, gz_4D)
output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
return [host_from_gpu(output)]
@register_opt()
@local_optimizer([pool.DownsampleFactorMaxGradGrad])
def local_gpu_downsample_factor_max_grad_grad(node):
if isinstance(node.op, pool.DownsampleFactorMaxGradGrad):
assert node.op.__props__ == ('ignore_border', 'mode')
assert node.op.__props__ == ('ignore_border', 'mode', 'ndim')
x, z, gx, ws, stride, pad = node.inputs
ret = _check_constant_args_pool(ws, stride, pad, node)
nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
ret = _check_constant_args_pool(nd, ws, stride, pad, node)
if ret is None:
return
ws, stride, pad = ret
if pad != (0, 0) or node.op.mode != 'max' or stride != ws:
if (nd != 2 or
max(pad) != 0 or
node.op.mode != 'max' or
stride != ws):
return
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
op = GpuDownsampleFactorMaxGradGrad(ws, node.op.ignore_border)
return [host_from_gpu(op(x.owner.inputs[0],
as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(gx)))]
if node.inputs[0].ndim == 4:
return [host_from_gpu(op(x.owner.inputs[0],
as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(gx)))]
else:
x_4D = pad_dims(x.owner.inputs[0], 2, 2)
z_4D = pad_dims(as_cuda_ndarray_variable(z), 2, 2)
gx_4D = pad_dims(as_cuda_ndarray_variable(gx), 2, 2)
output_4D = op(x_4D, z_4D, gx_4D)
output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
return [host_from_gpu(output)]
@register_opt()
......
......@@ -3,13 +3,13 @@ from functools import wraps
import numpy
from theano import scalar as scal, Constant
from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from theano.sandbox.cuda.basic_ops import (
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise)
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise, GpuReshape)
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
......@@ -126,3 +126,76 @@ def output_merge(cls, alpha_in, beta_in, out_in):
return maker(targ, *inputs)
return opt
return wrapper
def pad_dims(input, leftdims, rightdims):
"""Reshapes the input to a (leftdims + rightdims) tensor
This helper function is used to convert pooling inputs with arbitrary
non-pooling dimensions to the correct number of dimensions for the
GPU pooling ops.
This reduces or expands the number of dimensions of the input to
exactly `leftdims`, by adding extra dimensions on the left or by
combining some existing dimensions on the left of the input.
Use `unpad_dims` to reshape back to the original dimensions.
Examples
--------
Given input of shape (3, 5, 7), ``pad_dims(input, 2, 2)``
adds a singleton dimension and reshapes to (3, 1, 5, 7).
Given that output from pad_dims, ``unpad_dims(output, input, 2, 2)``
reshapes back to (3, 5, 7).
Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 2)``
does not reshape and returns output with shape (3, 5, 7, 9).
Given input of shape (3, 5, 7, 9, 11), ``pad_dims(input, 2, 2)``
combines the first two dimensions and reshapes to (8, 7, 9, 11).
Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 3)``
adds a singleton dimension and reshapes to (3, 1, 5, 7, 9).
"""
assert input.ndim >= rightdims
if input.ndim == (leftdims + rightdims):
return input
# extract image dimensions
img_shape = input.shape[-rightdims:]
non_pool_ndim = input.ndim - rightdims
if non_pool_ndim < leftdims:
# too few dimensions, pad on the left
dummy_dims = tensor.as_tensor([1] * (leftdims - non_pool_ndim))
new_shape = tensor.join(0, dummy_dims,
input.shape[:non_pool_ndim],
img_shape)
else:
# too many dimensions, combine the leading dimensions
batched_ndim = non_pool_ndim - leftdims + 1
batch_size = tensor.prod(input.shape[:batched_ndim])
# convert to a vector for tensor.join
batch_size = tensor.shape_padright(batch_size, 1)
new_shape = tensor.join(0, batch_size,
input.shape[batched_ndim:non_pool_ndim],
img_shape)
# store in the required shape
new_shape = tensor.cast(new_shape, 'int64')
input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
return input_ND
def unpad_dims(output, input, leftdims, rightdims):
"""Reshapes the output after pad_dims.
This reverts the padding by `pad_dims`.
"""
if output.ndim == input.ndim:
return output
# restore the output to the original shape
outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
return GpuReshape(input.ndim)(output, outshp)
......@@ -326,7 +326,9 @@ if 0:
def test_downsample():
shps = [(1, 1, 1, 12),
shps = [(1, 12),
(1, 1, 12),
(1, 1, 1, 12),
(1, 1, 2, 2),
(1, 1, 1, 1),
(1, 1, 4, 4),
......@@ -359,17 +361,17 @@ def test_downsample():
for shp in shps:
for ds in (2, 2), (3, 2), (1, 1):
if ds[0] > shp[2]:
if ds[0] > shp[-2]:
continue
if ds[1] > shp[3]:
if ds[1] > shp[-1]:
continue
# GpuDownsampleFactorMax doesn't like having more than 512 columns
# in the output tensor.
if float(shp[3]) / ds[1] > 512:
if float(shp[-1]) / ds[1] > 512:
continue
for ignore_border in (True, False):
# print 'test_downsample', shp, ds, ignore_border
ds_op = Pool(ignore_border=ignore_border)
ds_op = Pool(ndim=len(ds), ignore_border=ignore_border)
a = tcn.shared_constructor(my_rand(*shp), 'a')
f = pfunc([], ds_op(tensor.as_tensor_variable(a), ds),
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论