提交 a16e91f7 authored 作者: Gijs van Tulder's avatar Gijs van Tulder

theano.tensor.signal.Pool with 3D support.

上级 172e699c
......@@ -35,7 +35,7 @@ from .nnet import GpuSoftmax
from .opt import (gpu_seqopt, register_opt,
op_lifter, register_opt2)
from .opt_util import alpha_merge, output_merge, inplace_allocempty
from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
......@@ -1253,7 +1253,7 @@ class GpuDnnPoolGrad(DnnBase):
return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
def dnn_pool(img, ws, stride=None, mode='max', pad=None):
"""
GPU pooling using cuDNN from NVIDIA.
......@@ -1267,13 +1267,13 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
img
Images to do the pooling over.
ws : tuple
Subsampling window size.
Subsampling window size. Should have 2 or 3 elements.
stride : tuple
Subsampling stride (default: (1, 1)).
Subsampling stride (default: (1, 1) or (1, 1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'}
pad : tuple
(padX, padY) or (padX, padY, padZ)
default: (0, 0)
default: (0, 0) or (0, 0, 0)
.. warning:: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
......@@ -1285,6 +1285,10 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
"""
img = gpu_contiguous(img)
if stride is None:
stride = (1,) * len(ws)
if pad is None:
pad = (0,) * len(ws)
if mode == "sum":
ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad)
context_name = ret.type.context_name
......@@ -1868,9 +1872,18 @@ def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not op.ignore_border:
return
img, ws, stride, pad = inputs
img = as_gpuarray_variable(img, ctx_name)
nd = op.ndim if op.ndim else (img.ndim - 2)
if nd not in (2, 3):
return
img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
mode = op.mode
return dnn_pool(gpu_contiguous(img), ws, stride=stride, pad=pad, mode=mode)
if img.ndim == nd + 2:
return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
img_padded = pad_dims(img, 2, nd)
ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
return unpad_dims(ret_padded, img, 2, nd)
@register_opt('cudnn', 'fast_compile')
......@@ -1882,17 +1895,33 @@ def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not op.ignore_border:
return
inp, out, out_grad, ws, stride, pad = inputs
inp = as_gpuarray_variable(inp, ctx_name)
out = as_gpuarray_variable(out, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name)
nd = op.ndim if op.ndim else (inp.ndim - 2)
if nd not in (2, 3):
return
inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
mode = op.mode
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(out_grad),
if inp.ndim == nd + 2:
return GpuDnnPoolGrad(mode=mode)(inp,
out,
out_grad,
ws,
stride,
pad)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
inp_padded = pad_dims(inp, 2, nd)
out_padded = pad_dims(out, 2, nd)
out_grad_padded = pad_dims(out_grad, 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
out_padded,
out_grad_padded,
ws,
stride,
pad)
return unpad_dims(ret_padded, inp, 2, nd)
@register_opt('cudnn', 'fast_compile')
......@@ -1904,16 +1933,28 @@ def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not op.ignore_border:
return
inp, out_grad, ws, stride, pad = inputs
inp = as_gpuarray_variable(inp, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name)
nd = op.ndim if op.ndim else (inp.ndim - 2)
if nd not in (2, 3):
return
inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
mode = op.mode
cg = gpu_contiguous(out_grad)
# We reuse cg because cuDNN does not use the value of the `out`
if inp.ndim == nd + 2:
# We reuse out_grad because cuDNN does not use the value of the `out`
# argument but still checks its shape for average pooling. This
# has been observed in v2 and v3 as far as I know.
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), cg, cg, ws, stride, pad)
return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride, pad)
else:
inp_padded = pad_dims(inp, 2, nd)
out_grad_padded = pad_dims(out_grad, 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
out_grad_padded,
out_grad_padded,
ws,
stride,
pad)
return unpad_dims(ret_padded, inp, 2, nd)
@register_opt('cudnn', 'fast_compile')
......
......@@ -3,12 +3,12 @@ from functools import wraps
import numpy
from theano import scalar as scal, Constant
from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, gpu_alloc_empty
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, GpuReshape, gpu_alloc_empty
from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
......@@ -329,3 +329,48 @@ def inplace_allocempty(op, idx):
return maker(node, inputs)
return opt
return wrapper
def pad_dims(input, leftdims, rightdims):
"""Reshapes the input to a (leftdims + rightdims) tensor
This helper function is used to convert pooling inputs with arbitrary
non-pooling dimensions to the correct number of dimensions for the
GPU pooling ops.
This reduces or expands the number of dimensions of the input to
exactly `leftdims`, by adding extra dimensions on the left or by
combining some existing dimensions on the left of the input.
"""
assert input.ndim >= rightdims
if input.ndim == (leftdims + rightdims):
return input
# extract image dimensions
img_shape = input.shape[-rightdims:]
# count the number of "leading" dimensions, store as dmatrix
batch_size = tensor.prod(input.shape[:-rightdims])
batch_size = tensor.shape_padright(batch_size, 1)
# store in the required shape, for example as a 4D tensor
# with shape: (batch_size,1,height,width)
new_shape = tensor.cast(tensor.join(0, batch_size,
tensor.as_tensor([1] * (leftdims - 1)),
img_shape), 'int64')
input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
return input_ND
def unpad_dims(output, input, leftdims, rightdims):
"""Reshapes the output after pad_dims.
This reverts the padding by `pad_dims`.
"""
if output.ndim == input.ndim:
return output
# restore the output to the original shape
outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
return GpuReshape(input.ndim)(output, outshp)
......@@ -30,7 +30,8 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax
from theano.sandbox.cuda.opt_util import alpha_merge, output_merge
from theano.sandbox.cuda.opt_util import (alpha_merge, output_merge,
pad_dims, unpad_dims)
from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
......@@ -1391,20 +1392,23 @@ class GpuDnnPoolDesc(GpuOp):
def do_constant_folding(self, node):
return False
def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)):
def __init__(self, ws=(1, 1), stride=None, mode='max', pad=None):
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
if stride is None:
stride = (1,) * len(ws)
if pad is None:
pad = (0,) * len(ws)
assert len(ws) == len(stride) and len(stride) == len(pad)
assert len(ws) in (2, 3)
self.ws = ws
self.stride = stride
self.pad = pad
if (pad[0] != 0 or pad[1] != 0) and version() == -1:
raise RuntimeError("cuDNN pooling with padding requires cuDNN v2")
if self.get_ndim() == 3 and version() < (3000, 3000):
raise RuntimeError("cuDNN 3d pooling requires cuDNN v3")
if (mode == 'average_exc_pad' and max(pad) > 0 and
......@@ -1418,12 +1422,9 @@ class GpuDnnPoolDesc(GpuOp):
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'pad'):
self.pad = (0, 0)
self.pad = (0,) * self.get_ndim()
def make_node(self):
if self.pad != (0, 0) and version() == -1:
raise RuntimeError("cuDNN pooling with padding requires cuDNN v2")
node = Apply(self, [],
[CDataType("cudnnPoolingDescriptor_t",
freefunc="cudnnDestroyPoolingDescriptor")()])
......@@ -1444,8 +1445,6 @@ class GpuDnnPoolDesc(GpuOp):
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else:
raise NotImplementedError("Unsupported pooling model.")
......@@ -1616,8 +1615,6 @@ if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); }
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else:
raise NotImplementedError("Unsupported pooling model.")
......@@ -1872,8 +1869,6 @@ if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); }
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else:
raise NotImplementedError("Unsupported pooling model.")
......@@ -1976,28 +1971,33 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
def dnn_pool(img, ws, stride=None, mode='max', pad=None):
"""
GPU pooling using cuDNN from NVIDIA.
The memory layout to use is 'bc01', that is 'batch', 'channel',
'first dim', 'second dim' in that order.
For 2D pooling, the memory layout to use is 'bc01', that is 'batch',
'channel', 'first dim', 'second dim' in that order.
For 3D pooling, the memory layout to use is 'bc012', that is 'batch',
'channel', 'first dim', 'second dim', 'third dim'.
Parameters
----------
img
Images to do the pooling over.
ws
Subsampling window size.
Subsampling window size. Should have 2 or 3 elements.
stride
Subsampling stride (default: (1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad, 'sum'}
pad :
(pad_h, pad_w) padding information.
Subsampling stride (default: (1, 1) or (1, 1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'}
pad
Padding: (pad_h, pad_w) for 2D or (pad_h, pad_w, pad_d) for 3D.
pad_h is the number of zero-valued pixels added to each of the top and
bottom borders.
pad_w is the number of zero-valued pixels added to each of the left
and right borders.
pad_d is the number of zero-valued pixels added to each of the front
and back borders (3D pooling only).
.. warning:: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
......@@ -2009,6 +2009,10 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
"""
img = gpu_contiguous(img)
if stride is None:
stride = (1,) * len(ws)
if pad is None:
pad = (0,) * len(ws)
if mode == "sum":
ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad)
window_elem = theano.tensor.prod(ws).astype(ret.dtype)
......@@ -2972,10 +2976,21 @@ if True:
if not node.op.ignore_border:
return
img, ws, stride, pad = node.inputs
nd = node.op.ndim if node.op.ndim else (img.ndim - 2)
mode = node.op.mode
if nd not in (2, 3):
return
if (img.owner and isinstance(img.owner.op, HostFromGpu)):
if img.ndim == nd + 2:
ret = dnn_pool(gpu_contiguous(img.owner.inputs[0]),
ws, stride=stride, pad=pad, mode=mode)
else:
input = gpu_contiguous(img.owner.inputs[0])
# reshape to 4D or 5D with 2 non-pooling dimensions
input_padded = pad_dims(input, 2, nd)
ret_padded = dnn_pool(input_padded,
ws, stride=stride, pad=pad, mode=mode)
ret = unpad_dims(ret_padded, input, 2, nd)
return [host_from_gpu(ret)]
@register_opt('cudnn')
......@@ -3003,17 +3018,30 @@ if True:
if not node.op.ignore_border:
return
inp, out, inp_grad, ws, stride, pad = node.inputs
nd = node.op.ndim if node.op.ndim else (inp.ndim - 2)
mode = node.op.mode
if nd not in (2, 3):
return
if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
(out.owner and isinstance(out.owner.op, HostFromGpu)) or
(inp_grad.owner and isinstance(inp_grad.owner.op,
HostFromGpu))):
if inp.ndim == nd + 2:
ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
ws, stride, pad)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
inp_padded = pad_dims(gpu_contiguous(inp), 2, nd)
out_padded = pad_dims(gpu_contiguous(out), 2, nd)
inp_grad_padded = pad_dims(gpu_contiguous(inp_grad), 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
out_padded,
inp_grad_padded,
ws, stride, pad)
ret = unpad_dims(ret_padded, inp, 2, nd)
return [host_from_gpu(ret)]
@register_opt('cudnn')
......@@ -3025,16 +3053,28 @@ if True:
if not node.op.ignore_border:
return
inp, inp_grad, ws, stride, pad = node.inputs
nd = node.op.ndim if node.op.ndim else (inp.ndim - 2)
mode = node.op.mode
if nd not in (2, 3):
return
if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
(inp_grad.owner and isinstance(inp_grad.owner.op,
HostFromGpu))):
if inp.ndim == nd + 2:
contiguous_inp_grad = gpu_contiguous(inp_grad)
ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
contiguous_inp_grad,
contiguous_inp_grad,
ws, stride, pad)
else:
inp_padded = pad_dims(gpu_contiguous(inp), 2, nd)
inp_grad_padded = pad_dims(gpu_contiguous(inp_grad), 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
inp_grad_padded,
inp_grad_padded,
ws, stride, pad)
ret = unpad_dims(ret_padded, inp, 2, nd)
return [host_from_gpu(ret)]
@register_opt('cudnn')
......
......@@ -40,6 +40,7 @@ from theano.sandbox.cuda.basic_ops import (
GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
from theano.sandbox.cuda.opt_util import pad_dims, unpad_dims
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (
......@@ -1891,15 +1892,12 @@ def local_convtransp3d_gemm(node):
gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm)
def _check_constant_args_pool(ws, stride, pad, node):
def _check_constant_args_pool(ndim, ws, stride, pad, node):
"""Check if the args of pool are constants. Warns if not."""
try:
ws_w = tensor.get_scalar_constant_value(ws[0])
ws_h = tensor.get_scalar_constant_value(ws[1])
stride_w = tensor.get_scalar_constant_value(stride[0])
stride_h = tensor.get_scalar_constant_value(stride[1])
pad_w = tensor.get_scalar_constant_value(pad[0])
pad_h = tensor.get_scalar_constant_value(pad[1])
ws = tuple(tensor.get_scalar_constant_value(ws[i]) for i in range(ndim))
stride = tuple(tensor.get_scalar_constant_value(stride[i]) for i in range(ndim))
pad = tuple(tensor.get_scalar_constant_value(pad[i]) for i in range(ndim))
except tensor.NotScalarConstantError:
msg = ("Pool with tensor variable for the window size, stride or "
"padding is only supported in the new GPU backend, so this op "
......@@ -1909,65 +1907,96 @@ def _check_constant_args_pool(ws, stride, pad, node):
elif config.assert_no_cpu_op == "raise":
raise AssertionError(msg)
return None
ws = (ws_w, ws_h)
stride = (stride_w, stride_h)
pad = (pad_w, pad_h)
return ws, stride, pad
@register_opt()
@local_optimizer([pool.Pool])
def local_gpu_downsample_factor_max(node):
if isinstance(node.op, pool.Pool):
assert node.op.__props__ == ('ignore_border', 'mode')
if (isinstance(node.op, pool.Pool)):
assert node.op.__props__ == ('ndim', 'ignore_border', 'mode')
x, ws, stride, pad = node.inputs
ret = _check_constant_args_pool(ws, stride, pad, node)
nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
ret = _check_constant_args_pool(nd, ws, stride, pad, node)
if ret is None:
return
ws, stride, pad = ret
if (pad) != (0, 0) or node.op.mode != 'max' or stride != ws:
if (nd != 2 or
max(node.op.padding) != 0 or
node.op.mode != 'max' or
stride != ws):
return
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_ds = GpuDownsampleFactorMax(ws, node.op.ignore_border)
return [host_from_gpu(gpu_ds(x.owner.inputs[0]))]
gpu_ws = GpuDownsampleFactorMax(ws, node.op.ignore_border)
if node.inputs[0].ndim == 4:
return [host_from_gpu(gpu_ws(x.owner.inputs[0]))]
else:
input_4D = pad_dims(x.owner.inputs[0], 2, 2)
output_4D = gpu_ws(input_4D)
output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
return [host_from_gpu(output)]
@register_opt()
@local_optimizer([pool.MaxPoolGrad])
def local_gpu_downsample_factor_max_grad(node):
if isinstance(node.op, pool.MaxPoolGrad):
assert node.op.__props__ == ('ignore_border', 'mode')
if (isinstance(node.op, pool.MaxPoolGrad)):
assert node.op.__props__ == ('ndim', 'ignore_border', 'mode')
x, z, gz, ws, stride, pad = node.inputs
ret = _check_constant_args_pool(ws, stride, pad, node)
nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
ret = _check_constant_args_pool(nd, ws, stride, pad, node)
if ret is None:
return
ws, stride, pad = ret
if pad != (0, 0) or node.op.mode != 'max' or stride != ws:
if (nd != 2 or
max(node.op.padding) != 0 or
node.op.mode != 'max' or
stride != ws):
return
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_ds_grad = GpuDownsampleFactorMaxGrad(ws, node.op.ignore_border)
return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0],
gpu_ws_grad = GpuDownsampleFactorMaxGrad(ws, node.op.ignore_border)
if node.inputs[0].ndim == 4:
return [host_from_gpu(gpu_ws_grad(x.owner.inputs[0],
as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(gz)))]
else:
x_4D = pad_dims(x.owner.inputs[0], 2, 2)
z_4D = pad_dims(as_cuda_ndarray_variable(z), 2, 2)
gz_4D = pad_dims(as_cuda_ndarray_variable(gz), 2, 2)
output_4D = gpu_ws_grad(x_4D, z_4D, gz_4D)
output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
return [host_from_gpu(output)]
@register_opt()
@local_optimizer([pool.DownsampleFactorMaxGradGrad])
def local_gpu_downsample_factor_max_grad_grad(node):
if isinstance(node.op, pool.DownsampleFactorMaxGradGrad):
assert node.op.__props__ == ('ignore_border', 'mode')
assert node.op.__props__ == ('ndim', 'ignore_border', 'mode')
x, z, gx, ws, stride, pad = node.inputs
ret = _check_constant_args_pool(ws, stride, pad, node)
nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
ret = _check_constant_args_pool(nd, ws, stride, pad, node)
if ret is None:
return
ws, stride, pad = ret
if pad != (0, 0) or node.op.mode != 'max' or stride != ws:
if (nd != 2 or
max(node.op.padding) != 0 or
node.op.mode != 'max' or
stride != ws):
return
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
op = GpuDownsampleFactorMaxGradGrad(ws, node.op.ignore_border)
if node.inputs[0].ndim == 4:
return [host_from_gpu(op(x.owner.inputs[0],
as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(gx)))]
else:
x_4D = pad_dims(x.owner.inputs[0], 2, 2)
z_4D = pad_dims(as_cuda_ndarray_variable(z), 2, 2)
gx_4D = pad_dims(as_cuda_ndarray_variable(gx), 2, 2)
output_4D = op(x_4D, z_4D, gx_4D)
output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
return [host_from_gpu(output)]
@register_opt()
......
......@@ -3,13 +3,13 @@ from functools import wraps
import numpy
from theano import scalar as scal, Constant
from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from theano.sandbox.cuda.basic_ops import (
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise)
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise, GpuReshape)
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
......@@ -126,3 +126,48 @@ def output_merge(cls, alpha_in, beta_in, out_in):
return maker(targ, *inputs)
return opt
return wrapper
def pad_dims(input, leftdims, rightdims):
"""Reshapes the input to a (leftdims + rightdims) tensor
This helper function is used to convert pooling inputs with arbitrary
non-pooling dimensions to the correct number of dimensions for the
GPU pooling ops.
This reduces or expands the number of dimensions of the input to
exactly `leftdims`, by adding extra dimensions on the left or by
combining some existing dimensions on the left of the input.
"""
assert input.ndim >= rightdims
if input.ndim == (leftdims + rightdims):
return input
# extract image dimensions
img_shape = input.shape[-rightdims:]
# count the number of "leading" dimensions, store as dmatrix
batch_size = tensor.prod(input.shape[:-rightdims])
batch_size = tensor.shape_padright(batch_size, 1)
# store in the required shape, for example as a 4D tensor
# with shape: (batch_size,1,height,width)
new_shape = tensor.cast(tensor.join(0, batch_size,
tensor.as_tensor([1] * (leftdims - 1)),
img_shape), 'int64')
input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
return input_ND
def unpad_dims(output, input, leftdims, rightdims):
"""Reshapes the output after pad_dims.
This reverts the padding by `pad_dims`.
"""
if output.ndim == input.ndim:
return output
# restore the output to the original shape
outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
return GpuReshape(input.ndim)(output, outshp)
......@@ -326,7 +326,9 @@ if 0:
def test_downsample():
shps = [(1, 1, 1, 12),
shps = [(1, 12),
(1, 1, 12),
(1, 1, 1, 12),
(1, 1, 2, 2),
(1, 1, 1, 1),
(1, 1, 4, 4),
......@@ -359,17 +361,17 @@ def test_downsample():
for shp in shps:
for ds in (2, 2), (3, 2), (1, 1):
if ds[0] > shp[2]:
if ds[0] > shp[-2]:
continue
if ds[1] > shp[3]:
if ds[1] > shp[-1]:
continue
# GpuDownsampleFactorMax doesn't like having more than 512 columns
# in the output tensor.
if float(shp[3]) / ds[1] > 512:
if float(shp[-1]) / ds[1] > 512:
continue
for ignore_border in (True, False):
# print 'test_downsample', shp, ds, ignore_border
ds_op = Pool(ignore_border=ignore_border)
ds_op = Pool(ndim=len(ds), ignore_border=ignore_border)
a = tcn.shared_constructor(my_rand(*shp), 'a')
f = pfunc([], ds_op(tensor.as_tensor_variable(a), ds),
......
......@@ -15,8 +15,8 @@ import theano
import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.sandbox.neighbours import images2neibs
from theano.tensor.signal.pool import pool_2d
from theano.tensor.signal.pool import MaxPoolGrad, AveragePoolGrad
from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared
......@@ -170,7 +170,7 @@ def test_dnn_conv_inplace():
def pool3d2d(input, ds=(2, 2, 2), strides=None, pad=(0, 0, 0),
pool_func=T.max, mode='ignore_borders'):
pool_function=T.max, mode='ignore_borders'):
if strides is None:
strides = ds
......@@ -179,13 +179,13 @@ def pool3d2d(input, ds=(2, 2, 2), strides=None, pad=(0, 0, 0),
# resahpe to B, C*0, 1, 2 and do the pooling on 1, 2
first = input.reshape((shape[0], shape[1] * shape[2], shape[3], shape[4]))
pooled1 = pool_2d_i2n(first, ds=ds[1:], strides=strides[1:], pad=pad[1:],
pool_function=pool_func, mode=mode)
pool_function=pool_function, mode=mode)
shp1 = pooled1.shape
# reshape to B, C, 0, 1*2 and do the pooling on 0
second = pooled1.reshape((shape[0], shape[1], shape[2], shp1[2] * shp1[3]))
pooled2 = pool_2d_i2n(second, ds=(ds[0], 1), strides=(strides[0], 1),
pad=(pad[0], 0), pool_function=pool_func, mode=mode)
pad=(pad[0], 0), pool_function=pool_function, mode=mode)
shp2 = pooled2.shape
return pooled2.reshape((shape[0], shape[1], shp2[2], shp1[2], shp1[3]))
......@@ -241,8 +241,6 @@ def test_pooling():
func = T.max
else:
func = T.mean
if pad != (0, 0) and cuda.dnn.version() == -1:
continue
if pad != (0, 0) and func is T.mean:
continue
......@@ -418,6 +416,7 @@ def test_pooling3d():
if not cuda.dnn.dnn_available() or cuda.dnn.version() < (3000, 3000):
raise SkipTest(cuda.dnn.dnn_available.msg)
# We force the FAST_RUN as we don't want the reference to run in DebugMode.
mode_without_gpu_ref = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpu')
......@@ -427,8 +426,7 @@ def test_pooling3d():
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
x = T.TensorType(broadcastable=(False, False, False, False, False),
dtype='float32')()
x = T.ftensor5()
for mode, pad in product(modes,
((0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1),
(2, 3, 2), (3, 2, 2), (2, 2, 3))):
......@@ -436,8 +434,6 @@ def test_pooling3d():
func = T.max
else:
func = T.mean
if pad != (0, 0, 0) and cuda.dnn.version() == -1:
continue
if pad != (0, 0, 0) and func is T.mean:
continue
......@@ -449,13 +445,13 @@ def test_pooling3d():
if pad[0] > stride or pad[1] > stride or pad[2] > stride:
# Not implemented
continue
out1 = cuda.dnn.dnn_pool(x, (ws, ws, ws),
stride=(stride, stride, stride),
pad=pad, mode=mode)
out2 = pool3d2d(x, ds=(ws, ws, ws),
strides=(stride, stride, stride),
pad=pad, pool_func=func)
out1 = pool_3d(x, (ws, ws, ws),
st=(stride, stride, stride),
ignore_border=True,
padding=pad, mode=mode)
out2 = pool3d2d(x, ds=(ws, ws, ws), strides=(stride, stride, stride),
pad=pad,
pool_function=func)
f1 = theano.function([x], out1, mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
......@@ -510,11 +506,17 @@ def test_pooling3d():
g_out = fg(data)
# Compare again the CPU result
out = pool3d2d(x, (ws, ws, ws),
strides=(stride, stride, stride),
pad=pad, pool_func=func)
out = pool_3d(x, (ws, ws, ws),
padding=pad,
ignore_border=True, mode=mode)
fc = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu_ref)
if mode == 'max':
assert any([isinstance(node.op, MaxPoolGrad)
for node in fc.maker.fgraph.toposort()])
else:
assert any([isinstance(node.op, AveragePoolGrad)
for node in fc.maker.fgraph.toposort()])
c_out = fc(data)
utt.assert_allclose(c_out, g_out)
......@@ -523,6 +525,7 @@ def test_pooling_opt():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
# 2D pooling
x = T.fmatrix()
f = theano.function(
......@@ -535,6 +538,7 @@ def test_pooling_opt():
f(numpy.zeros((10, 10), dtype='float32'))
# gradient of 2D pooling
f = theano.function(
[x],
T.grad(pool_2d(x, ds=(2, 2), mode='average_inc_pad',
......@@ -545,6 +549,7 @@ def test_pooling_opt():
for n in f.maker.fgraph.toposort()])
f(numpy.zeros((10, 10), dtype='float32'))
# Test sum pooling
f = theano.function(
[x],
......@@ -557,6 +562,82 @@ def test_pooling_opt():
data = numpy.random.rand(10, 10).astype('float32')
f(data)
# 3D pooling
x = T.ftensor3()
f = theano.function(
[x],
pool_3d(x, ds=(2, 2, 2), mode='average_inc_pad', ignore_border=True),
mode=mode_with_gpu)
assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
for n in f.maker.fgraph.toposort()])
f(numpy.zeros((10, 10, 10), dtype='float32'))
# gradient of 3D pooling
f = theano.function(
[x],
T.grad(pool_3d(x, ds=(2, 2, 2), mode='average_inc_pad',
ignore_border=True).sum(), x),
mode=mode_with_gpu.including("cudnn"))
assert any([isinstance(n.op, cuda.dnn.GpuDnnPoolGrad)
for n in f.maker.fgraph.toposort()])
f(numpy.zeros((10, 10, 10), dtype='float32'))
def test_pooling_opt_arbitrary_dimensions():
# test if input with an arbitrary number of non-pooling dimensions
# is correctly reshaped to run on the GPU
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
# 'average_exc_pad' is disabled for versions < 4004
if cuda.dnn.version() < (4004, 4004):
modes = ('max', 'average_inc_pad')
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
for n_non_pool_dims in (0, 1, 2, 3):
for ws in ((2, 2), (3, 3, 3)):
# create input shape: non-pooling dimensions
# followed by 2 or 3 pooling dimensions
shp = (2,) * n_non_pool_dims + (5,) * len(ws)
data = numpy.random.normal(0, 1, shp).astype('float32')
input = shared(data)
for mode in modes:
out_pool = Pool(ndim=len(ws), mode=mode, ignore_border=True)(input, ws)
out_pool_grad = T.grad(T.sum(out_pool), wrt=input)
out = [out_pool, out_pool_grad]
# run on GPU
fg = theano.function([], out, mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in fg.maker.fgraph.toposort()])
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
res_gpu = fg()
# run on CPU
fc = theano.function([], out, mode=mode_without_gpu)
assert any([isinstance(node.op, Pool)
for node in fc.maker.fgraph.toposort()])
if mode == 'max':
assert any([isinstance(node.op, MaxPoolGrad)
for node in fc.maker.fgraph.toposort()])
else:
assert any([isinstance(node.op, AveragePoolGrad)
for node in fc.maker.fgraph.toposort()])
res_cpu = fg()
# check for similarity
utt.assert_allclose(res_gpu[0], res_cpu[0])
utt.assert_allclose(res_gpu[1], res_cpu[1])
class test_DnnSoftMax(test_nnet.test_SoftMax):
gpu_op = dnn.GpuDnnSoftmax
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论