提交 a16e91f7 authored 作者: Gijs van Tulder's avatar Gijs van Tulder

theano.tensor.signal.Pool with 3D support.

上级 172e699c
...@@ -35,7 +35,7 @@ from .nnet import GpuSoftmax ...@@ -35,7 +35,7 @@ from .nnet import GpuSoftmax
from .opt import (gpu_seqopt, register_opt, from .opt import (gpu_seqopt, register_opt,
op_lifter, register_opt2) op_lifter, register_opt2)
from .opt_util import alpha_merge, output_merge, inplace_allocempty from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
...@@ -1253,7 +1253,7 @@ class GpuDnnPoolGrad(DnnBase): ...@@ -1253,7 +1253,7 @@ class GpuDnnPoolGrad(DnnBase):
return [shape[0]] return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)): def dnn_pool(img, ws, stride=None, mode='max', pad=None):
""" """
GPU pooling using cuDNN from NVIDIA. GPU pooling using cuDNN from NVIDIA.
...@@ -1267,13 +1267,13 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)): ...@@ -1267,13 +1267,13 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
img img
Images to do the pooling over. Images to do the pooling over.
ws : tuple ws : tuple
Subsampling window size. Subsampling window size. Should have 2 or 3 elements.
stride : tuple stride : tuple
Subsampling stride (default: (1, 1)). Subsampling stride (default: (1, 1) or (1, 1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'} mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'}
pad : tuple pad : tuple
(padX, padY) or (padX, padY, padZ) (padX, padY) or (padX, padY, padZ)
default: (0, 0) default: (0, 0) or (0, 0, 0)
.. warning:: The cuDNN library only works with GPU that have a compute .. warning:: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not capability of 3.0 or higer. This means that older GPU will not
...@@ -1285,6 +1285,10 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)): ...@@ -1285,6 +1285,10 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
""" """
img = gpu_contiguous(img) img = gpu_contiguous(img)
if stride is None:
stride = (1,) * len(ws)
if pad is None:
pad = (0,) * len(ws)
if mode == "sum": if mode == "sum":
ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad) ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad)
context_name = ret.type.context_name context_name = ret.type.context_name
...@@ -1868,9 +1872,18 @@ def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs): ...@@ -1868,9 +1872,18 @@ def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not op.ignore_border: if not op.ignore_border:
return return
img, ws, stride, pad = inputs img, ws, stride, pad = inputs
img = as_gpuarray_variable(img, ctx_name) nd = op.ndim if op.ndim else (img.ndim - 2)
if nd not in (2, 3):
return
img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
mode = op.mode mode = op.mode
return dnn_pool(gpu_contiguous(img), ws, stride=stride, pad=pad, mode=mode) if img.ndim == nd + 2:
return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
img_padded = pad_dims(img, 2, nd)
ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
return unpad_dims(ret_padded, img, 2, nd)
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
...@@ -1882,17 +1895,33 @@ def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs): ...@@ -1882,17 +1895,33 @@ def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not op.ignore_border: if not op.ignore_border:
return return
inp, out, out_grad, ws, stride, pad = inputs inp, out, out_grad, ws, stride, pad = inputs
inp = as_gpuarray_variable(inp, ctx_name) nd = op.ndim if op.ndim else (inp.ndim - 2)
out = as_gpuarray_variable(out, ctx_name) if nd not in (2, 3):
out_grad = as_gpuarray_variable(out_grad, ctx_name) return
inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
mode = op.mode mode = op.mode
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), if inp.ndim == nd + 2:
gpu_contiguous(out), return GpuDnnPoolGrad(mode=mode)(inp,
gpu_contiguous(out_grad), out,
out_grad,
ws, ws,
stride, stride,
pad) pad)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
inp_padded = pad_dims(inp, 2, nd)
out_padded = pad_dims(out, 2, nd)
out_grad_padded = pad_dims(out_grad, 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
out_padded,
out_grad_padded,
ws,
stride,
pad)
return unpad_dims(ret_padded, inp, 2, nd)
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
...@@ -1904,16 +1933,28 @@ def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs): ...@@ -1904,16 +1933,28 @@ def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not op.ignore_border: if not op.ignore_border:
return return
inp, out_grad, ws, stride, pad = inputs inp, out_grad, ws, stride, pad = inputs
inp = as_gpuarray_variable(inp, ctx_name) nd = op.ndim if op.ndim else (inp.ndim - 2)
out_grad = as_gpuarray_variable(out_grad, ctx_name) if nd not in (2, 3):
return
inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
mode = op.mode mode = op.mode
cg = gpu_contiguous(out_grad) if inp.ndim == nd + 2:
# We reuse out_grad because cuDNN does not use the value of the `out`
# We reuse cg because cuDNN does not use the value of the `out`
# argument but still checks its shape for average pooling. This # argument but still checks its shape for average pooling. This
# has been observed in v2 and v3 as far as I know. # has been observed in v2 and v3 as far as I know.
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), cg, cg, ws, stride, pad) return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride, pad)
else:
inp_padded = pad_dims(inp, 2, nd)
out_grad_padded = pad_dims(out_grad, 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
out_grad_padded,
out_grad_padded,
ws,
stride,
pad)
return unpad_dims(ret_padded, inp, 2, nd)
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
......
...@@ -3,12 +3,12 @@ from functools import wraps ...@@ -3,12 +3,12 @@ from functools import wraps
import numpy import numpy
from theano import scalar as scal, Constant from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value, from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError) NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, gpu_alloc_empty from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, GpuReshape, gpu_alloc_empty
from .elemwise import GpuDimShuffle, GpuElemwise from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32')) _one = scal.constant(numpy.asarray(1.0, dtype='float32'))
...@@ -329,3 +329,48 @@ def inplace_allocempty(op, idx): ...@@ -329,3 +329,48 @@ def inplace_allocempty(op, idx):
return maker(node, inputs) return maker(node, inputs)
return opt return opt
return wrapper return wrapper
def pad_dims(input, leftdims, rightdims):
"""Reshapes the input to a (leftdims + rightdims) tensor
This helper function is used to convert pooling inputs with arbitrary
non-pooling dimensions to the correct number of dimensions for the
GPU pooling ops.
This reduces or expands the number of dimensions of the input to
exactly `leftdims`, by adding extra dimensions on the left or by
combining some existing dimensions on the left of the input.
"""
assert input.ndim >= rightdims
if input.ndim == (leftdims + rightdims):
return input
# extract image dimensions
img_shape = input.shape[-rightdims:]
# count the number of "leading" dimensions, store as dmatrix
batch_size = tensor.prod(input.shape[:-rightdims])
batch_size = tensor.shape_padright(batch_size, 1)
# store in the required shape, for example as a 4D tensor
# with shape: (batch_size,1,height,width)
new_shape = tensor.cast(tensor.join(0, batch_size,
tensor.as_tensor([1] * (leftdims - 1)),
img_shape), 'int64')
input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
return input_ND
def unpad_dims(output, input, leftdims, rightdims):
"""Reshapes the output after pad_dims.
This reverts the padding by `pad_dims`.
"""
if output.ndim == input.ndim:
return output
# restore the output to the original shape
outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
return GpuReshape(input.ndim)(output, outshp)
...@@ -30,7 +30,8 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, ...@@ -30,7 +30,8 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad) GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax from theano.sandbox.cuda.nnet import GpuSoftmax
from theano.sandbox.cuda.opt_util import alpha_merge, output_merge from theano.sandbox.cuda.opt_util import (alpha_merge, output_merge,
pad_dims, unpad_dims)
from theano.sandbox.cuda import gpu_seqopt, register_opt from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
...@@ -1391,20 +1392,23 @@ class GpuDnnPoolDesc(GpuOp): ...@@ -1391,20 +1392,23 @@ class GpuDnnPoolDesc(GpuOp):
def do_constant_folding(self, node): def do_constant_folding(self, node):
return False return False
def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)): def __init__(self, ws=(1, 1), stride=None, mode='max', pad=None):
if mode == 'average': if mode == 'average':
mode = 'average_inc_pad' mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad') assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode self.mode = mode
if stride is None:
stride = (1,) * len(ws)
if pad is None:
pad = (0,) * len(ws)
assert len(ws) == len(stride) and len(stride) == len(pad) assert len(ws) == len(stride) and len(stride) == len(pad)
assert len(ws) in (2, 3) assert len(ws) in (2, 3)
self.ws = ws self.ws = ws
self.stride = stride self.stride = stride
self.pad = pad self.pad = pad
if (pad[0] != 0 or pad[1] != 0) and version() == -1:
raise RuntimeError("cuDNN pooling with padding requires cuDNN v2")
if self.get_ndim() == 3 and version() < (3000, 3000): if self.get_ndim() == 3 and version() < (3000, 3000):
raise RuntimeError("cuDNN 3d pooling requires cuDNN v3") raise RuntimeError("cuDNN 3d pooling requires cuDNN v3")
if (mode == 'average_exc_pad' and max(pad) > 0 and if (mode == 'average_exc_pad' and max(pad) > 0 and
...@@ -1418,12 +1422,9 @@ class GpuDnnPoolDesc(GpuOp): ...@@ -1418,12 +1422,9 @@ class GpuDnnPoolDesc(GpuOp):
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
if not hasattr(self, 'pad'): if not hasattr(self, 'pad'):
self.pad = (0, 0) self.pad = (0,) * self.get_ndim()
def make_node(self): def make_node(self):
if self.pad != (0, 0) and version() == -1:
raise RuntimeError("cuDNN pooling with padding requires cuDNN v2")
node = Apply(self, [], node = Apply(self, [],
[CDataType("cudnnPoolingDescriptor_t", [CDataType("cudnnPoolingDescriptor_t",
freefunc="cudnnDestroyPoolingDescriptor")()]) freefunc="cudnnDestroyPoolingDescriptor")()])
...@@ -1444,8 +1445,6 @@ class GpuDnnPoolDesc(GpuOp): ...@@ -1444,8 +1445,6 @@ class GpuDnnPoolDesc(GpuOp):
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING' mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad": elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING' mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else: else:
raise NotImplementedError("Unsupported pooling model.") raise NotImplementedError("Unsupported pooling model.")
...@@ -1616,8 +1615,6 @@ if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); } ...@@ -1616,8 +1615,6 @@ if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); }
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING' mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad": elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING' mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else: else:
raise NotImplementedError("Unsupported pooling model.") raise NotImplementedError("Unsupported pooling model.")
...@@ -1872,8 +1869,6 @@ if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); } ...@@ -1872,8 +1869,6 @@ if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); }
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING' mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad": elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING' mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else: else:
raise NotImplementedError("Unsupported pooling model.") raise NotImplementedError("Unsupported pooling model.")
...@@ -1976,28 +1971,33 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) { ...@@ -1976,28 +1971,33 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
return [shape[0]] return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)): def dnn_pool(img, ws, stride=None, mode='max', pad=None):
""" """
GPU pooling using cuDNN from NVIDIA. GPU pooling using cuDNN from NVIDIA.
The memory layout to use is 'bc01', that is 'batch', 'channel', For 2D pooling, the memory layout to use is 'bc01', that is 'batch',
'first dim', 'second dim' in that order. 'channel', 'first dim', 'second dim' in that order.
For 3D pooling, the memory layout to use is 'bc012', that is 'batch',
'channel', 'first dim', 'second dim', 'third dim'.
Parameters Parameters
---------- ----------
img img
Images to do the pooling over. Images to do the pooling over.
ws ws
Subsampling window size. Subsampling window size. Should have 2 or 3 elements.
stride stride
Subsampling stride (default: (1, 1)). Subsampling stride (default: (1, 1) or (1, 1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad, 'sum'} mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'}
pad : pad
(pad_h, pad_w) padding information. Padding: (pad_h, pad_w) for 2D or (pad_h, pad_w, pad_d) for 3D.
pad_h is the number of zero-valued pixels added to each of the top and pad_h is the number of zero-valued pixels added to each of the top and
bottom borders. bottom borders.
pad_w is the number of zero-valued pixels added to each of the left pad_w is the number of zero-valued pixels added to each of the left
and right borders. and right borders.
pad_d is the number of zero-valued pixels added to each of the front
and back borders (3D pooling only).
.. warning:: The cuDNN library only works with GPU that have a compute .. warning:: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not capability of 3.0 or higer. This means that older GPU will not
...@@ -2009,6 +2009,10 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)): ...@@ -2009,6 +2009,10 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
""" """
img = gpu_contiguous(img) img = gpu_contiguous(img)
if stride is None:
stride = (1,) * len(ws)
if pad is None:
pad = (0,) * len(ws)
if mode == "sum": if mode == "sum":
ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad) ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad)
window_elem = theano.tensor.prod(ws).astype(ret.dtype) window_elem = theano.tensor.prod(ws).astype(ret.dtype)
...@@ -2972,10 +2976,21 @@ if True: ...@@ -2972,10 +2976,21 @@ if True:
if not node.op.ignore_border: if not node.op.ignore_border:
return return
img, ws, stride, pad = node.inputs img, ws, stride, pad = node.inputs
nd = node.op.ndim if node.op.ndim else (img.ndim - 2)
mode = node.op.mode mode = node.op.mode
if nd not in (2, 3):
return
if (img.owner and isinstance(img.owner.op, HostFromGpu)): if (img.owner and isinstance(img.owner.op, HostFromGpu)):
if img.ndim == nd + 2:
ret = dnn_pool(gpu_contiguous(img.owner.inputs[0]), ret = dnn_pool(gpu_contiguous(img.owner.inputs[0]),
ws, stride=stride, pad=pad, mode=mode) ws, stride=stride, pad=pad, mode=mode)
else:
input = gpu_contiguous(img.owner.inputs[0])
# reshape to 4D or 5D with 2 non-pooling dimensions
input_padded = pad_dims(input, 2, nd)
ret_padded = dnn_pool(input_padded,
ws, stride=stride, pad=pad, mode=mode)
ret = unpad_dims(ret_padded, input, 2, nd)
return [host_from_gpu(ret)] return [host_from_gpu(ret)]
@register_opt('cudnn') @register_opt('cudnn')
...@@ -3003,17 +3018,30 @@ if True: ...@@ -3003,17 +3018,30 @@ if True:
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, out, inp_grad, ws, stride, pad = node.inputs inp, out, inp_grad, ws, stride, pad = node.inputs
nd = node.op.ndim if node.op.ndim else (inp.ndim - 2)
mode = node.op.mode mode = node.op.mode
if nd not in (2, 3):
return
if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
(out.owner and isinstance(out.owner.op, HostFromGpu)) or (out.owner and isinstance(out.owner.op, HostFromGpu)) or
(inp_grad.owner and isinstance(inp_grad.owner.op, (inp_grad.owner and isinstance(inp_grad.owner.op,
HostFromGpu))): HostFromGpu))):
if inp.ndim == nd + 2:
ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out), gpu_contiguous(out),
gpu_contiguous(inp_grad), gpu_contiguous(inp_grad),
ws, stride, pad) ws, stride, pad)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
inp_padded = pad_dims(gpu_contiguous(inp), 2, nd)
out_padded = pad_dims(gpu_contiguous(out), 2, nd)
inp_grad_padded = pad_dims(gpu_contiguous(inp_grad), 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
out_padded,
inp_grad_padded,
ws, stride, pad)
ret = unpad_dims(ret_padded, inp, 2, nd)
return [host_from_gpu(ret)] return [host_from_gpu(ret)]
@register_opt('cudnn') @register_opt('cudnn')
...@@ -3025,16 +3053,28 @@ if True: ...@@ -3025,16 +3053,28 @@ if True:
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, inp_grad, ws, stride, pad = node.inputs inp, inp_grad, ws, stride, pad = node.inputs
nd = node.op.ndim if node.op.ndim else (inp.ndim - 2)
mode = node.op.mode mode = node.op.mode
if nd not in (2, 3):
return
if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
(inp_grad.owner and isinstance(inp_grad.owner.op, (inp_grad.owner and isinstance(inp_grad.owner.op,
HostFromGpu))): HostFromGpu))):
if inp.ndim == nd + 2:
contiguous_inp_grad = gpu_contiguous(inp_grad) contiguous_inp_grad = gpu_contiguous(inp_grad)
ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
contiguous_inp_grad, contiguous_inp_grad,
contiguous_inp_grad, contiguous_inp_grad,
ws, stride, pad) ws, stride, pad)
else:
inp_padded = pad_dims(gpu_contiguous(inp), 2, nd)
inp_grad_padded = pad_dims(gpu_contiguous(inp_grad), 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
inp_grad_padded,
inp_grad_padded,
ws, stride, pad)
ret = unpad_dims(ret_padded, inp, 2, nd)
return [host_from_gpu(ret)] return [host_from_gpu(ret)]
@register_opt('cudnn') @register_opt('cudnn')
......
...@@ -40,6 +40,7 @@ from theano.sandbox.cuda.basic_ops import ( ...@@ -40,6 +40,7 @@ from theano.sandbox.cuda.basic_ops import (
GpuSubtensor, GpuAdvancedSubtensor1, GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty) GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
from theano.sandbox.cuda.opt_util import pad_dims, unpad_dims
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import ( from theano.sandbox.cuda.blas import (
...@@ -1891,15 +1892,12 @@ def local_convtransp3d_gemm(node): ...@@ -1891,15 +1892,12 @@ def local_convtransp3d_gemm(node):
gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm) gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm)
def _check_constant_args_pool(ws, stride, pad, node): def _check_constant_args_pool(ndim, ws, stride, pad, node):
"""Check if the args of pool are constants. Warns if not.""" """Check if the args of pool are constants. Warns if not."""
try: try:
ws_w = tensor.get_scalar_constant_value(ws[0]) ws = tuple(tensor.get_scalar_constant_value(ws[i]) for i in range(ndim))
ws_h = tensor.get_scalar_constant_value(ws[1]) stride = tuple(tensor.get_scalar_constant_value(stride[i]) for i in range(ndim))
stride_w = tensor.get_scalar_constant_value(stride[0]) pad = tuple(tensor.get_scalar_constant_value(pad[i]) for i in range(ndim))
stride_h = tensor.get_scalar_constant_value(stride[1])
pad_w = tensor.get_scalar_constant_value(pad[0])
pad_h = tensor.get_scalar_constant_value(pad[1])
except tensor.NotScalarConstantError: except tensor.NotScalarConstantError:
msg = ("Pool with tensor variable for the window size, stride or " msg = ("Pool with tensor variable for the window size, stride or "
"padding is only supported in the new GPU backend, so this op " "padding is only supported in the new GPU backend, so this op "
...@@ -1909,65 +1907,96 @@ def _check_constant_args_pool(ws, stride, pad, node): ...@@ -1909,65 +1907,96 @@ def _check_constant_args_pool(ws, stride, pad, node):
elif config.assert_no_cpu_op == "raise": elif config.assert_no_cpu_op == "raise":
raise AssertionError(msg) raise AssertionError(msg)
return None return None
ws = (ws_w, ws_h)
stride = (stride_w, stride_h)
pad = (pad_w, pad_h)
return ws, stride, pad return ws, stride, pad
@register_opt() @register_opt()
@local_optimizer([pool.Pool]) @local_optimizer([pool.Pool])
def local_gpu_downsample_factor_max(node): def local_gpu_downsample_factor_max(node):
if isinstance(node.op, pool.Pool): if (isinstance(node.op, pool.Pool)):
assert node.op.__props__ == ('ignore_border', 'mode') assert node.op.__props__ == ('ndim', 'ignore_border', 'mode')
x, ws, stride, pad = node.inputs x, ws, stride, pad = node.inputs
ret = _check_constant_args_pool(ws, stride, pad, node) nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
ret = _check_constant_args_pool(nd, ws, stride, pad, node)
if ret is None: if ret is None:
return return
ws, stride, pad = ret ws, stride, pad = ret
if (pad) != (0, 0) or node.op.mode != 'max' or stride != ws: if (nd != 2 or
max(node.op.padding) != 0 or
node.op.mode != 'max' or
stride != ws):
return return
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_ds = GpuDownsampleFactorMax(ws, node.op.ignore_border) gpu_ws = GpuDownsampleFactorMax(ws, node.op.ignore_border)
return [host_from_gpu(gpu_ds(x.owner.inputs[0]))] if node.inputs[0].ndim == 4:
return [host_from_gpu(gpu_ws(x.owner.inputs[0]))]
else:
input_4D = pad_dims(x.owner.inputs[0], 2, 2)
output_4D = gpu_ws(input_4D)
output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
return [host_from_gpu(output)]
@register_opt() @register_opt()
@local_optimizer([pool.MaxPoolGrad]) @local_optimizer([pool.MaxPoolGrad])
def local_gpu_downsample_factor_max_grad(node): def local_gpu_downsample_factor_max_grad(node):
if isinstance(node.op, pool.MaxPoolGrad): if (isinstance(node.op, pool.MaxPoolGrad)):
assert node.op.__props__ == ('ignore_border', 'mode') assert node.op.__props__ == ('ndim', 'ignore_border', 'mode')
x, z, gz, ws, stride, pad = node.inputs x, z, gz, ws, stride, pad = node.inputs
ret = _check_constant_args_pool(ws, stride, pad, node) nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
ret = _check_constant_args_pool(nd, ws, stride, pad, node)
if ret is None: if ret is None:
return return
ws, stride, pad = ret ws, stride, pad = ret
if pad != (0, 0) or node.op.mode != 'max' or stride != ws: if (nd != 2 or
max(node.op.padding) != 0 or
node.op.mode != 'max' or
stride != ws):
return return
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_ds_grad = GpuDownsampleFactorMaxGrad(ws, node.op.ignore_border) gpu_ws_grad = GpuDownsampleFactorMaxGrad(ws, node.op.ignore_border)
return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0], if node.inputs[0].ndim == 4:
return [host_from_gpu(gpu_ws_grad(x.owner.inputs[0],
as_cuda_ndarray_variable(z), as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(gz)))] as_cuda_ndarray_variable(gz)))]
else:
x_4D = pad_dims(x.owner.inputs[0], 2, 2)
z_4D = pad_dims(as_cuda_ndarray_variable(z), 2, 2)
gz_4D = pad_dims(as_cuda_ndarray_variable(gz), 2, 2)
output_4D = gpu_ws_grad(x_4D, z_4D, gz_4D)
output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
return [host_from_gpu(output)]
@register_opt() @register_opt()
@local_optimizer([pool.DownsampleFactorMaxGradGrad]) @local_optimizer([pool.DownsampleFactorMaxGradGrad])
def local_gpu_downsample_factor_max_grad_grad(node): def local_gpu_downsample_factor_max_grad_grad(node):
if isinstance(node.op, pool.DownsampleFactorMaxGradGrad): if isinstance(node.op, pool.DownsampleFactorMaxGradGrad):
assert node.op.__props__ == ('ignore_border', 'mode') assert node.op.__props__ == ('ndim', 'ignore_border', 'mode')
x, z, gx, ws, stride, pad = node.inputs x, z, gx, ws, stride, pad = node.inputs
ret = _check_constant_args_pool(ws, stride, pad, node) nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
ret = _check_constant_args_pool(nd, ws, stride, pad, node)
if ret is None: if ret is None:
return return
ws, stride, pad = ret ws, stride, pad = ret
if pad != (0, 0) or node.op.mode != 'max' or stride != ws: if (nd != 2 or
max(node.op.padding) != 0 or
node.op.mode != 'max' or
stride != ws):
return return
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
op = GpuDownsampleFactorMaxGradGrad(ws, node.op.ignore_border) op = GpuDownsampleFactorMaxGradGrad(ws, node.op.ignore_border)
if node.inputs[0].ndim == 4:
return [host_from_gpu(op(x.owner.inputs[0], return [host_from_gpu(op(x.owner.inputs[0],
as_cuda_ndarray_variable(z), as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(gx)))] as_cuda_ndarray_variable(gx)))]
else:
x_4D = pad_dims(x.owner.inputs[0], 2, 2)
z_4D = pad_dims(as_cuda_ndarray_variable(z), 2, 2)
gx_4D = pad_dims(as_cuda_ndarray_variable(gx), 2, 2)
output_4D = op(x_4D, z_4D, gx_4D)
output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
return [host_from_gpu(output)]
@register_opt() @register_opt()
......
...@@ -3,13 +3,13 @@ from functools import wraps ...@@ -3,13 +3,13 @@ from functools import wraps
import numpy import numpy
from theano import scalar as scal, Constant from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value, from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError) NotScalarConstantError)
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise) GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise, GpuReshape)
_one = scal.constant(numpy.asarray(1.0, dtype='float32')) _one = scal.constant(numpy.asarray(1.0, dtype='float32'))
...@@ -126,3 +126,48 @@ def output_merge(cls, alpha_in, beta_in, out_in): ...@@ -126,3 +126,48 @@ def output_merge(cls, alpha_in, beta_in, out_in):
return maker(targ, *inputs) return maker(targ, *inputs)
return opt return opt
return wrapper return wrapper
def pad_dims(input, leftdims, rightdims):
"""Reshapes the input to a (leftdims + rightdims) tensor
This helper function is used to convert pooling inputs with arbitrary
non-pooling dimensions to the correct number of dimensions for the
GPU pooling ops.
This reduces or expands the number of dimensions of the input to
exactly `leftdims`, by adding extra dimensions on the left or by
combining some existing dimensions on the left of the input.
"""
assert input.ndim >= rightdims
if input.ndim == (leftdims + rightdims):
return input
# extract image dimensions
img_shape = input.shape[-rightdims:]
# count the number of "leading" dimensions, store as dmatrix
batch_size = tensor.prod(input.shape[:-rightdims])
batch_size = tensor.shape_padright(batch_size, 1)
# store in the required shape, for example as a 4D tensor
# with shape: (batch_size,1,height,width)
new_shape = tensor.cast(tensor.join(0, batch_size,
tensor.as_tensor([1] * (leftdims - 1)),
img_shape), 'int64')
input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
return input_ND
def unpad_dims(output, input, leftdims, rightdims):
"""Reshapes the output after pad_dims.
This reverts the padding by `pad_dims`.
"""
if output.ndim == input.ndim:
return output
# restore the output to the original shape
outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
return GpuReshape(input.ndim)(output, outshp)
...@@ -326,7 +326,9 @@ if 0: ...@@ -326,7 +326,9 @@ if 0:
def test_downsample(): def test_downsample():
shps = [(1, 1, 1, 12), shps = [(1, 12),
(1, 1, 12),
(1, 1, 1, 12),
(1, 1, 2, 2), (1, 1, 2, 2),
(1, 1, 1, 1), (1, 1, 1, 1),
(1, 1, 4, 4), (1, 1, 4, 4),
...@@ -359,17 +361,17 @@ def test_downsample(): ...@@ -359,17 +361,17 @@ def test_downsample():
for shp in shps: for shp in shps:
for ds in (2, 2), (3, 2), (1, 1): for ds in (2, 2), (3, 2), (1, 1):
if ds[0] > shp[2]: if ds[0] > shp[-2]:
continue continue
if ds[1] > shp[3]: if ds[1] > shp[-1]:
continue continue
# GpuDownsampleFactorMax doesn't like having more than 512 columns # GpuDownsampleFactorMax doesn't like having more than 512 columns
# in the output tensor. # in the output tensor.
if float(shp[3]) / ds[1] > 512: if float(shp[-1]) / ds[1] > 512:
continue continue
for ignore_border in (True, False): for ignore_border in (True, False):
# print 'test_downsample', shp, ds, ignore_border # print 'test_downsample', shp, ds, ignore_border
ds_op = Pool(ignore_border=ignore_border) ds_op = Pool(ndim=len(ds), ignore_border=ignore_border)
a = tcn.shared_constructor(my_rand(*shp), 'a') a = tcn.shared_constructor(my_rand(*shp), 'a')
f = pfunc([], ds_op(tensor.as_tensor_variable(a), ds), f = pfunc([], ds_op(tensor.as_tensor_variable(a), ds),
......
...@@ -15,8 +15,8 @@ import theano ...@@ -15,8 +15,8 @@ import theano
import theano.tensor as T import theano.tensor as T
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.sandbox.neighbours import images2neibs from theano.sandbox.neighbours import images2neibs
from theano.tensor.signal.pool import pool_2d from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import MaxPoolGrad, AveragePoolGrad from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
import theano.sandbox.cuda.dnn as dnn import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared from theano.sandbox.cuda import float32_shared_constructor as shared
...@@ -170,7 +170,7 @@ def test_dnn_conv_inplace(): ...@@ -170,7 +170,7 @@ def test_dnn_conv_inplace():
def pool3d2d(input, ds=(2, 2, 2), strides=None, pad=(0, 0, 0), def pool3d2d(input, ds=(2, 2, 2), strides=None, pad=(0, 0, 0),
pool_func=T.max, mode='ignore_borders'): pool_function=T.max, mode='ignore_borders'):
if strides is None: if strides is None:
strides = ds strides = ds
...@@ -179,13 +179,13 @@ def pool3d2d(input, ds=(2, 2, 2), strides=None, pad=(0, 0, 0), ...@@ -179,13 +179,13 @@ def pool3d2d(input, ds=(2, 2, 2), strides=None, pad=(0, 0, 0),
# reshape to B, C*0, 1, 2 and do the pooling on 1, 2 # reshape to B, C*0, 1, 2 and do the pooling on 1, 2
first = input.reshape((shape[0], shape[1] * shape[2], shape[3], shape[4])) first = input.reshape((shape[0], shape[1] * shape[2], shape[3], shape[4]))
pooled1 = pool_2d_i2n(first, ds=ds[1:], strides=strides[1:], pad=pad[1:], pooled1 = pool_2d_i2n(first, ds=ds[1:], strides=strides[1:], pad=pad[1:],
pool_function=pool_func, mode=mode) pool_function=pool_function, mode=mode)
shp1 = pooled1.shape shp1 = pooled1.shape
# reshape to B, C, 0, 1*2 and do the pooling on 0 # reshape to B, C, 0, 1*2 and do the pooling on 0
second = pooled1.reshape((shape[0], shape[1], shape[2], shp1[2] * shp1[3])) second = pooled1.reshape((shape[0], shape[1], shape[2], shp1[2] * shp1[3]))
pooled2 = pool_2d_i2n(second, ds=(ds[0], 1), strides=(strides[0], 1), pooled2 = pool_2d_i2n(second, ds=(ds[0], 1), strides=(strides[0], 1),
pad=(pad[0], 0), pool_function=pool_func, mode=mode) pad=(pad[0], 0), pool_function=pool_function, mode=mode)
shp2 = pooled2.shape shp2 = pooled2.shape
return pooled2.reshape((shape[0], shape[1], shp2[2], shp1[2], shp1[3])) return pooled2.reshape((shape[0], shape[1], shp2[2], shp1[2], shp1[3]))
...@@ -241,8 +241,6 @@ def test_pooling(): ...@@ -241,8 +241,6 @@ def test_pooling():
func = T.max func = T.max
else: else:
func = T.mean func = T.mean
if pad != (0, 0) and cuda.dnn.version() == -1:
continue
if pad != (0, 0) and func is T.mean: if pad != (0, 0) and func is T.mean:
continue continue
...@@ -418,6 +416,7 @@ def test_pooling3d(): ...@@ -418,6 +416,7 @@ def test_pooling3d():
if not cuda.dnn.dnn_available() or cuda.dnn.version() < (3000, 3000): if not cuda.dnn.dnn_available() or cuda.dnn.version() < (3000, 3000):
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
# We force the FAST_RUN as we don't want the reference to run in DebugMode.
mode_without_gpu_ref = theano.compile.mode.get_mode( mode_without_gpu_ref = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpu') 'FAST_RUN').excluding('gpu')
...@@ -427,8 +426,7 @@ def test_pooling3d(): ...@@ -427,8 +426,7 @@ def test_pooling3d():
else: else:
modes = ('max', 'average_inc_pad', 'average_exc_pad') modes = ('max', 'average_inc_pad', 'average_exc_pad')
x = T.TensorType(broadcastable=(False, False, False, False, False), x = T.ftensor5()
dtype='float32')()
for mode, pad in product(modes, for mode, pad in product(modes,
((0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), ((0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1),
(2, 3, 2), (3, 2, 2), (2, 2, 3))): (2, 3, 2), (3, 2, 2), (2, 2, 3))):
...@@ -436,8 +434,6 @@ def test_pooling3d(): ...@@ -436,8 +434,6 @@ def test_pooling3d():
func = T.max func = T.max
else: else:
func = T.mean func = T.mean
if pad != (0, 0, 0) and cuda.dnn.version() == -1:
continue
if pad != (0, 0, 0) and func is T.mean: if pad != (0, 0, 0) and func is T.mean:
continue continue
...@@ -449,13 +445,13 @@ def test_pooling3d(): ...@@ -449,13 +445,13 @@ def test_pooling3d():
if pad[0] > stride or pad[1] > stride or pad[2] > stride: if pad[0] > stride or pad[1] > stride or pad[2] > stride:
# Not implemented # Not implemented
continue continue
out1 = cuda.dnn.dnn_pool(x, (ws, ws, ws), out1 = pool_3d(x, (ws, ws, ws),
stride=(stride, stride, stride), st=(stride, stride, stride),
pad=pad, mode=mode) ignore_border=True,
out2 = pool3d2d(x, ds=(ws, ws, ws), padding=pad, mode=mode)
strides=(stride, stride, stride), out2 = pool3d2d(x, ds=(ws, ws, ws), strides=(stride, stride, stride),
pad=pad, pool_func=func) pad=pad,
pool_function=func)
f1 = theano.function([x], out1, mode=mode_with_gpu) f1 = theano.function([x], out1, mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPool) assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes]) for node in f1.maker.fgraph.apply_nodes])
...@@ -510,11 +506,17 @@ def test_pooling3d(): ...@@ -510,11 +506,17 @@ def test_pooling3d():
g_out = fg(data) g_out = fg(data)
# Compare again the CPU result # Compare again the CPU result
out = pool3d2d(x, (ws, ws, ws), out = pool_3d(x, (ws, ws, ws),
strides=(stride, stride, stride), padding=pad,
pad=pad, pool_func=func) ignore_border=True, mode=mode)
fc = theano.function([x], theano.grad(out.sum(), x), fc = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu_ref) mode=mode_without_gpu_ref)
if mode == 'max':
assert any([isinstance(node.op, MaxPoolGrad)
for node in fc.maker.fgraph.toposort()])
else:
assert any([isinstance(node.op, AveragePoolGrad)
for node in fc.maker.fgraph.toposort()])
c_out = fc(data) c_out = fc(data)
utt.assert_allclose(c_out, g_out) utt.assert_allclose(c_out, g_out)
...@@ -523,6 +525,7 @@ def test_pooling_opt(): ...@@ -523,6 +525,7 @@ def test_pooling_opt():
if not cuda.dnn.dnn_available(): if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
# 2D pooling
x = T.fmatrix() x = T.fmatrix()
f = theano.function( f = theano.function(
...@@ -535,6 +538,7 @@ def test_pooling_opt(): ...@@ -535,6 +538,7 @@ def test_pooling_opt():
f(numpy.zeros((10, 10), dtype='float32')) f(numpy.zeros((10, 10), dtype='float32'))
# gradient of 2D pooling
f = theano.function( f = theano.function(
[x], [x],
T.grad(pool_2d(x, ds=(2, 2), mode='average_inc_pad', T.grad(pool_2d(x, ds=(2, 2), mode='average_inc_pad',
...@@ -545,6 +549,7 @@ def test_pooling_opt(): ...@@ -545,6 +549,7 @@ def test_pooling_opt():
for n in f.maker.fgraph.toposort()]) for n in f.maker.fgraph.toposort()])
f(numpy.zeros((10, 10), dtype='float32')) f(numpy.zeros((10, 10), dtype='float32'))
# Test sum pooling # Test sum pooling
f = theano.function( f = theano.function(
[x], [x],
...@@ -557,6 +562,82 @@ def test_pooling_opt(): ...@@ -557,6 +562,82 @@ def test_pooling_opt():
data = numpy.random.rand(10, 10).astype('float32') data = numpy.random.rand(10, 10).astype('float32')
f(data) f(data)
# 3D pooling
x = T.ftensor3()
f = theano.function(
[x],
pool_3d(x, ds=(2, 2, 2), mode='average_inc_pad', ignore_border=True),
mode=mode_with_gpu)
assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
for n in f.maker.fgraph.toposort()])
f(numpy.zeros((10, 10, 10), dtype='float32'))
# gradient of 3D pooling
f = theano.function(
[x],
T.grad(pool_3d(x, ds=(2, 2, 2), mode='average_inc_pad',
ignore_border=True).sum(), x),
mode=mode_with_gpu.including("cudnn"))
assert any([isinstance(n.op, cuda.dnn.GpuDnnPoolGrad)
for n in f.maker.fgraph.toposort()])
f(numpy.zeros((10, 10, 10), dtype='float32'))
def test_pooling_opt_arbitrary_dimensions():
    # Check that pooling over input with an arbitrary number of leading
    # non-pooling dimensions is correctly reshaped to run on the GPU, by
    # comparing the GPU (cuDNN) forward result and gradient against the
    # CPU reference implementation.
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)

    # 'average_exc_pad' is disabled for cuDNN versions < 4004
    if cuda.dnn.version() < (4004, 4004):
        modes = ('max', 'average_inc_pad')
    else:
        modes = ('max', 'average_inc_pad', 'average_exc_pad')

    for n_non_pool_dims in (0, 1, 2, 3):
        for ws in ((2, 2), (3, 3, 3)):
            # input shape: non-pooling dimensions followed by
            # 2 or 3 pooling dimensions
            shp = (2,) * n_non_pool_dims + (5,) * len(ws)
            data = numpy.random.normal(0, 1, shp).astype('float32')
            inp = shared(data)

            for mode in modes:
                out_pool = Pool(ndim=len(ws), mode=mode,
                                ignore_border=True)(inp, ws)
                out_pool_grad = T.grad(T.sum(out_pool), wrt=inp)
                out = [out_pool, out_pool_grad]

                # run on GPU; both the pooling op and its gradient must
                # have been lifted to their cuDNN counterparts
                fg = theano.function([], out, mode=mode_with_gpu)
                assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
                            for node in fg.maker.fgraph.toposort()])
                assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
                            for node in fg.maker.fgraph.toposort()])
                res_gpu = fg()

                # run on CPU; the reference graph must use the CPU ops
                fc = theano.function([], out, mode=mode_without_gpu)
                assert any([isinstance(node.op, Pool)
                            for node in fc.maker.fgraph.toposort()])
                if mode == 'max':
                    assert any([isinstance(node.op, MaxPoolGrad)
                                for node in fc.maker.fgraph.toposort()])
                else:
                    assert any([isinstance(node.op, AveragePoolGrad)
                                for node in fc.maker.fgraph.toposort()])
                # BUG FIX: call the CPU function fc() here; the original
                # called fg() again, so the GPU result was compared
                # against itself and a GPU/CPU mismatch went undetected.
                res_cpu = fc()

                # check GPU output and gradient against the CPU reference
                utt.assert_allclose(res_gpu[0], res_cpu[0])
                utt.assert_allclose(res_gpu[1], res_cpu[1])
class test_DnnSoftMax(test_nnet.test_SoftMax): class test_DnnSoftMax(test_nnet.test_SoftMax):
gpu_op = dnn.GpuDnnSoftmax gpu_op = dnn.GpuDnnSoftmax
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论