提交 ebf8f12a authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #6116 from abergeron/dnn_redux2

Use GpuDnnReduction to replace GpuMaxAndArgmax when possible.
...@@ -12,7 +12,7 @@ import theano ...@@ -12,7 +12,7 @@ import theano
from theano import Op, Apply, tensor, config, Variable from theano import Op, Apply, tensor, config, Variable
from theano.scalar import (as_scalar, constant, Log, get_scalar_type, from theano.scalar import (as_scalar, constant, Log, get_scalar_type,
int32 as int_t, bool as bool_t, uint32 as uint32_t) int32 as int_t, bool as bool_t, uint32 as uint32_t)
from theano.tensor import as_tensor_variable from theano.tensor import as_tensor_variable, Argmax
from theano.gradient import DisconnectedType, grad_not_implemented from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
...@@ -37,6 +37,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name, ...@@ -37,6 +37,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, GpuAllocEmpty, gpu_contiguous, GpuAllocEmpty,
empty_like, GpuArrayType, HostFromGpu) empty_like, GpuArrayType, HostFromGpu)
from .elemwise import GpuElemwise, GpuCAReduceCuda from .elemwise import GpuElemwise, GpuCAReduceCuda
from .reduction import GpuMaxAndArgmax
# These don't exist in gpuarray # These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
...@@ -1592,8 +1593,9 @@ class GpuDnnReduction(DnnBase): ...@@ -1592,8 +1593,9 @@ class GpuDnnReduction(DnnBase):
self.c_axis = self._convert_axis(axis) self.c_axis = self._convert_axis(axis)
# axis is a list of axes to reduce on # axis is a list of axes to reduce on
self.axis = axis self.axis = axis
if return_indices and (red_op != 'max' and red_op != 'min'): if return_indices and (red_op != 'maximum' and red_op != 'minimum'):
raise ValueError("Can't request indices for something other than min or max") raise ValueError("Can't request indices for something other than"
" minimum or maximum")
self.return_indices = return_indices self.return_indices = return_indices
def _convert_axis(self, axis): def _convert_axis(self, axis):
...@@ -1897,7 +1899,7 @@ class GpuDnnDropoutOp(DnnBase): ...@@ -1897,7 +1899,7 @@ class GpuDnnDropoutOp(DnnBase):
return Apply(self, [inp, descriptor, state], return Apply(self, [inp, descriptor, state],
[inp.type(), state.type(), gpudata_type()]) [inp.type(), state.type(), gpudata_type()])
def prepare_node(self, node, storage_map, compute_map): def prepare_node(self, node, storage_map, compute_map, impl):
assert self.inplace, "GpuDnnDropoutOp not inplace" assert self.inplace, "GpuDnnDropoutOp not inplace"
...@@ -3123,6 +3125,66 @@ def local_dnn_reduction(node): ...@@ -3123,6 +3125,66 @@ def local_dnn_reduction(node):
False)(node.inputs[0]),) False)(node.inputs[0]),)
@register_opt('cudnn')
@local_optimizer([GpuMaxAndArgmax])
def local_cudnn_maxandargmax(node):
    """Rewrite a GpuMaxAndArgmax node as a cuDNN reduction.

    Returns a (max, argmax) replacement pair, or None when cuDNN
    cannot handle this particular node.
    """
    if not isinstance(node.op, GpuMaxAndArgmax):
        return
    # cuDNN must be usable in this context, and recent enough (R6+)
    # for reductions that also return indices.
    if not dnn_available(node.inputs[0].type.context_name):
        return
    if version(raises=False) < 6000:
        return
    # cuDNN tensor descriptors support at most 8 dimensions.
    if node.inputs[0].ndim > 8:
        return
    if node.inputs[0].dtype != node.outputs[0].dtype:
        return
    if node.inputs[0].dtype not in ['float16', 'float32', 'float64']:
        return
    # The order of the axes influences the output indices, so only
    # bail out unless the reduction axes are already sorted.
    if (node.op.axis is not None and
            tuple(sorted(node.op.axis)) != node.op.axis):
        return
    out_dtype = node.outputs[0].dtype
    max_out, idx_out = GpuDnnReduction('maximum', node.op.axis, out_dtype,
                                       out_dtype, True)(node.inputs[0])
    # cudnn can only return int32 indices; cast back to the int64
    # indices that GpuMaxAndArgmax produces.
    return (max_out, as_gpuarray_variable(idx_out.astype('int64'),
                                          node.outputs[1].type.context_name))
@register_opt('cudnn', 'fast_compile')
@op_lifter([Argmax])
@register_opt2([Argmax], 'fast_compile', 'cudnn')
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    """Lift a host Argmax to a cuDNN reduction returning indices."""
    if not dnn_available(ctx_name):
        return
    # Index-returning reductions need cuDNN R6 or later.
    if version(raises=False) < 6000:
        return
    # cuDNN tensor descriptors are limited to 8 dimensions.
    if inputs[0].ndim > 8:
        return
    if inputs[0].dtype not in ['float16', 'float32', 'float64']:
        return
    # order of the axes influences the output indices; only sorted
    # axes are equivalent to what Argmax computes.
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return
    in_dtype = inputs[0].dtype
    _, idx_out = GpuDnnReduction('maximum', op.axis, in_dtype,
                                 in_dtype, True)(*inputs)
    # Only the indices are needed here; cast the int32 indices cuDNN
    # produces to the int64 that Argmax outputs.
    return [as_gpuarray_variable(idx_out.astype('int64'), ctx_name)]
class NoCuDNNRaise(Optimizer): class NoCuDNNRaise(Optimizer):
def apply(self, fgraph): def apply(self, fgraph):
......
...@@ -61,11 +61,6 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input, ...@@ -61,11 +61,6 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
static float fbeta = 0.0f; static float fbeta = 0.0f;
static double dbeta = 0.0; static double dbeta = 0.0;
if (!GpuArray_IS_C_CONTIGUOUS(&input->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
return 1;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) != 0) if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) != 0)
return 1; return 1;
...@@ -83,7 +78,7 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input, ...@@ -83,7 +78,7 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
if (indices != NULL) { if (indices != NULL) {
if (theano_prep_output(indices, p, dims, GA_UINT, GA_C_ORDER, c) != 0) if (theano_prep_output(indices, p, dims, GA_UINT, GA_C_ORDER, c) != 0)
return 1; return 1;
indsize = PyGpuArray_SIZE(*indices); indsize = PyGpuArray_SIZE(*indices) * 4;
} }
if (p == input->ga.nd || rsz == 1) { if (p == input->ga.nd || rsz == 1) {
......
...@@ -37,8 +37,8 @@ class GpuMaxAndArgmax(Op): ...@@ -37,8 +37,8 @@ class GpuMaxAndArgmax(Op):
broadcastable = [b for i, b in enumerate(X.type.broadcastable) broadcastable = [b for i, b in enumerate(X.type.broadcastable)
if i not in all_axes] if i not in all_axes]
inputs = [as_gpuarray_variable(X, context_name)] inputs = [as_gpuarray_variable(X, context_name)]
outputs = [GpuArrayType(X.type.dtype, broadcastable, context_name=context_name, name='max')(), outputs = [GpuArrayType(X.type.dtype, broadcastable, context_name=context_name)(),
GpuArrayType(self.argmax_dtype, broadcastable, context_name=context_name, name='argmax')()] GpuArrayType(self.argmax_dtype, broadcastable, context_name=context_name)()]
return Apply(self, inputs, outputs) return Apply(self, inputs, outputs)
def c_headers(self): def c_headers(self):
......
...@@ -18,7 +18,7 @@ from theano.tensor.nnet import bn ...@@ -18,7 +18,7 @@ from theano.tensor.nnet import bn
from .. import dnn from .. import dnn
from ..basic_ops import GpuAllocEmpty from ..basic_ops import GpuAllocEmpty
from ..type import gpuarray_shared_constructor from ..type import gpuarray_shared_constructor, GpuArrayType
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, ref_cast from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, ref_cast
from . import test_nnet from . import test_nnet
...@@ -26,6 +26,11 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer ...@@ -26,6 +26,11 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD
# pygpu is treated as optional at import time; a failed import is
# silently tolerated here.
# NOTE(review): presumably the tests that actually use pygpu are
# guarded by GPU-availability checks elsewhere -- confirm.
try:
    import pygpu
except ImportError:
    pass
mode_with_gpu = mode_with_gpu.including() mode_with_gpu = mode_with_gpu.including()
# Globally disabled for mode_without_gpu # Globally disabled for mode_without_gpu
mode_with_gpu.check_py_code = False mode_with_gpu.check_py_code = False
...@@ -1506,6 +1511,55 @@ def test_dnn_reduction_opt(): ...@@ -1506,6 +1511,55 @@ def test_dnn_reduction_opt():
yield dnn_reduction, 2, idtype, adtype, odtype yield dnn_reduction, 2, idtype, adtype, odtype
def dnn_reduction_strides(shp, shuffle, slice):
    """Check GpuDnnReduction on a non-contiguous (strided) input.

    Builds a float32 sum over an input that was dimshuffled and
    sliced, asserts the cuDNN reduction is in the compiled graph,
    and compares the result against a numpy sum.
    """
    utt.fetch_seed()
    gpu_input = GpuArrayType('float32', (False,) * len(shp),
                             context_name=test_ctx_name)()
    strided = gpu_input.dimshuffle(shuffle)[slice]
    total = strided.sum(acc_dtype='float32', dtype='float32')
    fn = theano.function([gpu_input], total, mode=mode_with_gpu)
    assert any(isinstance(apply_node.op, dnn.GpuDnnReduction)
               for apply_node in fn.maker.fgraph.apply_nodes)
    host_data = np.random.random(shp).astype('float32')
    expected = np.sum(host_data)
    device_data = pygpu.array(host_data, context=gpu_input.type.context)
    actual = fn(device_data)
    utt.assert_allclose(expected, np.array(actual))
def test_dnn_reduction_strides():
    # Exercise a transposed (dimshuffled) input and a negatively
    # strided (reversed) input.
    cases = [((2, 3, 2), (1, 0, 2), slice(None, None, None)),
             ((2, 3, 2), (0, 1, 2), slice(None, None, -1))]
    for shp, shuffle, slc in cases:
        yield dnn_reduction_strides, shp, shuffle, slc
def dnn_maxargmax(nd, idtype, axis):
    """Compile max_and_argmax and assert it maps to GpuDnnReduction."""
    x = T.TensorType(idtype, (False,) * nd)()
    outputs = T.max_and_argmax(x, axis=axis)
    fn = theano.function([x], outputs, mode=mode_with_gpu)
    assert any(isinstance(apply_node.op, dnn.GpuDnnReduction)
               for apply_node in fn.maker.fgraph.apply_nodes)
def test_dnn_maxandargmax_opt():
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    # Every supported rank (cuDNN handles up to 8d) with float32.
    for nd in range(1, 9):
        yield dnn_maxargmax, nd, 'float32', None
    # The other supported floating-point dtypes.
    for idtype in ('float64', 'float16'):
        yield dnn_maxargmax, 2, idtype, None
    # Sorted axis combinations on a 3d input, including empty axes.
    for axis in [(0, 1), (0, 2), (1, 2), (0, 1, 2),
                 (0,), (1,), (2,), ()]:
        yield dnn_maxargmax, 3, 'float32', axis
def test_dnn_batchnorm_train(): def test_dnn_batchnorm_train():
if not dnn.dnn_available(test_ctx_name): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
......
...@@ -10,6 +10,8 @@ from theano.tests.unittest_tools import SkipTest ...@@ -10,6 +10,8 @@ from theano.tests.unittest_tools import SkipTest
from .config import mode_with_gpu, mode_without_gpu from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand_gpuarray from .test_basic_ops import rand_gpuarray
from .. import GpuArrayType from .. import GpuArrayType
from ..reduction import GpuMaxAndArgmax
from ..dnn import GpuDnnReduction
import math import math
...@@ -53,14 +55,14 @@ def numpy_maxandargmax(X, axis=None): ...@@ -53,14 +55,14 @@ def numpy_maxandargmax(X, axis=None):
return (ref_max, np.argmax(reshaped_x, axis=-1)) return (ref_max, np.argmax(reshaped_x, axis=-1))
def check_if_gpu_maxandargmax_in_graph(theano_function): def check_if_gpu_reduce_in_graph(theano_function):
assert len([node for node in theano_function.maker.fgraph.apply_nodes assert any(isinstance(node.op, (GpuMaxAndArgmax, GpuDnnReduction))
if isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)]) > 0 for node in theano_function.maker.fgraph.apply_nodes)
def check_if_gpu_maxandargmax_not_in_graph(theano_function): def check_if_gpu_reduce_not_in_graph(theano_function):
assert len([node for node in theano_function.maker.fgraph.apply_nodes assert all(not isinstance(node.op, (GpuMaxAndArgmax, GpuDnnReduction))
if isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)]) == 0 for node in theano_function.maker.fgraph.apply_nodes)
class BaseTest: class BaseTest:
...@@ -105,7 +107,7 @@ class BaseTest: ...@@ -105,7 +107,7 @@ class BaseTest:
M = self.get_host_tensor() M = self.get_host_tensor()
f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)], f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)],
name='shape:' + str(test_tensor.shape) + '/axis:' + str(axis) + '/HOST', mode=mode_without_gpu) name='shape:' + str(test_tensor.shape) + '/axis:' + str(axis) + '/HOST', mode=mode_without_gpu)
check_if_gpu_maxandargmax_not_in_graph(f) check_if_gpu_reduce_not_in_graph(f)
f(test_tensor) f(test_tensor)
theano_max, theano_argmax = f(test_tensor) theano_max, theano_argmax = f(test_tensor)
ref_max, ref_argmax = numpy_maxandargmax(test_tensor, axis=axis) ref_max, ref_argmax = numpy_maxandargmax(test_tensor, axis=axis)
...@@ -116,7 +118,7 @@ class BaseTest: ...@@ -116,7 +118,7 @@ class BaseTest:
M = self.get_gpu_tensor() M = self.get_gpu_tensor()
f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)], f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)],
name='shape:' + str(test_gpu_tensor.shape) + '/axis:' + str(axis) + '/GPU', mode=mode_with_gpu) name='shape:' + str(test_gpu_tensor.shape) + '/axis:' + str(axis) + '/GPU', mode=mode_with_gpu)
check_if_gpu_maxandargmax_in_graph(f) check_if_gpu_reduce_in_graph(f)
f(test_gpu_tensor) f(test_gpu_tensor)
theano_max, theano_argmax = f(test_gpu_tensor) theano_max, theano_argmax = f(test_gpu_tensor)
ref_max, ref_argmax = numpy_maxandargmax(test_host_tensor, axis=axis) ref_max, ref_argmax = numpy_maxandargmax(test_host_tensor, axis=axis)
......
...@@ -14,7 +14,7 @@ import theano ...@@ -14,7 +14,7 @@ import theano
from theano.compat import izip from theano.compat import izip
from theano.configparser import config from theano.configparser import config
from theano import gof from theano import gof
from theano.gof import Apply, Constant, Op, Variable from theano.gof import Apply, Constant, Op, Variable, ParamsType
from theano.gof.type import Generic from theano.gof.type import Generic
from theano.tensor import elemwise from theano.tensor import elemwise
...@@ -1429,21 +1429,31 @@ class Argmax(Op): ...@@ -1429,21 +1429,31 @@ class Argmax(Op):
nin = 2 # tensor, axis nin = 2 # tensor, axis
nout = 1 nout = 1
E_axis = 'invalid axis' E_axis = 'invalid axis'
__props__ = () __props__ = ('axis',)
_f16_ok = True _f16_ok = True
params_type = ParamsType(c_axis=scal.int64)
def __init__(self, axis):
    """
    Parameters
    ----------
    axis : iterable of ints or None
        Axes to compute the argmax over; None means all axes.
    """
    # Normalize to a hashable tuple so that __props__-based equality
    # and hashing of the Op work.
    if axis is not None:
        axis = tuple(axis)
    # Keep None as-is: the rest of this Op (make_node, get_params,
    # infer_shape, c_code) tests `self.axis is None` for the
    # reduce-over-all-axes case.  The original `tuple(axis)` here
    # raised TypeError for axis=None.
    self.axis = axis
def get_params(self, node):
    """Build the params object carrying the single C-level axis."""
    has_single_axis = self.axis is not None and len(self.axis) == 1
    if has_single_axis:
        c_axis = np.int64(self.axis[0])
    else:
        # Placeholder value: the C code never reads c_axis outside
        # the single-axis case, so -1 is never actually used.
        c_axis = np.int64(-1)
    return self.params_type.get_params(c_axis=c_axis)
def make_node(self, x, axis=None): def make_node(self, x, axis=None):
x = _as_tensor_variable(x) x = _as_tensor_variable(x)
# Check axis and convert it to a Python list of integers. if self.axis is None:
axis = check_and_normalize_axes(x, axis)
if len(axis) == 0:
axis = NoneConst.clone()
all_axes = list(range(x.ndim)) all_axes = list(range(x.ndim))
else: else:
all_axes = axis all_axes = self.axis
axis = _as_tensor_variable(axis) inputs = [x]
assert axis.ndim == 1
inputs = [x, axis]
# We keep the original broadcastable flags for dimensions on which # We keep the original broadcastable flags for dimensions on which
# we do not perform the argmax. # we do not perform the argmax.
...@@ -1452,13 +1462,16 @@ class Argmax(Op): ...@@ -1452,13 +1462,16 @@ class Argmax(Op):
outputs = [tensor('int64', broadcastable, name='argmax')] outputs = [tensor('int64', broadcastable, name='argmax')]
return Apply(self, inputs, outputs) return Apply(self, inputs, outputs)
def perform(self, node, inp, outs): def prepare_node(self, node, storage_map, compute_map, impl):
x, axes = inp if len(node.inputs) == 2:
raise ValueError('You are trying to compile a graph with an old Argmax node. Either reoptimize your graph or rebuild it to get the new node format.')
def perform(self, node, inp, outs, params):
x, = inp
axes = self.axis
max_idx, = outs max_idx, = outs
if axes is None: if axes is None:
axes = tuple(range(x.ndim)) axes = tuple(range(x.ndim))
else:
axes = tuple(int(ax) for ax in axes)
# Numpy does not support multiple axes for argmax # Numpy does not support multiple axes for argmax
# Work around # Work around
...@@ -1476,18 +1489,18 @@ class Argmax(Op): ...@@ -1476,18 +1489,18 @@ class Argmax(Op):
dtype='int64') dtype='int64')
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
x, axis = inp x, = inp
argmax, = out argmax, = out
fail = sub["fail"] fail = sub["fail"]
if NoneConst.equals(node.inputs[1]): params = sub["params"]
if self.axis is None:
axis_code = "axis = NPY_MAXDIMS;" axis_code = "axis = NPY_MAXDIMS;"
else: else:
assert node.inputs[1].ndim == 1 if len(self.axis) > 1:
# Fall back to perform() if there are multiple axes
if len(node.inputs[1].data) > 1:
raise NotImplementedError() raise NotImplementedError()
# params is only used here for now
axis_code = """ axis_code = """
axis = ((dtype_%(axis)s*)PyArray_DATA(%(axis)s))[0]; axis = %(params)s->c_axis;
if(axis > PyArray_NDIM(%(x)s)-1 || axis < -PyArray_NDIM(%(x)s)){ if(axis > PyArray_NDIM(%(x)s)-1 || axis < -PyArray_NDIM(%(x)s)){
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"Argmax, bad axis argument"); "Argmax, bad axis argument");
...@@ -1522,28 +1535,20 @@ class Argmax(Op): ...@@ -1522,28 +1535,20 @@ class Argmax(Op):
return ret % locals() return ret % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (1,)
def infer_shape(self, node, shapes): def infer_shape(self, node, shapes):
ishape, axis_shape = shapes ishape, = shapes
axis = node.inputs[1] if self.axis is None:
if axis.data is None:
return [()] return [()]
rval = tuple([ishape[i] for (i, b) in enumerate( rval = tuple([ishape[i] for (i, b) in enumerate(
node.inputs[0].type.broadcastable) if i not in axis.data]) node.inputs[0].type.broadcastable) if i not in self.axis])
return [rval] return [rval]
def grad(self, inp, grads): def grad(self, inp, grads):
x, axis = inp x, = inp
axis_grad = grad_undefined(
self, 1, axis,
"argmax is not defined for non-integer axes so"
" argmax(x, axis+eps) is undefined")
return [x.zeros_like(), axis_grad]
_argmax = Argmax() return [x.zeros_like()]
def makeKeepDims(x, y, axis): def makeKeepDims(x, y, axis):
......
...@@ -1333,9 +1333,9 @@ def test_argmax_pushdown(): ...@@ -1333,9 +1333,9 @@ def test_argmax_pushdown():
# for node in fgraph.toposort(): # for node in fgraph.toposort():
# print node.op # print node.op
assert len(fgraph.toposort()) == 1 assert len(fgraph.toposort()) == 1
assert fgraph.toposort()[0].op == tensor.basic._argmax assert isinstance(fgraph.toposort()[0].op, tensor.basic.Argmax)
assert check_stack_trace( assert check_stack_trace(
fgraph, ops_to_check=tensor.basic._argmax) fgraph, ops_to_check=tensor.basic.Argmax)
x = tensor.matrix() x = tensor.matrix()
# test that the max_and_argmax is not pushed down if the max is used # test that the max_and_argmax is not pushed down if the max is used
out = tensor.max_and_argmax( out = tensor.max_and_argmax(
......
...@@ -60,7 +60,7 @@ def local_max_and_argmax(node): ...@@ -60,7 +60,7 @@ def local_max_and_argmax(node):
return [new, None] return [new, None]
if len(node.outputs[0].clients) == 0: if len(node.outputs[0].clients) == 0:
return [None, T._argmax(node.inputs[0], axis)] return [None, T.Argmax(axis)(node.inputs[0])]
@register_uncanonicalize @register_uncanonicalize
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论