提交 ebf8f12a authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #6116 from abergeron/dnn_redux2

Use GpuDnnReduction to replace GpuMaxAndArgmax when possible.
......@@ -12,7 +12,7 @@ import theano
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import (as_scalar, constant, Log, get_scalar_type,
int32 as int_t, bool as bool_t, uint32 as uint32_t)
from theano.tensor import as_tensor_variable
from theano.tensor import as_tensor_variable, Argmax
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler
......@@ -37,6 +37,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, GpuAllocEmpty,
empty_like, GpuArrayType, HostFromGpu)
from .elemwise import GpuElemwise, GpuCAReduceCuda
from .reduction import GpuMaxAndArgmax
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
......@@ -1592,8 +1593,9 @@ class GpuDnnReduction(DnnBase):
self.c_axis = self._convert_axis(axis)
# axis is a list of axes to reduce on
self.axis = axis
if return_indices and (red_op != 'max' and red_op != 'min'):
raise ValueError("Can't request indices for something other than min or max")
if return_indices and (red_op != 'maximum' and red_op != 'minimum'):
raise ValueError("Can't request indices for something other than"
" minimum or maximum")
self.return_indices = return_indices
def _convert_axis(self, axis):
......@@ -1897,7 +1899,7 @@ class GpuDnnDropoutOp(DnnBase):
return Apply(self, [inp, descriptor, state],
[inp.type(), state.type(), gpudata_type()])
def prepare_node(self, node, storage_map, compute_map):
def prepare_node(self, node, storage_map, compute_map, impl):
assert self.inplace, "GpuDnnDropoutOp not inplace"
......@@ -3123,6 +3125,66 @@ def local_dnn_reduction(node):
False)(node.inputs[0]),)
@register_opt('cudnn')
@local_optimizer([GpuMaxAndArgmax])
def local_cudnn_maxandargmax(node):
    """Replace a GpuMaxAndArgmax node by a single cuDNN 'maximum' reduction.

    Applies only when cuDNN (>= 6000) supports the case: float dtype, at
    most 8 dimensions, no dtype change, and reduction axes in sorted order.
    Returns (max_values, int64_indices) or None to decline.
    """
    if not isinstance(node.op, GpuMaxAndArgmax):
        return
    inp = node.inputs[0]
    out_dtype = node.outputs[0].dtype
    if not dnn_available(inp.type.context_name):
        return
    if version(raises=False) < 6000:
        return
    # cuDNN reductions handle at most 8 dimensions.
    if inp.ndim > 8:
        return
    # The reduction must not change the dtype of the values output.
    if inp.dtype != out_dtype:
        return
    if inp.dtype not in ('float16', 'float32', 'float64'):
        return
    # order of the axes influences the output indices
    if (node.op.axis is not None and
            tuple(sorted(node.op.axis)) != node.op.axis):
        return
    vals, idx = GpuDnnReduction('maximum', node.op.axis, out_dtype,
                                out_dtype, True)(inp)
    # cudnn can only return int32 indices
    return (vals, as_gpuarray_variable(idx.astype('int64'),
                                       node.outputs[1].type.context_name))
@register_opt('cudnn', 'fast_compile')
@op_lifter([Argmax])
@register_opt2([Argmax], 'fast_compile', 'cudnn')
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    """Lower a lifted Argmax to a cuDNN 'maximum' reduction, keeping only
    the indices output (cast from cuDNN's int32 to the expected int64).
    Declines (returns None) when cuDNN cannot handle the case.
    """
    x = inputs[0]
    if not dnn_available(ctx_name):
        return
    if version(raises=False) < 6000:
        return
    # cuDNN reductions support at most 8 dims and only float dtypes.
    if x.ndim > 8:
        return
    if x.dtype not in ('float16', 'float32', 'float64'):
        return
    # order of the axes influences the output indices
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return
    vals, idx = GpuDnnReduction('maximum', op.axis, x.dtype, x.dtype,
                                True)(*inputs)
    return [as_gpuarray_variable(idx.astype('int64'), ctx_name)]
class NoCuDNNRaise(Optimizer):
def apply(self, fgraph):
......
......@@ -61,11 +61,6 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
static float fbeta = 0.0f;
static double dbeta = 0.0;
if (!GpuArray_IS_C_CONTIGUOUS(&input->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
return 1;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) != 0)
return 1;
......@@ -83,7 +78,7 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
if (indices != NULL) {
if (theano_prep_output(indices, p, dims, GA_UINT, GA_C_ORDER, c) != 0)
return 1;
indsize = PyGpuArray_SIZE(*indices);
indsize = PyGpuArray_SIZE(*indices) * 4;
}
if (p == input->ga.nd || rsz == 1) {
......
......@@ -37,8 +37,8 @@ class GpuMaxAndArgmax(Op):
broadcastable = [b for i, b in enumerate(X.type.broadcastable)
if i not in all_axes]
inputs = [as_gpuarray_variable(X, context_name)]
outputs = [GpuArrayType(X.type.dtype, broadcastable, context_name=context_name, name='max')(),
GpuArrayType(self.argmax_dtype, broadcastable, context_name=context_name, name='argmax')()]
outputs = [GpuArrayType(X.type.dtype, broadcastable, context_name=context_name)(),
GpuArrayType(self.argmax_dtype, broadcastable, context_name=context_name)()]
return Apply(self, inputs, outputs)
def c_headers(self):
......
......@@ -18,7 +18,7 @@ from theano.tensor.nnet import bn
from .. import dnn
from ..basic_ops import GpuAllocEmpty
from ..type import gpuarray_shared_constructor
from ..type import gpuarray_shared_constructor, GpuArrayType
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, ref_cast
from . import test_nnet
......@@ -26,6 +26,11 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD
# pygpu may be absent; the ImportError is swallowed here.
# NOTE(review): presumably tests that need pygpu are guarded elsewhere
# (e.g. via dnn_available checks) — confirm before relying on it.
try:
import pygpu
except ImportError:
pass
# NOTE(review): .including() appears to make a local copy of the mode so the
# flag change below does not affect other modules — confirm against
# theano.compile.mode semantics.
mode_with_gpu = mode_with_gpu.including()
# Globally disabled for mode_without_gpu
mode_with_gpu.check_py_code = False
......@@ -1506,6 +1511,55 @@ def test_dnn_reduction_opt():
yield dnn_reduction, 2, idtype, adtype, odtype
def dnn_reduction_strides(shp, shuffle, slice):
    """Check a cuDNN sum over a non-contiguous (dimshuffled/sliced) input.

    Builds a float32 GPU input of shape `shp`, applies `dimshuffle(shuffle)`
    then `slice`, sums it, and verifies both that GpuDnnReduction is used in
    the compiled graph and that the numeric result matches NumPy.
    """
    utt.fetch_seed()
    gpu_var = GpuArrayType('float32', (False,) * len(shp),
                           context_name=test_ctx_name)()
    view = gpu_var.dimshuffle(shuffle)[slice]
    total = view.sum(acc_dtype='float32', dtype='float32')
    fn = theano.function([gpu_var], total, mode=mode_with_gpu)
    assert any(isinstance(n.op, dnn.GpuDnnReduction)
               for n in fn.maker.fgraph.apply_nodes)
    host_data = np.random.random(shp).astype('float32')
    expected = np.sum(host_data)
    dev_data = pygpu.array(host_data, context=gpu_var.type.context)
    utt.assert_allclose(expected, np.array(fn(dev_data)))
def test_dnn_reduction_strides():
    """Yield stride cases: a transposed view and a negatively-strided slice."""
    cases = [
        ((2, 3, 2), (1, 0, 2), slice(None, None, None)),
        ((2, 3, 2), (0, 1, 2), slice(None, None, -1)),
    ]
    for shp, shuffle, sl in cases:
        yield dnn_reduction_strides, shp, shuffle, sl
def dnn_maxargmax(nd, idtype, axis):
    """Compile max_and_argmax over `axis` on an `nd`-dim `idtype` input and
    require that the optimized graph contains a GpuDnnReduction node."""
    x = T.TensorType(idtype, (False,) * nd)()
    fn = theano.function([x], T.max_and_argmax(x, axis=axis),
                         mode=mode_with_gpu)
    assert any(isinstance(n.op, dnn.GpuDnnReduction)
               for n in fn.maker.fgraph.apply_nodes)
def test_dnn_maxandargmax_opt():
    """Yield max_and_argmax lowering cases (needs cuDNN >= 6000)."""
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    # Full reduction for every supported rank.
    for nd in range(1, 9):
        yield dnn_maxargmax, nd, 'float32', None
    # Other supported float dtypes.
    for idtype in ('float64', 'float16'):
        yield dnn_maxargmax, 2, idtype, None
    # Every sorted axis subset of a 3-d input.
    for axis in ((0, 1), (0, 2), (1, 2), (0, 1, 2), (0,), (1,), (2,), ()):
        yield dnn_maxargmax, 3, 'float32', axis
def test_dnn_batchnorm_train():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
......
......@@ -10,6 +10,8 @@ from theano.tests.unittest_tools import SkipTest
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand_gpuarray
from .. import GpuArrayType
from ..reduction import GpuMaxAndArgmax
from ..dnn import GpuDnnReduction
import math
......@@ -53,14 +55,14 @@ def numpy_maxandargmax(X, axis=None):
return (ref_max, np.argmax(reshaped_x, axis=-1))
# NOTE(review): this is the diff's removed side; the page shows its
# replacement, check_if_gpu_reduce_in_graph, immediately after.
def check_if_gpu_maxandargmax_in_graph(theano_function):
# Passes iff at least one GpuMaxAndArgmax node is in the compiled graph.
assert len([node for node in theano_function.maker.fgraph.apply_nodes
if isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)]) > 0
def check_if_gpu_reduce_in_graph(theano_function):
    """Assert the compiled graph contains at least one GPU reduction node
    (either GpuMaxAndArgmax or GpuDnnReduction)."""
    matches = [node for node in theano_function.maker.fgraph.apply_nodes
               if isinstance(node.op, (GpuMaxAndArgmax, GpuDnnReduction))]
    assert matches
# NOTE(review): this is the diff's removed side; the page shows its
# replacement, check_if_gpu_reduce_not_in_graph, immediately after.
def check_if_gpu_maxandargmax_not_in_graph(theano_function):
# Passes iff no GpuMaxAndArgmax node is in the compiled graph.
assert len([node for node in theano_function.maker.fgraph.apply_nodes
if isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)]) == 0
def check_if_gpu_reduce_not_in_graph(theano_function):
    """Assert the compiled graph contains no GPU reduction node
    (neither GpuMaxAndArgmax nor GpuDnnReduction)."""
    for node in theano_function.maker.fgraph.apply_nodes:
        assert not isinstance(node.op, (GpuMaxAndArgmax, GpuDnnReduction))
class BaseTest:
......@@ -105,7 +107,7 @@ class BaseTest:
M = self.get_host_tensor()
f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)],
name='shape:' + str(test_tensor.shape) + '/axis:' + str(axis) + '/HOST', mode=mode_without_gpu)
check_if_gpu_maxandargmax_not_in_graph(f)
check_if_gpu_reduce_not_in_graph(f)
f(test_tensor)
theano_max, theano_argmax = f(test_tensor)
ref_max, ref_argmax = numpy_maxandargmax(test_tensor, axis=axis)
......@@ -116,7 +118,7 @@ class BaseTest:
M = self.get_gpu_tensor()
f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)],
name='shape:' + str(test_gpu_tensor.shape) + '/axis:' + str(axis) + '/GPU', mode=mode_with_gpu)
check_if_gpu_maxandargmax_in_graph(f)
check_if_gpu_reduce_in_graph(f)
f(test_gpu_tensor)
theano_max, theano_argmax = f(test_gpu_tensor)
ref_max, ref_argmax = numpy_maxandargmax(test_host_tensor, axis=axis)
......
......@@ -14,7 +14,7 @@ import theano
from theano.compat import izip
from theano.configparser import config
from theano import gof
from theano.gof import Apply, Constant, Op, Variable
from theano.gof import Apply, Constant, Op, Variable, ParamsType
from theano.gof.type import Generic
from theano.tensor import elemwise
......@@ -1429,21 +1429,31 @@ class Argmax(Op):
nin = 2 # tensor, axis
nout = 1
E_axis = 'invalid axis'
__props__ = ()
__props__ = ('axis',)
_f16_ok = True
params_type = ParamsType(c_axis=scal.int64)
def __init__(self, axis):
    """Store the axes to take the argmax over.

    Parameters
    ----------
    axis : iterable of int, or None
        Axes to reduce over; None means all axes. The rest of the Op
        (make_node, perform, c_code, infer_shape) tests `self.axis is None`
        for the all-axes case, so None must be preserved here.
    """
    if axis is not None:
        axis = tuple(axis)
    # Bug fix: the original assigned `tuple(axis)` unconditionally, which
    # raises TypeError when axis is None. Assign `axis` instead: it is
    # already a tuple when not None, and None otherwise.
    self.axis = axis
def get_params(self, node):
    """Build the C params for this node.

    c_axis carries the single reduction axis when there is exactly one;
    otherwise a dummy value is stored (the C code does not read it then).
    """
    single_axis = self.axis is not None and len(self.axis) == 1
    # The value here doesn't matter, it won't be used
    c_axis = np.int64(self.axis[0]) if single_axis else np.int64(-1)
    return self.params_type.get_params(c_axis=c_axis)
def make_node(self, x, axis=None):
x = _as_tensor_variable(x)
# Check axis and convert it to a Python list of integers.
axis = check_and_normalize_axes(x, axis)
if len(axis) == 0:
axis = NoneConst.clone()
if self.axis is None:
all_axes = list(range(x.ndim))
else:
all_axes = axis
axis = _as_tensor_variable(axis)
assert axis.ndim == 1
inputs = [x, axis]
all_axes = self.axis
inputs = [x]
# We keep the original broadcastable flags for dimensions on which
# we do not perform the argmax.
......@@ -1452,13 +1462,16 @@ class Argmax(Op):
outputs = [tensor('int64', broadcastable, name='argmax')]
return Apply(self, inputs, outputs)
def perform(self, node, inp, outs):
x, axes = inp
def prepare_node(self, node, storage_map, compute_map, impl):
    """Refuse to compile apply nodes built by the old two-input
    (tensor, axis) Argmax format; the new format has a single input."""
    is_old_format = len(node.inputs) == 2
    if is_old_format:
        raise ValueError('You are trying to compile a graph with an old Argmax node. Either reoptimize your graph or rebuild it to get the new node format.')
def perform(self, node, inp, outs, params):
x, = inp
axes = self.axis
max_idx, = outs
if axes is None:
axes = tuple(range(x.ndim))
else:
axes = tuple(int(ax) for ax in axes)
# Numpy does not support multiple axes for argmax
# Work around
......@@ -1476,18 +1489,18 @@ class Argmax(Op):
dtype='int64')
def c_code(self, node, name, inp, out, sub):
x, axis = inp
x, = inp
argmax, = out
fail = sub["fail"]
if NoneConst.equals(node.inputs[1]):
params = sub["params"]
if self.axis is None:
axis_code = "axis = NPY_MAXDIMS;"
else:
assert node.inputs[1].ndim == 1
# Fall back to perform() if there are multiple axes
if len(node.inputs[1].data) > 1:
if len(self.axis) > 1:
raise NotImplementedError()
# params is only used here for now
axis_code = """
axis = ((dtype_%(axis)s*)PyArray_DATA(%(axis)s))[0];
axis = %(params)s->c_axis;
if(axis > PyArray_NDIM(%(x)s)-1 || axis < -PyArray_NDIM(%(x)s)){
PyErr_SetString(PyExc_ValueError,
"Argmax, bad axis argument");
......@@ -1522,28 +1535,20 @@ class Argmax(Op):
return ret % locals()
# NOTE(review): removed and added diff lines are interleaved below; the
# committed method presumably keeps only one return — the bump from (0,) to
# (1,) invalidates previously cached compiled C code after the c_code change.
def c_code_cache_version(self):
return (0,)
return (1,)
# NOTE(review): removed and added diff lines are interleaved below — the old
# version read the axis from node.inputs[1] (a second input), the new one
# uses self.axis with a single input. Do not read this span as sequential code.
def infer_shape(self, node, shapes):
ishape, axis_shape = shapes
axis = node.inputs[1]
if axis.data is None:
ishape, = shapes
if self.axis is None:
return [()]
rval = tuple([ishape[i] for (i, b) in enumerate(
node.inputs[0].type.broadcastable) if i not in axis.data])
node.inputs[0].type.broadcastable) if i not in self.axis])
return [rval]
# NOTE(review): removed and added diff lines are interleaved below — the old
# grad returned an undefined gradient for the (removed) axis input; the new
# single-input form returns only x.zeros_like(). The stray `_argmax = Argmax()`
# appears to be a removed module-level line caught in this hunk.
def grad(self, inp, grads):
x, axis = inp
axis_grad = grad_undefined(
self, 1, axis,
"argmax is not defined for non-integer axes so"
" argmax(x, axis+eps) is undefined")
return [x.zeros_like(), axis_grad]
x, = inp
_argmax = Argmax()
return [x.zeros_like()]
def makeKeepDims(x, y, axis):
......
......@@ -1333,9 +1333,9 @@ def test_argmax_pushdown():
# for node in fgraph.toposort():
# print node.op
assert len(fgraph.toposort()) == 1
assert fgraph.toposort()[0].op == tensor.basic._argmax
assert isinstance(fgraph.toposort()[0].op, tensor.basic.Argmax)
assert check_stack_trace(
fgraph, ops_to_check=tensor.basic._argmax)
fgraph, ops_to_check=tensor.basic.Argmax)
x = tensor.matrix()
# test that the max_and_argmax is not pushed down if the max is used
out = tensor.max_and_argmax(
......
......@@ -60,7 +60,7 @@ def local_max_and_argmax(node):
return [new, None]
if len(node.outputs[0].clients) == 0:
return [None, T._argmax(node.inputs[0], axis)]
return [None, T.Argmax(axis)(node.inputs[0])]
@register_uncanonicalize
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论