提交 3347480a authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #5168 from notoraptor/master

This is my proposal for GpuMaxAndArgmax (issue #1399).
......@@ -28,7 +28,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined, _get_props)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial
from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction
def transfer(x, target):
try:
......
......@@ -65,6 +65,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -1775,6 +1776,14 @@ def _scan_type_infer(node):
context_name=context_name)
return typebuild
# Add optimization : maxandargmax (CPU -> GPU)
@register_opt('fast_compile')
@op_lifter([tensor.MaxAndArgmax])
@register_opt2([tensor.MaxAndArgmax], 'fast_compile')
def local_gpu_maxandargmax(op, context_name, inputs, outputs):
    # Lift the CPU MaxAndArgmax op to its GPU counterpart.  The reduction
    # axes are forwarded through get_params(), which ignores its node
    # argument and simply returns the op's axis tuple.
    return GpuMaxAndArgmax(op.get_params(None))
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
......
from __future__ import print_function, absolute_import, division
import os
import theano
from theano.gof import Op, Apply
from theano.gof.type import Generic
from .basic_ops import (infer_context_name, as_gpuarray_variable)
from .type import GpuArrayType
try:
import pygpu
except ImportError as e:
pass
class GpuMaxAndArgmax(Op):
    """
    GPU version of MaxAndArgmax.

    Computes, in one GPU pass, both the maximum of the input over the given
    axes and the index of that maximum, by delegating to libgpuarray's
    ``GpuArray_maxandargmax``.  The argmax output is an index into the
    flattened reduced dimensions, matching the CPU MaxAndArgmax op.
    """
    params_type = Generic()
    __props__ = ('axis',)
    # dtype of the argmax output (same as the CPU MaxAndArgmax op).
    argmax_dtype = "int64"

    def __init__(self, axis):
        # `axis` must be an explicit sequence of ints; normalize it to a
        # tuple so the op is hashable (required by __props__).
        assert isinstance(axis, (list, tuple))
        self.axis = tuple(axis)

    def get_params(self, node):
        # The params object handed to the C code is simply the axis tuple;
        # the `node` argument is unused.
        return self.axis

    def make_node(self, X):
        context_name = infer_context_name(X)
        # We keep the original broadcastable flags for dimensions on which
        # we do not perform the max / argmax.
        all_axes = set(self.axis)
        broadcastable = [b for i, b in enumerate(X.type.broadcastable)
                         if i not in all_axes]
        inputs = [as_gpuarray_variable(X, context_name)]
        outputs = [GpuArrayType(X.type.dtype, broadcastable, context_name=context_name, name='max')(),
                   GpuArrayType(self.argmax_dtype, broadcastable, context_name=context_name, name='argmax')()]
        return Apply(self, inputs, outputs)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray_helper.h>']

    def c_header_dirs(self):
        return [pygpu.get_include(), os.path.dirname(__file__)]

    def c_code(self, node, name, input_names, output_names, sub):
        """Generate C code that prepares the outputs and calls
        ``GpuArray_maxandargmax``.  The buffers malloc'd here are released
        in :meth:`c_code_cleanup`."""
        # Recall: X = input_names[0]
        # Recall: axes = sub['params']  (the axis tuple from get_params)
        # Recall: max, argmax = output_names
        # Recall: fail = sub['fail']
        max_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        argmax_typecode = pygpu.gpuarray.dtype_to_typecode(self.argmax_dtype)
        ret = """
        #if PY_MAJOR_VERSION >= 3
            #ifndef PyInt_AS_LONG
                #define PyInt_AS_LONG PyLong_AS_LONG
            #endif
        #endif

        unsigned %(name)s_redux_len = PyTuple_GET_SIZE(%(axes)s);
        unsigned* %(name)s_axes_to_reduce = (unsigned*)malloc(%(name)s_redux_len * sizeof(unsigned));
        for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
            /* PyTuple_GET_ITEM returns a *borrowed* reference: it must not
             * be decref'd (doing so would corrupt the refcount of the axis
             * integer objects). */
            PyObject* axis_object = PyTuple_GET_ITEM(%(axes)s, i);
            %(name)s_axes_to_reduce[i] = (unsigned) PyInt_AS_LONG(axis_object);
        }

        /* Output shape = input shape minus the reduced axes. */
        size_t %(name)s_input_ndim = PyGpuArray_NDIM(%(X)s);
        size_t %(name)s_output_ndim = %(name)s_input_ndim - %(name)s_redux_len;
        size_t* %(name)s_output_dims = (size_t*)malloc(%(name)s_output_ndim * sizeof(size_t));
        if (%(name)s_redux_len == 1) {
            /* Single reduced axis: copy every other dimension across. */
            for (unsigned i = 0; i < %(name)s_axes_to_reduce[0]; ++i) {
                %(name)s_output_dims[i] = PyGpuArray_DIM(%(X)s, i);
            }
            for (unsigned i = %(name)s_axes_to_reduce[0] + 1; i < %(name)s_input_ndim; ++i) {
                %(name)s_output_dims[i-1] = PyGpuArray_DIM(%(X)s, i);
            }
        } else {
            /* NOTE(review): this branch walks the reduced axes in the order
             * they appear in axes_to_reduce and assumes they are sorted in
             * increasing order; the CPU op appears to provide them sorted
             * -- TODO confirm. */
            int64_t current_input_pos = -1;
            int64_t current_output_pos = -1;
            for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
                for (++current_input_pos; current_input_pos < %(name)s_axes_to_reduce[i]; ++current_input_pos) {
                    %(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
                }
            }
            for (++current_input_pos; current_input_pos < %(name)s_input_ndim; ++current_input_pos) {
                %(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
            }
        }

        if (theano_prep_output(&%(max)s, %(name)s_output_ndim, %(name)s_output_dims, %(max_typecode)s, GA_C_ORDER, %(X)s->context)) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to prepare max output.");
            %(fail)s
        }
        if (theano_prep_output(&%(argmax)s, %(name)s_output_ndim, %(name)s_output_dims, %(argmax_typecode)s, GA_C_ORDER, %(X)s->context)) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to prepare argmax output.");
            %(fail)s
        }

        if (%(name)s_input_ndim == 0) {
            /* GpuArray_maxandargmax can't handle a 0-d array
             * because it expects that 1 <= redux_len <= input_ndim.
             * As input_ndim == 0, then 1 <= redux_len <= 0 is false.
             * To handle this case we copy input to max and we set argmax to 0.
             */
            if (GA_NO_ERROR != GpuArray_setarray(&%(max)s->ga, &%(X)s->ga)) {
                PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to copy input to max when input is a scalar.");
                %(fail)s
            }
            if (GA_NO_ERROR != GpuArray_memset(&%(argmax)s->ga, 0)) {
                PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to set argmax to 0 when input is a scalar.");
                %(fail)s
            }
        } else if (GA_NO_ERROR !=
            GpuArray_maxandargmax(&%(max)s->ga, &%(argmax)s->ga, &%(X)s->ga, %(name)s_redux_len, %(name)s_axes_to_reduce)
        ) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to compute gpuarray maxandargmax.");
            %(fail)s
        }
        """
        if theano.config.gpuarray.sync:
            ret += """
        GpuArray_sync(&%(max)s->ga);
        GpuArray_sync(&%(argmax)s->ga);
        """
        return ret % {'X': input_names[0], 'axes': sub['params'], 'max': output_names[0], 'argmax': output_names[1],
                      'max_typecode': max_typecode, 'argmax_typecode': argmax_typecode,
                      'name': name, 'fail': sub['fail']}

    def c_code_cleanup(self, node, name, inputs, outputs, sub):
        # Release the temporary buffers allocated in c_code.
        return """
        free(%(name)s_output_dims);
        free(%(name)s_axes_to_reduce);
        """ % {'name': name}
from __future__ import print_function, absolute_import, division
from unittest import TestCase
import numpy as np
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand_gpuarray
from .. import GpuArrayType
import math
# Number of values to be used in test tensors (except with 0-D tensors!).
test_size = 10000000
# NB: This order of "unsorted axes" is arbitrary and is here
# just to have the same information on profile output
# from one test to another.
unsorted_axes = (2, 4, 0, 3, 1)
# No argument: seed from OS entropy, so each test run uses fresh random data.
np.random.seed()
def numpy_random_array(shapes):
    """Return an array of the given shape filled with samples drawn from a
    standard normal distribution, cast to Theano's floatX dtype."""
    n_elements = int(np.prod(shapes, dtype='int64'))
    samples = np.random.normal(size=n_elements)
    return samples.astype(theano.config.floatX).reshape(shapes)
def numpy_maxandargmax(X, axis=None):
    """NumPy reference for a joint max / argmax over several axes.

    Mirrors MaxAndArgmax.perform(): since NumPy's argmax handles a single
    axis only, the kept axes are moved to the front and every reduced axis
    is collapsed into one trailing dimension, so the returned argmax is an
    index into the flattened reduced dimensions.
    """
    # Normalize `axis` to a sorted, duplicate-free tuple of ints.
    if axis is None:
        axis = tuple(range(X.ndim))
    elif isinstance(axis, (tuple, list)):
        axis = tuple(sorted(set(axis)))
    else:
        axis = (int(axis),)
    ref_max = np.max(X, axis=axis)
    # Kept (non-reduced) axes go in front; reduced axes are collapsed.
    kept = tuple(i for i in range(X.ndim) if i not in axis)
    moved = np.transpose(X, kept + axis)
    n_kept = len(kept)
    collapsed = 1
    for extent in moved.shape[n_kept:]:
        collapsed *= extent
    flat = moved.reshape(moved.shape[:n_kept] + (collapsed,))
    return (ref_max, np.argmax(flat, axis=-1))
def check_if_gpu_maxandargmax_in_graph(theano_function):
    """Assert that the compiled graph contains at least one GpuMaxAndArgmax node."""
    graph_nodes = theano_function.maker.fgraph.apply_nodes
    assert any(isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)
               for node in graph_nodes)
def check_if_gpu_maxandargmax_not_in_graph(theano_function):
    """Assert that the compiled graph contains no GpuMaxAndArgmax node."""
    graph_nodes = theano_function.maker.fgraph.apply_nodes
    assert not any(isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)
                   for node in graph_nodes)
class BaseTest:
    """Shared machinery: run max/argmax on host and on GPU over the same
    random tensor and compare both against the NumPy reference."""

    # Subclasses must set the tensor rank; `shape` may also be overridden.
    tensor_size = None
    shape = None
    dtype = theano.config.floatX

    def get_shape(self):
        # Choose one extent per dimension so the total element count is
        # roughly `test_size`, whatever the rank.
        if self.tensor_size == 0:
            return []
        edge = int(math.ceil(math.pow(test_size, 1 / self.tensor_size)))
        return [edge] * self.tensor_size

    def setUp(self):
        if not isinstance(self.tensor_size, int):
            raise SkipTest("No tensor ndim defined.")
        if self.tensor_size < 0 or self.tensor_size > 5:
            raise SkipTest("We allow from 0 (included) to 5 (inclued) dimensons for these tests.")
        if self.shape is None:
            self.shape = self.get_shape()

    def get_host_tensor(self):
        # Symbolic host tensor with no broadcastable dimensions.
        return T.tensor(self.dtype, (False,) * self.tensor_size)

    def get_gpu_tensor(self):
        # Symbolic GPU tensor with no broadcastable dimensions.
        return GpuArrayType(self.dtype, (False,) * self.tensor_size)()

    def get_host_value(self):
        return numpy_random_array(self.shape)

    def get_gpu_value(self):
        return rand_gpuarray(*self.shape)

    # NB: In compute_host() and compute_gpu(),
    # the first call of the theano function should be ignored in profiling,
    # with Theano config flag profiling.ignore_first_call=True.
    def compute_host(self, test_tensor, axis):
        symbolic = self.get_host_tensor()
        fn = theano.function(
            [symbolic], [T.max(symbolic, axis=axis), T.argmax(symbolic, axis=axis)],
            name='shape:' + str(test_tensor.shape) + '/axis:' + str(axis) + '/HOST',
            mode=mode_without_gpu)
        check_if_gpu_maxandargmax_not_in_graph(fn)
        # Warm-up call (see NB above about profiling).
        fn(test_tensor)
        theano_max, theano_argmax = fn(test_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_tensor, axis=axis)
        utt.assert_allclose(ref_max, theano_max)
        utt.assert_allclose(ref_argmax, theano_argmax)

    def compute_gpu(self, test_gpu_tensor, test_host_tensor, axis):
        symbolic = self.get_gpu_tensor()
        fn = theano.function(
            [symbolic], [T.max(symbolic, axis=axis), T.argmax(symbolic, axis=axis)],
            name='shape:' + str(test_gpu_tensor.shape) + '/axis:' + str(axis) + '/GPU',
            mode=mode_with_gpu)
        check_if_gpu_maxandargmax_in_graph(fn)
        # Warm-up call (see NB above about profiling).
        fn(test_gpu_tensor)
        theano_max, theano_argmax = fn(test_gpu_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_host_tensor, axis=axis)
        utt.assert_allclose(ref_max, theano_max)
        utt.assert_allclose(ref_argmax, theano_argmax)

    def compute(self, axis=None):
        # We want to run CPU op and GPU op on the same tensor randomly generated.
        gpu_value = self.get_gpu_value()
        host_value = np.asarray(gpu_value)
        self.compute_host(host_value, axis)
        self.compute_gpu(gpu_value, host_value, axis)

    def compute_axis(self, pos):
        # Skipped for vectors (axis 0 is the only choice, covered by None)
        # and for out-of-range positions.
        if self.tensor_size == 1:
            return
        if 0 <= pos < self.tensor_size:
            self.compute(pos)

    def compute_some_axes(self, count):
        if not (0 <= count < self.tensor_size):
            return
        candidates = [a for a in unsorted_axes if a < self.tensor_size]
        self.compute(candidates[:count])

    # Equivalent to test reduction on all axes.
    def test_none(self):
        self.compute(None)

    def test_axis_1(self):
        self.compute_axis(0)

    def test_axis_2(self):
        self.compute_axis(1)

    def test_axis_3(self):
        self.compute_axis(2)

    def test_axis_4(self):
        self.compute_axis(3)

    def test_axis_5(self):
        self.compute_axis(4)

    # For the tests below, we expect CPU op to run with Python implementation.
    def test_2_axes(self):
        self.compute_some_axes(2)

    def test_3_axes(self):
        self.compute_some_axes(3)

    def test_4_axes(self):
        self.compute_some_axes(4)
# 0-D case: exercises the scalar special path of the op.
class TestScalar(BaseTest, TestCase):
    tensor_size = 0
# 1-D case: per-axis tests are no-ops here (compute_axis skips rank 1).
class TestVector(BaseTest, TestCase):
    tensor_size = 1
# Special case: degenerate leading dimension of size 1 (row matrix).
class TestRow(BaseTest, TestCase):
    tensor_size = 2
    shape = [1, test_size]
# Special case: degenerate trailing dimension of size 1 (column matrix).
class TestColumn(BaseTest, TestCase):
    tensor_size = 2
    shape = [test_size, 1]
# 2-D case with a roughly square shape derived from test_size.
class TestMatrix(BaseTest, TestCase):
    tensor_size = 2
# 5-D case: the highest rank these tests allow (see BaseTest.setUp).
class TestTensor5(BaseTest, TestCase):
    tensor_size = 5
差异被折叠。
......@@ -1568,9 +1568,9 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node):
@opt.register_specialize('fast_compile_gpu')
@gof.local_optimizer([tensor._max_and_argmax])
@gof.local_optimizer([tensor.MaxAndArgmax])
def local_argmax_pushdown(node):
if node.op == tensor._max_and_argmax and node.inputs[0].owner and \
if isinstance(node.op, tensor.MaxAndArgmax) and node.inputs[0].owner and \
len(node.outputs[0].clients) > 0 and node.inputs[0].owner.op in \
(softmax_op, softplus, tensor.exp, tensor.log, tensor.tanh, sigmoid,
softmax_with_bias):
......@@ -1584,23 +1584,24 @@ def local_argmax_pushdown(node):
"warning set the Theano flags 'warn.argmax_pushdown_bug' "
"to False")
if (node.op == tensor._max_and_argmax and
if (isinstance(node.op, tensor.MaxAndArgmax) and
node.inputs[0].owner and len(node.outputs[0].clients) == 0):
x_max, x_argmax = node.outputs
x, axis = node.inputs
x = node.inputs[0]
axis = node.op.get_params(node)
# TODO: Make a list/set of monotonic ops...
if x.owner and x.owner.op in (softmax_op, softplus, tensor.exp,
tensor.log, tensor.tanh, sigmoid):
pre_x, = x.owner.inputs
ret = tensor._max_and_argmax(pre_x, axis)
ret = tensor.max_and_argmax(pre_x, axis)
copy_stack_trace(x_max, ret)
return ret
if x.owner and x.owner.op == softmax_with_bias:
pre_x, pre_bias = x.owner.inputs
ret = tensor._max_and_argmax(pre_x +
tensor.DimShuffle(
pre_bias.broadcastable,
('x', 0))(pre_bias), axis)
ret = tensor.max_and_argmax(pre_x +
tensor.DimShuffle(
pre_bias.broadcastable,
('x', 0))(pre_bias), axis)
# copy both stack traces
copy_stack_trace(x_max, ret)
return ret
......
......@@ -41,8 +41,6 @@ from theano.tensor.elemwise import CAReduce
from theano.tensor import basic as T
from theano.tensor import DimShuffle
from theano.tensor.basic import (get_scalar_constant_value,
NotScalarConstantError)
from theano.tensor.opt import register_uncanonicalize
from theano import scalar as scal
......@@ -50,31 +48,19 @@ _logger = logging.getLogger('theano.tensor.opt')
@register_uncanonicalize
@gof.local_optimizer([T._max_and_argmax])
@gof.local_optimizer([T.MaxAndArgmax])
def local_max_and_argmax(node):
"""
If we don't use the argmax, change it to a max only.
"""
if node.op == T._max_and_argmax:
if isinstance(node.op, T.MaxAndArgmax):
axis = node.op.get_params(node)
if len(node.outputs[1].clients) == 0:
# MaxAndArgmax support variable axis,
# but CAReduce support only constant axis.
if node.inputs[1].data is None:
axis = None
else:
try:
axis = get_scalar_constant_value(node.inputs[1])
except NotScalarConstantError:
axis = node.inputs[1]
if not isinstance(axis, T.TensorConstant):
return False
axis = axis.data
new = CAReduce(scal.maximum, axis)(node.inputs[0])
return [new, None]
if len(node.outputs[0].clients) == 0:
return [None, T._argmax(node.inputs[0], node.inputs[1])]
return [None, T._argmax(node.inputs[0], axis)]
@register_uncanonicalize
......
......@@ -7619,23 +7619,23 @@ class TestInferShape(utt.InferShapeTester):
# MaxAndArgmax,
adtens3_val = rand(4, 5, 3)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, None),
max_and_argmax(adtens3, None),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, 0),
max_and_argmax(adtens3, 0),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, 1),
max_and_argmax(adtens3, 1),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, 2),
max_and_argmax(adtens3, 2),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, [0, 1, 2]),
max_and_argmax(adtens3, [0, 1, 2]),
[adtens3_val], MaxAndArgmax)
# ARange
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论