Commit 218d006c authored by notoraptor

This is my proposal for GpuMaxAndArgmax (issue #1399).

The GpuMaxAndArgmax op is implemented in a separate file in theano/gpuarray, together with a test script. To integrate the op into Theano, an optimization has been added to theano/gpuarray/opt.py.
Parent 29af0e5b
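For illustration, here is a minimal usage sketch (not part of the commit; it assumes a working libgpuarray backend and THEANO_FLAGS=device=cuda,floatX=float32): T.max and T.argmax build a CPU MaxAndArgmax node, which the optimization below should lift to GpuMaxAndArgmax at compilation time.

import theano
import theano.tensor as T

x = T.ftensor3()
# max/argmax over several axes goes through tensor.MaxAndArgmax, the op
# targeted by the new lifter.
f = theano.function([x], [T.max(x, axis=(0, 2)), T.argmax(x, axis=(0, 2))])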
import os

import numpy
from six import integer_types

import theano
from theano.gof import Variable, Op, Apply
from theano.tensor.type_other import NoneConst
from theano.tensor.var import TensorConstant
# from theano.tensor import as_tensor_variable
from .basic_ops import (infer_context_name, as_gpuarray_variable)
from .type import GpuArrayType

try:
    import pygpu
except ImportError:
    pass
class GpuMaxAndArgmax(Op):
    """
    GPU version of MaxAndArgmax.
    """
    __props__ = ()
    argmax_dtype = "int64"

    def make_node(self, X, axis=None):
        context_name = infer_context_name(X)
        # Check axis and convert it to a Python list of ints (or None).
        if (isinstance(axis, (integer_types, numpy.integer)) or
                (isinstance(axis, numpy.ndarray) and axis.ndim == 0)):
            axis = [int(axis)]
        elif isinstance(axis, (tuple, list, numpy.ndarray)):
            axis = [int(a) for a in axis]
        elif isinstance(axis, Variable):
            if NoneConst.equals(axis):
                axis = None
            elif not isinstance(axis, TensorConstant):
                raise TypeError("MaxAndArgmax needs a constant axis. Got %s" % axis)
            else:
                assert (axis.dtype.startswith("int") or
                        axis.dtype.startswith("uint"))
                if (isinstance(axis.data, (integer_types, numpy.integer)) or
                        (isinstance(axis.data, numpy.ndarray) and axis.data.ndim == 0)):
                    axis = [int(axis.data)]
                elif isinstance(axis.data, (list, numpy.ndarray)):
                    axis = [int(i) for i in axis.data]
        # Make axis entries non-negative, and verify that axes are valid.
        if isinstance(axis, list):
            for idx in range(len(axis)):
                if axis[idx] < 0:
                    axis[idx] += X.type.ndim
                if axis[idx] < 0 or axis[idx] >= X.type.ndim:
                    raise ValueError('Invalid axis: %s (the number of dimensions '
                                     'of the input is: %s)' % (axis[idx], X.type.ndim))
        # Sort axes and make them unique.
        axis_set = set()  # used to build the "broadcastable" pattern below.
        all_axes = []
        if isinstance(axis, list):
            axis_set = set(axis)
            all_axes = sorted(axis_set)
            if all_axes == list(range(X.type.ndim)):
                axis = None
        else:
            all_axes = list(range(X.ndim))
            axis_set = set(all_axes)
        if axis is None:
            axis = NoneConst.clone()
        else:
            axis = theano.tensor.as_tensor_variable(all_axes)
            assert axis.ndim == 1
        inputs = [as_gpuarray_variable(X, context_name), axis]
        # We keep the original broadcastable flags for dimensions on which
        # we do not perform the max / argmax.
        broadcastable = [b for i, b in enumerate(X.type.broadcastable)
                         if i not in axis_set]
        outputs = [GpuArrayType(X.type.dtype, broadcastable,
                                context_name=context_name, name='max')(),
                   GpuArrayType(self.argmax_dtype, broadcastable,
                                context_name=context_name, name='argmax')()]
        return Apply(self, inputs, outputs)
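    # Worked illustration of the axis normalization above (hypothetical
    # values): for a 3-D input, axis=-1 becomes [2]; axis=(2, 0, 2) gives
    # all_axes == [0, 2]; axis=(0, 1, 2) or axis=None reduces every axis,
    # so the graph input becomes NoneConst and axis_set == {0, 1, 2}.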
    def perform(self, node, inputs, outputs):
        X, axes = inputs
        max, max_idx = outputs
        if axes is None:
            axes = tuple(range(X.ndim))
        else:
            axes = tuple(int(ax) for ax in axes)
        max[0] = theano._asarray(numpy.max(X, axes),
                                 dtype=node.outputs[0].dtype)
        # NumPy does not support multiple axes for argmax; work around that
        # by moving the non-reduced axes to the front and flattening the
        # reduced axes into a single trailing dimension.
        keep_axes = numpy.array([i for i in range(X.ndim) if i not in axes],
                                dtype='int64')
        transposed_x = numpy.transpose(X, numpy.concatenate((keep_axes, axes)))
        kept_shape = transposed_x.shape[:len(keep_axes)]
        reduced_shape = transposed_x.shape[len(keep_axes):]
        new_shape = kept_shape + (numpy.prod(reduced_shape),)
        reshaped_x = transposed_x.reshape(new_shape)
        max_idx[0] = theano._asarray(numpy.argmax(reshaped_x, axis=-1),
                                     dtype='int64')
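    # Worked illustration of the workaround above (hypothetical shapes):
    # for X.shape == (2, 3, 4) and axes == (1, 2), keep_axes == [0],
    # transposed_x.shape == (2, 3, 4), reshaped_x.shape == (2, 12), and
    # numpy.argmax(reshaped_x, axis=-1) returns, for each kept row, one
    # index into the 12 flattened reduced positions.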
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray_helper.h>']

    def c_header_dirs(self):
        return [pygpu.get_include(), os.path.dirname(__file__)]

    def c_code(self, node, name, input_names, output_names, sub):
        # Recall: X, axes = input_names
        # Recall: max, argmax = output_names
        # Recall: fail = sub['fail']
        max_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        argmax_typecode = pygpu.gpuarray.dtype_to_typecode(self.argmax_dtype)
        # The axes input is created with dtype int64 in make_node.
        axes_ctype = 'int64_t'
        ret = """
        GpuArray* %(name)s_input = &%(X)s->ga;
        size_t %(name)s_input_ndim = PyGpuArray_NDIM(%(X)s);
        """
        if NoneConst.equals(node.inputs[1]):
            # Reduce over all axes.
            ret += """
            unsigned %(name)s_redux_len = %(name)s_input_ndim;
            unsigned* %(name)s_axes_to_reduce = new unsigned[%(name)s_redux_len];
            for(unsigned i = 0; i < %(name)s_redux_len; ++i) {
                %(name)s_axes_to_reduce[i] = i;
            }
            """
        else:
            # Copy the axes to reduce from the int64 axes vector.
            assert node.inputs[1].ndim == 1
            ret += """
            unsigned %(name)s_redux_len = PyArray_DIM(%(axes)s, 0);
            unsigned* %(name)s_axes_to_reduce = new unsigned[%(name)s_redux_len];
            for(unsigned i = 0; i < %(name)s_redux_len; ++i) {
                %(name)s_axes_to_reduce[i] =
                    (unsigned)( ((%(axes_ctype)s*)PyArray_DATA(%(axes)s))
                                [i * (PyArray_STRIDES(%(axes)s)[0] / sizeof(%(axes_ctype)s))] );
            }
            """
        ret += """
        size_t %(name)s_output_ndim = %(name)s_input_ndim - %(name)s_redux_len;
        size_t* %(name)s_output_dims = NULL;
        if(%(name)s_output_ndim == 0) {
            /* The current backend function GpuArray_maxandargmax does not work
             * when all axes need to be reduced. To handle this case, we create
             * a view of the input as a matrix with 1 row and as many columns as
             * elements in the input, so that the 2nd dimension of the matrix
             * will be reduced. For example, an input of shape (2, 3, 4) is
             * viewed as a (1, 24) matrix, and reducing axis 1 of that view
             * yields the scalar max and a flat argmax. */
            size_t total_size = 1;
            for(size_t i = 0; i < %(name)s_input_ndim; ++i) {
                total_size *= PyGpuArray_DIM(%(X)s, i);
            }
            size_t newdims[2] = {1, total_size};
            %(name)s_input = new GpuArray;
            if(GA_NO_ERROR !=
               GpuArray_reshape(%(name)s_input, &%(X)s->ga, 2, newdims, GA_ANY_ORDER, 0)
            ) {
                %(fail)s
            }
            %(name)s_redux_len = 1;
            %(name)s_axes_to_reduce[0] = 1;
        } else {
            %(name)s_output_dims = new size_t[%(name)s_output_ndim];
            if(%(name)s_redux_len == 1) {
                /* A single axis is reduced: copy the other dimensions. */
                for(unsigned i = 0; i < %(name)s_axes_to_reduce[0]; ++i) {
                    %(name)s_output_dims[i] = PyGpuArray_DIM(%(X)s, i);
                }
                for(unsigned i = %(name)s_axes_to_reduce[0] + 1; i < %(name)s_input_ndim; ++i) {
                    %(name)s_output_dims[i-1] = PyGpuArray_DIM(%(X)s, i);
                }
            } else {
                /* Several axes are reduced: walk the input dimensions and
                 * keep those not in the (sorted) list of reduced axes. */
                int64_t current_input_pos = -1;
                int64_t current_output_pos = -1;
                for(unsigned i = 0; i < %(name)s_redux_len; ++i) {
                    for(++current_input_pos; current_input_pos < %(name)s_axes_to_reduce[i]; ++current_input_pos) {
                        %(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
                    }
                }
                for(++current_input_pos; current_input_pos < %(name)s_input_ndim; ++current_input_pos) {
                    %(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
                }
            }
        }
        if(theano_prep_output(&%(max)s, %(name)s_output_ndim, %(name)s_output_dims,
                              %(max_typecode)s, GA_C_ORDER, %(X)s->context)) {
            %(fail)s
        }
        if(theano_prep_output(&%(argmax)s, %(name)s_output_ndim, %(name)s_output_dims,
                              %(argmax_typecode)s, GA_C_ORDER, %(X)s->context)) {
            %(fail)s
        }
        if(GA_NO_ERROR !=
           GpuArray_maxandargmax(&%(max)s->ga, &%(argmax)s->ga, %(name)s_input,
                                 %(name)s_redux_len, %(name)s_axes_to_reduce)
        ) {
            %(fail)s
        }
        """
        if theano.config.gpuarray.sync:
            ret += """
            GpuArray_sync(&%(max)s->ga);
            GpuArray_sync(&%(argmax)s->ga);
            """
        return ret % {'X': input_names[0], 'axes': input_names[1],
                      'max': output_names[0], 'argmax': output_names[1],
                      'axes_ctype': axes_ctype,
                      'max_typecode': max_typecode,
                      'argmax_typecode': argmax_typecode,
                      'name': name, 'fail': sub['fail']}
    def c_code_cleanup(self, node, name, inputs, outputs, sub):
        # output_dims is NULL (safe to delete[]) when all axes were reduced;
        # the reshaped view allocated in that case is freed here as well.
        return """
        delete[] %(name)s_output_dims;
        if(%(name)s_input != &%(X)s->ga) delete %(name)s_input;
        delete[] %(name)s_axes_to_reduce;
        """ % {'name': name, 'X': inputs[0]}
gpu_maxandargmax = GpuMaxAndArgmax()
...
@@ -1775,6 +1775,14 @@ def _scan_type_infer(node):
                            context_name=context_name)
     return typebuild

## Add optimization: MaxAndArgmax (CPU -> GPU).
@register_opt('fast_compile')
@op_lifter([tensor.MaxAndArgmax])
@register_opt2([tensor.MaxAndArgmax], 'fast_compile')
def local_gpu_maxandargmax(op, context_name, inputs, outputs):
    return gpu_maxandargmax
##

# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
...
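As a quick sanity check (a sketch, not part of the commit; the module holding GpuMaxAndArgmax is not named here, so the op is matched by class name), one can inspect the optimized graph to confirm that the lifter fired:

import theano
import theano.tensor as T

x = T.fmatrix()
f = theano.function([x], T.max_and_argmax(x, axis=0))
# With device=cuda, a GpuMaxAndArgmax apply node should replace the CPU op.
print(any(type(node.op).__name__ == 'GpuMaxAndArgmax'
          for node in f.maker.fgraph.toposort()))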
from unittest import TestCase

import numpy as np

import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt


def randomTensor(*shapes):
    size = 1
    for dimsize in shapes:
        size *= dimsize
    return np.random.normal(size=size).astype(np.float32).reshape(shapes)


def numpyMaxAndArgmax(X, axis=None):
    if axis is None:
        axis = range(X.ndim)
    elif not isinstance(axis, (tuple, list)):
        axis = [axis]
    axis = list(set(axis))  # remove duplicated values
    axis.sort()
    axis = tuple(axis)
    ref_max = np.max(X, axis=axis)
    # NumPy does not support multiple axes for argmax; work around it.
    # Code copied from MaxAndArgmax.perform().
    keep_axes = np.array([i for i in range(X.ndim) if i not in axis],
                         dtype='int64')
    # Not-reduced axes in front.
    transposed_x = np.transpose(X, np.concatenate((keep_axes, axis)))
    kept_shape = transposed_x.shape[:len(keep_axes)]
    reduced_shape = transposed_x.shape[len(keep_axes):]
    new_shape = kept_shape + (np.prod(reduced_shape),)
    reshaped_x = transposed_x.reshape(new_shape)
    return (ref_max, np.argmax(reshaped_x, axis=-1))
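# Example (sketch): for X of shape (2, 3, 4) and axis=(0, 2),
# numpyMaxAndArgmax(X, (0, 2)) returns two arrays of shape (3,): the max
# over the 2*4 = 8 reduced positions, and the argmax into those flattened
# positions (values in [0, 8)).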
class TestGpuMaxAndArgmax(TestCase):
    # We run all tests with 5-D tensors of 10,000,000 elements.
    # NB: in each test, the first call of the Theano function should be
    # ignored, using the Theano config flag profiling.ignore_first_call=True.
    # To check that GpuMaxAndArgmax is called:
    # $ theano-cache purge && THEANO_FLAGS=floatX=float32,device=cuda,profile=True,profiling.ignore_first_call=True \
    #   nosetests --verbose theano/gpuarray/tests/test_GpuMaxAndArgmax.py:TestGpuMaxAndArgmax.test_none

    def _basic_test_tensor5(self, axis=None):
        M = T.tensor5()
        max_M = T.max(M, axis=axis)
        argmax_M = T.argmax(M, axis=axis)
        f = theano.function([M], [max_M, argmax_M])
        test_matrix = randomTensor(1000, 100, 10, 5, 2)
        # First call, ignored for profiling (see above).
        f(test_matrix)
        theano_max, theano_argmax = f(test_matrix)
        ref_max, ref_argmax = numpyMaxAndArgmax(test_matrix, axis=axis)
        utt.assert_allclose(ref_max, theano_max)
        utt.assert_allclose(ref_argmax, theano_argmax)

    def _basic_test_assert_equals(self, axis1, axis2):
        M1 = T.tensor5()
        M2 = T.tensor5()
        f1 = theano.function([M1], [T.max(M1, axis=axis1), T.argmax(M1, axis=axis1)])
        f2 = theano.function([M2], [T.max(M2, axis=axis2), T.argmax(M2, axis=axis2)])
        test_matrix = randomTensor(1000, 100, 10, 5, 2)
        f1(test_matrix)
        f2(test_matrix)
        theano1 = f1(test_matrix)
        theano2 = f2(test_matrix)
        ref1 = numpyMaxAndArgmax(test_matrix, axis1)
        ref2 = numpyMaxAndArgmax(test_matrix, axis2)
        utt.assert_allclose(ref1, ref2)
        utt.assert_allclose(theano1, theano2)
        utt.assert_allclose(ref1, theano1)

    def test_none(self):
        self._basic_test_tensor5(None)

    def test_all_axes(self):
        self._basic_test_tensor5((0, 1, 2, 3, 4))

    def test_1_axis(self):
        self._basic_test_tensor5(3)

    def test_2_axes(self):
        self._basic_test_tensor5((0, 3))

    def test_3_axes(self):
        self._basic_test_tensor5((0, 3, 4))

    def test_4_axes(self):
        self._basic_test_tensor5((0, 1, 2, 4))

    def test_simple(self):
        self._basic_test_tensor5(None)
        self._basic_test_tensor5((0, 1, 2, 3, 4))
        self._basic_test_tensor5((4, 1, 3, 2))

    def test_assert_equals(self):
        self._basic_test_assert_equals(None, (0, 1, 2, 3, 4))
        self._basic_test_assert_equals(0, (0, 0))
        self._basic_test_assert_equals((4, 1, 3, 2), (1, 2, 3, 4))
        self._basic_test_assert_equals((4, 3, 2, 1, 0), None)
        self._basic_test_assert_equals((1, 3, 4),
                                       (1, 4, 4, 1, 3, 1, 3, 4, 3, 1, 1, 3, 1, 4, 1, 4))

    def test_simple_1_axis(self):
        for axis in range(5):
            self._basic_test_tensor5(axis)

    def test_simple_2_axis(self):
        # Covers every ordered pair of axes, including repeated axes
        # such as (0, 0) and reversed orders such as (3, 1).
        for axis1 in range(5):
            for axis2 in range(5):
                self._basic_test_tensor5((axis1, axis2))