提交 4a8fed96 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5923 from nouiz/float16

Don't move ops that don't support float16 to the GPU
......@@ -50,6 +50,7 @@ class ViewOp(gof.Op):
# the output variable is %(oname)s.
c_code_and_version = {}
__props__ = ()
_f16_ok = True
def make_node(self, x):
return gof.Apply(self, [x], [x.type()])
......@@ -151,6 +152,7 @@ class DeepCopyOp(gof.Op):
check_input = False
__props__ = ()
_f16_ok = True
def __init__(self):
pass
......@@ -659,6 +661,7 @@ class Rebroadcast(gof.Op):
check_input = False
__props__ = ("axis",)
_f16_ok = True
def __init__(self, *axis):
# Sort them to make sure we merge all possible case.
......@@ -820,6 +823,7 @@ class SpecifyShape(gof.Op):
# the output variable is %(oname)s.
c_code_and_version = {}
__props__ = ()
_f16_ok = True
def make_node(self, x, shape):
if not isinstance(x, gof.Variable):
......
......@@ -14,189 +14,6 @@ if os.path.exists(os.path.join(config.compiledir, 'cutils_ext.so')):
os.remove(os.path.join(config.compiledir, 'cutils_ext.so'))
def compile_cutils_code():
    """Return the C source implementing the ``inplace_increment`` helper
    of ``cutils_ext``.

    For every numeric/complex numpy dtype that may exist on this platform
    (each kernel is guarded by ``#if defined(NPY_...)`` since the unusual
    sizes such as int128 or float80 are platform-dependent), the generated
    code defines an in-place add/set loop over a ``PyArrayMapIterObject``,
    builds dispatch tables mapping numpy type numbers to those kernels,
    and defines the ``inplace_increment`` entry point that parses
    ``(a, index, inc[, inc_or_set])`` and applies the matching kernel.
    The whole block is compiled only when
    ``NPY_API_VERSION >= 0x00000008`` (numpy >= 1.8), which provides
    ``PyArray_MapIterArray``.

    Returns
    -------
    str
        C source code to be appended to the ``cutils_ext`` module source.
    """
    # Candidate scalar dtypes; guarded by #if defined because only a
    # subset exists on any given platform.
    types = ['npy_' + t for t in ['int8', 'int16', 'int32', 'int64', 'int128',
                                  'int256', 'uint8', 'uint16', 'uint32',
                                  'uint64', 'uint128', 'uint256',
                                  'float16', 'float32', 'float64',
                                  'float80', 'float96', 'float128',
                                  'float256']]
    complex_types = ['npy_' + t for t in ['complex32', 'complex64',
                                          'complex128', 'complex160',
                                          'complex192', 'complex512']]

    # One kernel per dtype: walk the map iterator and either accumulate
    # into (inc_or_set != 0) or overwrite (inc_or_set == 0) each selected
    # element.
    inplace_map_template = """
    #if defined(%(typen)s)
    static void %(type)s_inplace_add(PyArrayMapIterObject *mit,
                                     PyArrayIterObject *it, int inc_or_set)
    {
        int index = mit->size;
        while (index--) {
            %(op)s
            PyArray_MapIterNext(mit);
            PyArray_ITER_NEXT(it);
        }
    }
    #endif
    """

    # Scalar add/set for real dtypes is a single statement ...
    floatadd = ("((%(type)s*)mit->dataptr)[0] = "
                "(inc_or_set ? ((%(type)s*)mit->dataptr)[0] : 0)"
                " + ((%(type)s*)it->dataptr)[0];")
    # ... while complex dtypes update the real and imaginary parts
    # separately.
    complexadd = """
    ((%(type)s*)mit->dataptr)[0].real =
        (inc_or_set ? ((%(type)s*)mit->dataptr)[0].real : 0)
        + ((%(type)s*)it->dataptr)[0].real;
    ((%(type)s*)mit->dataptr)[0].imag =
        (inc_or_set ? ((%(type)s*)mit->dataptr)[0].imag : 0)
        + ((%(type)s*)it->dataptr)[0].imag;
    """

    fns = ''.join([inplace_map_template % {'type': t, 'typen': t.upper(),
                                           'op': floatadd % {'type': t}}
                   for t in types] +
                  [inplace_map_template % {'type': t, 'typen': t.upper(),
                                           'op': complexadd % {'type': t}}
                   for t in complex_types])

    def gen_binop(type, typen):
        # One guarded entry in the kernel dispatch table.
        return """
    #if defined(%(typen)s)
    %(type)s_inplace_add,
    #endif
    """ % dict(type=type, typen=typen)

    fn_array = ("static inplace_map_binop addition_funcs[] = {" +
                ''.join([gen_binop(type=t, typen=t.upper())
                         for t in types + complex_types]) + "NULL};\n")

    def gen_num(typen):
        # One guarded entry in the type-number dispatch table.  The
        # template only uses %(typen)s, so only that key is supplied
        # (the previous version also passed the builtin `type` by
        # accident).
        return """
    #if defined(%(typen)s)
    %(typen)s,
    #endif
    """ % dict(typen=typen)

    type_number_array = ("static int type_numbers[] = {" +
                         ''.join([gen_num(typen=t.upper())
                                  for t in types + complex_types]) + "-1000};")

    code = ("""
    #if NPY_API_VERSION >= 0x00000008
    typedef void (*inplace_map_binop)(PyArrayMapIterObject *,
                                      PyArrayIterObject *, int inc_or_set);
    """ + fns + fn_array + type_number_array + """
    static int
    map_increment(PyArrayMapIterObject *mit, PyObject *op,
                  inplace_map_binop add_inplace, int inc_or_set)
    {
        PyArrayObject *arr = NULL;
        PyArrayIterObject *it;
        PyArray_Descr *descr;
        if (mit->ait == NULL) {
            return -1;
        }
        descr = PyArray_DESCR(mit->ait->ao);
        Py_INCREF(descr);
        arr = (PyArrayObject *)PyArray_FromAny(op, descr,
                    0, 0, NPY_ARRAY_FORCECAST, NULL);
        if (arr == NULL) {
            return -1;
        }
        if ((mit->subspace != NULL) && (mit->consec)) {
            PyArray_MapIterSwapAxes(mit, (PyArrayObject **)&arr, 0);
            if (arr == NULL) {
                return -1;
            }
        }
        it = (PyArrayIterObject*)
                PyArray_BroadcastToShape((PyObject*)arr, mit->dimensions, mit->nd);
        if (it == NULL) {
            Py_DECREF(arr);
            return -1;
        }
        (*add_inplace)(mit, it, inc_or_set);
        Py_DECREF(arr);
        Py_DECREF(it);
        return 0;
    }

    static PyObject *
    inplace_increment(PyObject *dummy, PyObject *args)
    {
        PyObject *arg_a = NULL, *index=NULL, *inc=NULL;
        int inc_or_set = 1;
        PyArrayObject *a;
        inplace_map_binop add_inplace = NULL;
        int type_number = -1;
        int i = 0;
        PyArrayMapIterObject * mit;

        if (!PyArg_ParseTuple(args, "OOO|i", &arg_a, &index,
                &inc, &inc_or_set)) {
            return NULL;
        }
        if (!PyArray_Check(arg_a)) {
            PyErr_SetString(PyExc_ValueError,
                            "needs an ndarray as first argument");
            return NULL;
        }
        a = (PyArrayObject *) arg_a;
        if (PyArray_FailUnlessWriteable(a, "input/output array") < 0) {
            return NULL;
        }
        if (PyArray_NDIM(a) == 0) {
            PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed.");
            return NULL;
        }
        type_number = PyArray_TYPE(a);
        while (type_numbers[i] >= 0 && addition_funcs[i] != NULL){
            if (type_number == type_numbers[i]) {
                add_inplace = addition_funcs[i];
                break;
            }
            i++ ;
        }
        if (add_inplace == NULL) {
            PyErr_SetString(PyExc_TypeError, "unsupported type for a");
            return NULL;
        }
        mit = (PyArrayMapIterObject *) PyArray_MapIterArray(a, index);
        if (mit == NULL) {
            goto fail;
        }
        if (map_increment(mit, inc, add_inplace, inc_or_set) != 0) {
            goto fail;
        }
        Py_DECREF(mit);
        Py_INCREF(Py_None);
        return Py_None;
    fail:
        Py_XDECREF(mit);
        return NULL;
    }
    #endif
    """)
    return code
def compile_cutils():
"""
Do just the compilation of cutils_ext.
......@@ -204,7 +21,6 @@ def compile_cutils():
"""
code = ("""
#include <Python.h>
#include "numpy/arrayobject.h"
#include "theano_mod_helper.h"
extern "C"{
......@@ -226,18 +42,10 @@ def compile_cutils():
int failure = fn(it);
return Py_BuildValue("i", failure);
}""")
code += compile_cutils_code()
code += ("""static PyMethodDef CutilsExtMethods[] = {
}
static PyMethodDef CutilsExtMethods[] = {
{"run_cthunk", run_cthunk, METH_VARARGS|METH_KEYWORDS,
"Run a theano cthunk."},
#if NPY_API_VERSION >= 0x00000008
{"inplace_increment", inplace_increment,
METH_VARARGS,
"increments a numpy array inplace at the passed indexes."},
#endif
{NULL, NULL, 0, NULL} /* Sentinel */
};""")
if PY3:
......@@ -256,7 +64,6 @@ def compile_cutils():
PyMODINIT_FUNC
PyInit_cutils_ext(void) {
import_array();
return PyModule_Create(&moduledef);
}
}
......@@ -266,7 +73,6 @@ def compile_cutils():
PyMODINIT_FUNC
initcutils_ext(void)
{
import_array();
(void) Py_InitModule("cutils_ext", CutilsExtMethods);
}
} //extern C
......
......@@ -261,7 +261,7 @@ class GpuCholesky(Op):
raise RuntimeError('CUSOLVER is not available and '
'GpuCholesky Op can not be constructed.')
if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
warnings.warn('The GpuCholesky op requires scikit-cuda > 0.5.1 to work with CUDA 8')
if not pygpu_available:
raise RuntimeError('Missing pygpu or triu/tril functions.'
'Install or update libgpuarray.')
......@@ -382,6 +382,7 @@ class GpuMagmaSVD(COp):
A = as_gpuarray_variable(A, ctx_name)
if A.ndim != 2:
raise LinAlgError("Matrix rank error")
assert A.dtype == 'float32'
if self.compute_uv:
return theano.Apply(self, [A],
[A.type(),
......@@ -476,6 +477,7 @@ class GpuMagmaMatrixInverse(COp):
def make_node(self, x):
ctx_name = infer_context_name(x)
x = as_gpuarray_variable(x, ctx_name)
assert x.dtype == 'float32'
if x.ndim != 2:
raise LinAlgError("Matrix rank error")
return theano.Apply(self, [x], [x.type()])
......
......@@ -1181,6 +1181,14 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
@register_opt2([tensor.blas.Gemv], 'fast_compile')
def local_gpua_gemv(op, context_name, inputs, outputs):
if inputs[0].dtype == 'float16':
# Use gemm implementation as cublas gemv don't support float16
return gpugemm_no_inplace(inputs[0][:, None],
inputs[1],
inputs[2],
inputs[3][:, None],
inputs[4]).dimshuffle(0)
if inputs[0].dtype not in ['float32', 'float64']:
return
if op.inplace:
......@@ -1351,6 +1359,8 @@ theano.tensor.nnet.conv2d()
@op_lifter([SparseBlockGemv])
@register_opt2([SparseBlockGemv], 'fast_compile')
def local_gpua_sparseblockgemv(op, context_name, inputs, outputs):
if inputs[0].dtype == 'float16':
return
if op.inplace:
return gpu_sparse_block_gemv_inplace
else:
......@@ -1361,6 +1371,8 @@ def local_gpua_sparseblockgemv(op, context_name, inputs, outputs):
@op_lifter([SparseBlockOuter])
@register_opt2([SparseBlockOuter], 'fast_compile')
def local_gpua_sparseblockouter(op, context_name, inputs, outputs):
if inputs[0].dtype == 'float16':
return
if op.inplace:
return gpu_sparse_block_outer_inplace
else:
......@@ -1998,7 +2010,13 @@ def _scan_type_infer(node):
@op_lifter([tensor.MaxAndArgmax])
@register_opt2([tensor.MaxAndArgmax], 'fast_compile')
def local_gpu_maxandargmax(op, context_name, inputs, outputs):
return GpuMaxAndArgmax(op.get_params(None))
op = GpuMaxAndArgmax(op.get_params(None))
if inputs[0].dtype == "float16":
# For now it is better to copy/cast on the GPU then transfer to the CPU
casted_inputs = inputs[0].astype('float32')
ret = op(casted_inputs)
return [ret[0].astype('float16'), ret[1]]
return op
# solve
......@@ -2008,9 +2026,15 @@ def local_gpu_maxandargmax(op, context_name, inputs, outputs):
def local_gpu_solve(op, context_name, inputs, outputs):
if not cusolver_available:
return
if inputs[0].dtype not in ['float16', 'float32']:
return
if op.A_structure not in MATRIX_STRUCTURES_SOLVE:
return
return GpuCusolverSolve(A_structure=op.A_structure)
op = GpuCusolverSolve(A_structure=op.A_structure)
if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32'),
inputs[1].astype('float32')).astype('float16')
return op
@register_inplace()
......@@ -2028,7 +2052,13 @@ def local_inplace_gpu_solve(node):
def local_gpu_cholesky(op, context_name, inputs, outputs):
if not cusolver_available:
return
return GpuCholesky(lower=op.lower, inplace=op.destructive)
if inputs[0].dtype not in ['float16', 'float32']:
return
op = GpuCholesky(lower=op.lower, inplace=op.destructive)
if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32')).astype('float16')
return op
@register_inplace()
......@@ -2044,7 +2074,12 @@ def local_inplace_cholesky(node):
def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
if not config.magma.enabled:
return
return GpuMagmaMatrixInverse()
if inputs[0].dtype not in ['float16', 'float32']:
return
op = GpuMagmaMatrixInverse()
if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32')).astype('float16')
return op
@register_inplace()
......@@ -2061,9 +2096,13 @@ def local_inplace_matrix_inverse_inplace(node):
def local_gpu_svd(op, context_name, inputs, outputs):
if not config.magma.enabled:
return
return GpuMagmaSVD(full_matrices=op.full_matrices,
compute_uv=op.compute_uv)
if inputs[0].dtype not in ['float16', 'float32']:
return
op = GpuMagmaSVD(full_matrices=op.full_matrices,
compute_uv=op.compute_uv)
if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32')).astype('float16')
return op
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
......
......@@ -232,7 +232,7 @@ class GpuIncSubtensor(IncSubtensor):
if not self.set_instead_of_inc:
# sub_x += y
iadd = get_iadd(node.inputs[0], node.inputs[1])
iadd(sub_x, y, broadcast=False)
iadd(sub_x, y)
else:
# sub_x[...] = y
x.__setitem__(cdata, y)
......@@ -403,6 +403,8 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
"""
AdvancedSubtensor1 on the GPU.
"""
_f16_ok = True
def make_node(self, x, ilist):
ctx_name = infer_context_name(x, ilist)
x_ = as_gpuarray_variable(x, ctx_name)
......@@ -807,7 +809,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
"""
ctx_name = infer_context_name(x, y, ilist)
x_ = as_gpuarray_variable(x, ctx_name)
y_ = as_gpuarray_variable(y, ctx_name)
y_ = as_gpuarray_variable(y.astype(x.dtype), ctx_name)
ilist_ = as_gpuarray_variable(ilist, ctx_name)
assert x_.type.ndim >= y_.type.ndim
......@@ -1088,6 +1090,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
class GpuExtractDiag(Op):
__props__ = ("offset", "axis1", "axis2", "view")
_f16_ok = True
def __init__(self, offset=0, axis1=0, axis2=1, view=False):
self.view = view
......
......@@ -8,7 +8,7 @@ import theano
from theano import config
from theano import tensor
from theano.tests import unittest_tools as utt
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22, batched_dot
from theano.tensor.blas import gemv, gemv_inplace, gemm_inplace, _dot22, batched_dot
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from .. import gpuarray_shared_constructor
......@@ -18,7 +18,7 @@ from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_no_inplace,
gpugemmbatch_no_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22)
GpuGer, GpuGemm, gpu_dot22)
GpuGemvTester = makeTester(
......@@ -42,6 +42,22 @@ GpuGemvTester = makeTester(
def test_float16():
# gemv (gemm called)
float16_data = [rand(3).astype('float16'),
np.asarray(1, dtype=np.float32),
rand(3, 3).astype('float16'),
rand(3).astype('float16'),
np.asarray(0.5, dtype=np.float32)]
float16_shared = [gpuarray_shared_constructor(val, target=test_ctx_name)
for val in float16_data]
o = gemv(*float16_shared)
f = theano.function([], o, mode=mode_with_gpu)
y, alpha, A, x, beta = float16_data
out = f()
utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)
topo = f.maker.fgraph.toposort()
assert any([isinstance(n.op, GpuGemm) for n in topo])
# gemm
float16_data = [rand(3, 3).astype('float16'),
np.asarray(1, dtype=np.float32),
......
......@@ -49,6 +49,33 @@ class G_subtensor(test_subtensor.T_subtensor):
assert self.sub == GpuSubtensor
class G_subtensorF16(test_subtensor.T_subtensor):
    """Run the generic subtensor test suite against the GPU ops with
    float16 data."""

    def shortDescription(self):
        # Return None so the test runner prints the test name rather
        # than a docstring.
        return None

    def __init__(self, name):
        def shared(x, **kwargs):
            # Shared constructor pinned to the test GPU context.
            return gpuarray_shared_constructor(x, target=test_ctx_name,
                                               **kwargs)
        test_subtensor.T_subtensor.__init__(
            self, name,
            shared=shared,
            sub=GpuSubtensor,
            inc_sub=GpuIncSubtensor,
            adv_sub1=GpuAdvancedSubtensor1,
            adv_incsub1=GpuAdvancedIncSubtensor1,
            dimshuffle=GpuDimShuffle,
            mode=mode_with_gpu,
            # avoid errors with limited devices
            dtype='float16',  # use floatX?
            ignore_topo=(HostFromGpu, GpuFromHost,
                         DeepCopyOp, GpuContiguous))
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuSubtensor
def test_advinc_subtensor1():
# Test the second case in the opt local_gpu_advanced_incsubtensor1
for shp in [(3, 3), (3, 3, 3)]:
......@@ -73,7 +100,9 @@ def test_advinc_subtensor1():
def test_advinc_subtensor1_dtype():
# Test the mixed dtype case
shp = (3, 4)
for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64')]:
for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64'),
('float16', 'int8'), ('float16', 'float64'),
('float16', 'float16')]:
shared = gpuarray_shared_constructor
xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
yval = np.empty((2,) + shp[1:], dtype=dtype2)
......@@ -95,7 +124,9 @@ def test_advinc_subtensor1_dtype():
def test_advinc_subtensor1_vector_scalar():
# Test the case where x is a vector and y a scalar
shp = (3,)
for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64')]:
for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64'),
('float16', 'int8'), ('float16', 'float64'),
('float16', 'float16')]:
shared = gpuarray_shared_constructor
xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
yval = np.asarray(10, dtype=dtype2)
......@@ -105,7 +136,8 @@ def test_advinc_subtensor1_vector_scalar():
name='y')
expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
f = theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op, GpuAdvancedIncSubtensor1_dev20)
assert sum([isinstance(node.op, (GpuAdvancedIncSubtensor1_dev20,
GpuAdvancedIncSubtensor1))
for node in f.maker.fgraph.toposort()]) == 1
rval = f(yval)
rep = xval.copy()
......@@ -169,7 +201,26 @@ class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor):
sub=GpuAdvancedSubtensor,
mode=mode_with_gpu,
# avoid errors with limited devices
dtype='float32',
dtype='float32', # floatX?
ignore_topo=(HostFromGpu, GpuFromHost,
DeepCopyOp))
# GPU opt can't run in fast_compile only.
self.fast_compile = False
assert self.sub == GpuAdvancedSubtensor
class G_advancedsubtensorF16(test_subtensor.TestAdvancedSubtensor):
def shortDescription(self):
return None
def __init__(self, name):
test_subtensor.TestAdvancedSubtensor.__init__(
self, name,
shared=gpuarray_shared_constructor,
sub=GpuAdvancedSubtensor,
mode=mode_with_gpu,
# avoid errors with limited devices
dtype='float16', # floatX?
ignore_topo=(HostFromGpu, GpuFromHost,
DeepCopyOp))
# GPU opt can't run in fast_compile only.
......@@ -218,6 +269,17 @@ class test_gpuextractdiag(unittest.TestCase):
GpuExtractDiag(offset, axis1, axis2)(x).eval({x: np_x}),
np_x.diagonal(offset, axis1, axis2))
def test_tensor_float16(self):
    # Check diagonal extraction on a float16 4-d tensor against numpy's
    # diagonal for a variety of offset/axis combinations (negative
    # offsets and swapped axis pairs included).
    x = tensor.tensor4()
    np_x = np.arange(30107).reshape(7, 11, 17, 23).astype('float16')
    for offset, axis1, axis2 in [
            (1, 0, 1), (-1, 0, 1), (0, 1, 0), (-2, 1, 0),
            (-3, 1, 0), (-2, 2, 0), (3, 3, 0), (-1, 3, 2),
            (2, 2, 3), (-1, 2, 1), (1, 3, 1), (-1, 1, 3)]:
        assert np.allclose(
            GpuExtractDiag(offset, axis1, axis2)(x).eval({x: np_x}),
            np_x.diagonal(offset, axis1, axis2))
class test_gpuallocdiag(unittest.TestCase):
def test_matrix(self):
......
......@@ -7,7 +7,7 @@ import numpy as np
import theano
from theano.compat import PY3
from theano import config
from theano.compile import DeepCopyOp
from theano.compile import DeepCopyOp, Rebroadcast, ViewOp
from theano.misc.pkl_utils import CompatUnpickler
# Disabled for now
......@@ -21,16 +21,45 @@ import pygpu
def test_deep_copy():
a = rand_gpuarray(20, dtype='float32')
g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
for dtype in ['float16', 'float32']:
a = rand_gpuarray(20, dtype=dtype)
g = GpuArrayType(dtype=dtype, broadcastable=(False,))('g')
f = theano.function([g], g)
assert isinstance(f.maker.fgraph.toposort()[0].op, DeepCopyOp)
res = f(a)
assert GpuArrayType.values_eq(res, a)
def test_view():
for dtype in ['float16', 'float32']:
a = rand_gpuarray(20, dtype=dtype)
g = GpuArrayType(dtype=dtype, broadcastable=(False,))('g')
f = theano.function([g], g)
f = theano.function([g], ViewOp()(g))
assert isinstance(f.maker.fgraph.toposort()[0].op, DeepCopyOp)
assert isinstance(f.maker.fgraph.toposort()[0].op, ViewOp)
res = f(a)
res = f(a)
assert GpuArrayType.values_eq(res, a)
assert GpuArrayType.values_eq(res, a)
def test_rebroadcast():
    # Rebroadcast must run on GPU data for float16 as well as float32.
    for dtype in ['float16', 'float32']:
        a = rand_gpuarray(1, dtype=dtype)
        g = GpuArrayType(dtype=dtype, broadcastable=(False,))('g')
        f = theano.function([g], Rebroadcast((0, True))(g))
        # The Rebroadcast node must remain in the compiled graph.
        assert isinstance(f.maker.fgraph.toposort()[0].op, Rebroadcast)
        res = f(a)
        assert GpuArrayType.values_eq(res, a)
def test_values_eq_approx():
......@@ -45,10 +74,11 @@ def test_values_eq_approx():
def test_specify_shape():
a = rand_gpuarray(20, dtype='float32')
g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
f = theano.function([g], theano.tensor.specify_shape(g, [20]))
f(a)
for dtype in ['float16', 'float32']:
a = rand_gpuarray(20, dtype=dtype)
g = GpuArrayType(dtype=dtype, broadcastable=(False,))('g')
f = theano.function([g], theano.tensor.specify_shape(g, [20]))
f(a)
def test_filter_float():
......
......@@ -1482,7 +1482,11 @@ class numeric_grad(object):
The tuple (abs_err, rel_err) is returned
"""
abs_err = abs(a - b)
rel_err = abs_err / np.maximum(abs(a) + abs(b), 1e-8)
# 1e-8 is to prevent division by zero.
# [] makes sure that if a and b are float16, 1e-8 doesn't get
# downcasted to float16, as that gives 0! That would reintroduce the
# division by zero.
rel_err = abs_err / np.maximum(abs(a) + abs(b), [1e-8])
# The numpy.asarray are needed as if a or b is a sparse matrix
# this would result in a numpy.matrix and not a numpy.ndarray
# and the behave differently causing problem later.
......
from __future__ import absolute_import, print_function, division
def inc_code():
    """Return the C support code for ``AdvancedIncSubtensor1``.

    For every numeric/complex numpy dtype in a fixed, portable list, the
    generated code defines an in-place add/set loop over a
    ``PyArrayMapIterObject`` (each kernel guarded by
    ``#if defined(NPY_...)``), builds dispatch tables mapping numpy type
    numbers to those kernels, and defines a C-level
    ``inplace_increment(a, index, inc, inc_or_set)`` function returning
    0 on success and -1 (with a Python exception set) on failure.

    Returns
    -------
    str
        C source code suitable for ``Op.c_support_code``.
    """
    # Dtypes supported by this op; guarded by #if defined for safety.
    types = ['npy_' + t for t in ['int8', 'int16', 'int32', 'int64',
                                  'uint8', 'uint16', 'uint32', 'uint64',
                                  'float16', 'float32', 'float64']]
    complex_types = ['npy_' + t for t in ['complex32', 'complex64',
                                          'complex128']]

    # One kernel per dtype: walk the map iterator and either accumulate
    # into (inc_or_set != 0) or overwrite (inc_or_set == 0) each selected
    # element.
    inplace_map_template = """
    #if defined(%(typen)s)
    static void %(type)s_inplace_add(PyArrayMapIterObject *mit,
                                     PyArrayIterObject *it, int inc_or_set)
    {
        int index = mit->size;
        while (index--) {
            %(op)s
            PyArray_MapIterNext(mit);
            PyArray_ITER_NEXT(it);
        }
    }
    #endif
    """

    # Scalar add/set for real dtypes is a single statement ...
    floatadd = ("((%(type)s*)mit->dataptr)[0] = "
                "(inc_or_set ? ((%(type)s*)mit->dataptr)[0] : 0)"
                " + ((%(type)s*)it->dataptr)[0];")
    # ... while complex dtypes update the real and imaginary parts
    # separately.
    complexadd = """
    ((%(type)s*)mit->dataptr)[0].real =
        (inc_or_set ? ((%(type)s*)mit->dataptr)[0].real : 0)
        + ((%(type)s*)it->dataptr)[0].real;
    ((%(type)s*)mit->dataptr)[0].imag =
        (inc_or_set ? ((%(type)s*)mit->dataptr)[0].imag : 0)
        + ((%(type)s*)it->dataptr)[0].imag;
    """

    fns = ''.join([inplace_map_template % {'type': t, 'typen': t.upper(),
                                           'op': floatadd % {'type': t}}
                   for t in types] +
                  [inplace_map_template % {'type': t, 'typen': t.upper(),
                                           'op': complexadd % {'type': t}}
                   for t in complex_types])

    def gen_binop(type, typen):
        # One guarded entry in the kernel dispatch table.
        return """
    #if defined(%(typen)s)
    %(type)s_inplace_add,
    #endif
    """ % dict(type=type, typen=typen)

    fn_array = ("static inplace_map_binop addition_funcs[] = {" +
                ''.join([gen_binop(type=t, typen=t.upper())
                         for t in types + complex_types]) + "NULL};\n")

    def gen_num(typen):
        # One guarded entry in the type-number dispatch table.  The
        # template only uses %(typen)s, so only that key is supplied
        # (the previous version also passed the builtin `type` by
        # accident).
        return """
    #if defined(%(typen)s)
    %(typen)s,
    #endif
    """ % dict(typen=typen)

    type_number_array = ("static int type_numbers[] = {" +
                         ''.join([gen_num(typen=t.upper())
                                  for t in types + complex_types]) + "-1000};")

    code = ("""
    typedef void (*inplace_map_binop)(PyArrayMapIterObject *,
                                      PyArrayIterObject *, int inc_or_set);
    """ + fns + fn_array + type_number_array + """
    static int
    map_increment(PyArrayMapIterObject *mit, PyArrayObject *op,
                  inplace_map_binop add_inplace, int inc_or_set)
    {
        PyArrayObject *arr = NULL;
        PyArrayIterObject *it;
        PyArray_Descr *descr;
        if (mit->ait == NULL) {
            return -1;
        }
        descr = PyArray_DESCR(mit->ait->ao);
        Py_INCREF(descr);
        arr = (PyArrayObject *)PyArray_FromAny((PyObject *)op, descr,
                    0, 0, NPY_ARRAY_FORCECAST, NULL);
        if (arr == NULL) {
            return -1;
        }
        if ((mit->subspace != NULL) && (mit->consec)) {
            PyArray_MapIterSwapAxes(mit, (PyArrayObject **)&arr, 0);
            if (arr == NULL) {
                return -1;
            }
        }
        it = (PyArrayIterObject*)
                PyArray_BroadcastToShape((PyObject*)arr, mit->dimensions, mit->nd);
        if (it == NULL) {
            Py_DECREF(arr);
            return -1;
        }
        (*add_inplace)(mit, it, inc_or_set);
        Py_DECREF(arr);
        Py_DECREF(it);
        return 0;
    }

    static int
    inplace_increment(PyArrayObject *a, PyObject *index, PyArrayObject *inc,
                      int inc_or_set)
    {
        inplace_map_binop add_inplace = NULL;
        int type_number = -1;
        int i = 0;
        PyArrayMapIterObject * mit;

        if (PyArray_FailUnlessWriteable(a, "input/output array") < 0) {
            return -1;
        }
        if (PyArray_NDIM(a) == 0) {
            PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed.");
            return -1;
        }
        type_number = PyArray_TYPE(a);
        while (type_numbers[i] >= 0 && addition_funcs[i] != NULL){
            if (type_number == type_numbers[i]) {
                add_inplace = addition_funcs[i];
                break;
            }
            i++ ;
        }
        if (add_inplace == NULL) {
            PyErr_SetString(PyExc_TypeError, "unsupported type for a");
            return -1;
        }
        mit = (PyArrayMapIterObject *) PyArray_MapIterArray(a, index);
        if (mit == NULL) {
            goto fail;
        }
        if (map_increment(mit, inc, add_inplace, inc_or_set) != 0) {
            goto fail;
        }
        Py_DECREF(mit);
        Py_INCREF(Py_None);
        return 0;
    fail:
        Py_XDECREF(mit);
        return -1;
    }
    """)
    return code
......@@ -22,9 +22,7 @@ from theano.tensor.elemwise import DimShuffle
from theano.tensor.type_other import NoneConst, SliceType, NoneTypeT, make_slice
from theano import config
if config.cxx:
import theano.gof.cutils # needed to import cutils_ext
from cutils_ext.cutils_ext import inplace_increment
from .inc_code import inc_code
_logger = logging.getLogger("theano.tensor.subtensor")
......@@ -1943,8 +1941,7 @@ class AdvancedIncSubtensor1(Op):
NPY_ARRAY_ENSURECOPY, NULL)""" % locals()
def c_support_code(self):
from theano.gof.cutils import compile_cutils_code
return compile_cutils_code()
return inc_code()
def c_code(self, node, name, input_names, output_names, sub):
numpy_ver = [int(n) for n in np.__version__.split('.')[:2]]
......@@ -1976,17 +1973,14 @@ class AdvancedIncSubtensor1(Op):
Py_XDECREF(%(out)s);
%(out)s = %(copy_of_x)s;
}
PyObject *arglist = Py_BuildValue("OOOi",%(out)s, %(idx)s, %(y)s, %(inc_or_set)d);
rval = inplace_increment(NULL, arglist);
Py_XDECREF(arglist);
if (rval == NULL) {
if (inplace_increment(%(out)s, (PyObject *)%(idx)s, %(y)s, %(inc_or_set)d)) {
%(fail)s;
}
Py_XDECREF(rval);
""" % locals()
def c_code_cache_version(self):
return (3,)
return (4,)
def perform(self, node, inp, out_):
# TODO opt to make this inplace
......@@ -2001,36 +1995,10 @@ class AdvancedIncSubtensor1(Op):
if self.set_instead_of_inc:
x[idx] = y
else:
if config.cxx and node.inputs[0].dtype != 'float16':
increment = inplace_increment
else:
increment = self.inplace_increment1d_slow
increment(x, idx, y)
np.add.at(x, idx, y)
out[0] = x
def inplace_increment1d_slow(self, x, idx, y):
# If `y` has as many dimensions as `x`, then we want to iterate
# jointly on `x` and `y`. Otherwise, it means `y` should be
# broadcasted to fill all relevant rows of `x`.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
if len(y) == 1:
# Allow broadcasting of y[0]
y_0 = y[0]
for i in idx:
x[i] += y_0
else:
assert len(y) == len(idx)
j = 0
for i in idx:
x[i] += y[j]
j += 1
else:
for i in idx:
x[i] += y
def infer_shape(self, node, ishapes):
x, y, ilist = ishapes
return [x]
......@@ -2246,14 +2214,8 @@ class AdvancedIncSubtensor(Op):
if self.set_instead_of_inc:
out[0][inputs[2:]] = inputs[1]
elif config.cxx:
inplace_increment(out[0], tuple(inputs[2:]), inputs[1])
else:
raise NotImplementedError(
'Could not import inplace_increment, so advanced '
'indexing is disabled. '
'Please make sure that you have a working C++ compiler '
'and that config.cxx is correctly set.')
np.add.at(out[0], tuple(inputs[2:]), inputs[1])
def infer_shape(self, node, ishapes):
return [ishapes[0]]
......
......@@ -2936,10 +2936,7 @@ def test_local_IncSubtensor_serialize():
i = T.vector('i', dtype='int64')
j = T.vector('j', dtype='int64')
t = T.scalar('t')
if theano.tensor.subtensor.inplace_increment:
y = (W[i] + W[j] + W[1] + W[i, j]).sum()
else:
y = (W[i] + W[j] + W[1]).sum()
y = (W[i] + W[j] + W[1] + W[i, j]).sum()
cost = T.sqr(t - y)
dW = theano.grad(cost, W)
mode = theano.compile.mode.get_default_mode().excluding('fusion')
......
......@@ -5,7 +5,6 @@ import sys
import unittest
import numpy as np
from nose.plugins.skip import SkipTest
from nose.tools import assert_equal
from numpy.testing import assert_array_equal
from six import StringIO
......@@ -524,10 +523,11 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
gn = theano.grad(t.sum(), n)
g = self.function([], gn, op=self.adv_incsub1)
utt.verify_grad(lambda m: m[[1, 3]],
[np.random.rand(5, 5).astype(self.dtype)])
[np.random.rand(5, 5).astype(self.dtype)],
mode=self.mode)
g()
utt.verify_grad(lambda m: m[idx],
[data])
[data], mode=self.mode)
def test_noncontiguous_idx(self):
data = rand(4, 2, 3)
......@@ -597,17 +597,20 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(np.allclose(g_00, 2))
utt.verify_grad(lambda m: m[[1, 3]],
[np.random.rand(5, 5).astype(self.dtype)])
[np.random.rand(5, 5).astype(self.dtype)],
mode=self.mode)
def fun(x, y):
return advanced_inc_subtensor1(x, y, [1, 3])
utt.verify_grad(fun, [np.random.rand(5, 5).astype(self.dtype),
np.random.rand(2, 5).astype(self.dtype)])
np.random.rand(2, 5).astype(self.dtype)],
mode=self.mode)
def fun(x, y):
return advanced_set_subtensor1(x, y, [1, 3])
utt.verify_grad(fun, [np.random.rand(5, 5).astype(self.dtype),
np.random.rand(2, 5).astype(self.dtype)])
np.random.rand(2, 5).astype(self.dtype)],
mode=self.mode)
# test set_subtensor broadcast
self.dtype = 'float32'
......@@ -872,12 +875,12 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
def fct(t):
return theano.tensor.sum(t[idx_])
utt.verify_grad(fct, [data])
utt.verify_grad(fct, [data], mode=self.mode)
# Test the grad of the grad (e.i. AdvancedIncSubtensor1.grad)
def fct2(t):
return theano.tensor.grad(theano.tensor.sum(t[idx_]), t)
utt.verify_grad(fct2, [data])
utt.verify_grad(fct2, [data], mode=self.mode)
# Test shape of AdvancedIncSubtensor1 and AdvancedSubtensor1
if not self.fast_compile:
......@@ -958,7 +961,8 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
# vector
utt.verify_grad(
inc_slice(slice(2, 4, None)),
(np.asarray([0, 1, 2, 3, 4, 5.]), np.asarray([9, 9.]),))
(np.asarray([0, 1, 2, 3, 4, 5.]), np.asarray([9, 9.]),),
mode=self.mode)
# matrix
utt.verify_grad(
......@@ -1498,9 +1502,6 @@ class TestAdvancedSubtensor(unittest.TestCase):
utt.assert_allclose(rval, aval)
def test_inc_adv_subtensor_w_2vec(self):
if not config.cxx:
raise SkipTest('config.cxx empty')
subt = self.m[self.ix1, self.ix12]
a = inc_subtensor(subt, subt)
......@@ -1519,9 +1520,6 @@ class TestAdvancedSubtensor(unittest.TestCase):
[.5, .3 * 2, .15]]), aval
def test_inc_adv_subtensor_with_broadcasting(self):
if not config.cxx:
raise SkipTest('config.cxx empty')
inc = dscalar()
a = inc_subtensor(self.m[self.ix1, self.ix12], inc)
g_inc = tensor.grad(a.sum(), inc)
......@@ -1542,9 +1540,6 @@ class TestAdvancedSubtensor(unittest.TestCase):
assert np.allclose(gval, 3.0), gval
def test_inc_adv_subtensor1_with_broadcasting(self):
if not config.cxx:
raise SkipTest('config.cxx empty')
inc = dscalar()
a = inc_subtensor(self.m[self.ix1], inc)
g_inc = tensor.grad(a.sum(), inc)
......@@ -1564,9 +1559,6 @@ class TestAdvancedSubtensor(unittest.TestCase):
assert np.allclose(gval, 9.0), gval
def test_inc_adv_subtensor_with_index_broadcasting(self):
if not config.cxx:
raise SkipTest('config.cxx empty')
a = inc_subtensor(self.m[self.ix1, self.ix2], 2.1)
assert a.type == self.m.type, (a.type, self.m.type)
......@@ -1640,17 +1632,20 @@ class TestAdvancedSubtensor(unittest.TestCase):
self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor))
utt.verify_grad(lambda m: m[[1, 3], [2, 4]],
[np.random.rand(5, 5).astype(self.dtype)])
[np.random.rand(5, 5).astype(self.dtype)],
mode=self.mode)
def fun(x, y):
return advanced_inc_subtensor(x, y, [1, 3], [2, 4])
utt.verify_grad(fun, [np.random.rand(5, 5).astype(self.dtype),
np.random.rand(2).astype(self.dtype)])
np.random.rand(2).astype(self.dtype)],
mode=self.mode)
def fun(x, y):
return advanced_set_subtensor(x, y, [1, 3], [2, 4])
utt.verify_grad(fun, [np.random.rand(5, 5).astype(self.dtype),
np.random.rand(2).astype(self.dtype)])
np.random.rand(2).astype(self.dtype)],
mode=self.mode)
class TestInferShape(utt.InferShapeTester):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论