提交 2f0ab791 authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #4763 from abergeron/gpuadvsub

Gpuadvsub
...@@ -52,6 +52,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx, ...@@ -52,6 +52,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY, gpu_ca_reduce_cuda) GpuCAReduceCPY, gpu_ca_reduce_cuda)
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
...@@ -971,10 +972,17 @@ def local_gpua_inc_subtensor(op, context_name, inputs, outputs): ...@@ -971,10 +972,17 @@ def local_gpua_inc_subtensor(op, context_name, inputs, outputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1]) @op_lifter([tensor.AdvancedSubtensor1])
@register_opt2([tensor.AdvancedSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedSubtensor1], 'fast_compile')
def local_gpua_advanced_subtensor(op, context_name, inputs, outputs): def local_gpua_advanced_subtensor1(op, context_name, inputs, outputs):
return GpuAdvancedSubtensor1() return GpuAdvancedSubtensor1()
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor])
@register_opt2([tensor.AdvancedSubtensor], 'fast_compile')
def local_gpua_advanced_subtensor(op, context_name, inputs, outputs):
    # Lift the host AdvancedSubtensor op to its GPU counterpart.
    # The lifter machinery moves the inputs to the GPU context; we only
    # need to return the replacement op instance.
    return GpuAdvancedSubtensor()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
......
...@@ -472,6 +472,107 @@ if (err != GA_NO_ERROR) { ...@@ -472,6 +472,107 @@ if (err != GA_NO_ERROR) {
return (0,) return (0,)
class GpuAdvancedSubtensor(HideC, tensor.AdvancedSubtensor):
    """
    AdvancedSubtensor on the GPU.

    Reuses the host op's shape/broadcast inference (via
    ``tensor.AdvancedSubtensor.make_node``) but produces a
    ``GpuArrayType`` output and performs the indexing on the device
    with ``GpuArray.take1`` after flattening the array-indexed
    dimensions.
    """

    def make_node(self, x, *inputs):
        ctx_name = infer_context_name(x)
        # Let the CPU op compute the output dtype/broadcastable pattern,
        # then rebuild the Apply node with a GPU output type and a
        # GPU-resident `x`.  The index inputs are reused unchanged.
        rval = tensor.AdvancedSubtensor.make_node(self, x, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
                             broadcastable=rval.outputs[0].type.broadcastable,
                             context_name=ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])

    def perform(self, node, inputs, out_):
        # `x` is a device array (it exposes .take1 and .context below);
        # the remaining inputs are the index objects (numpy arrays,
        # scalars, slices or None).
        out, = out_
        x = inputs[0]
        idx = inputs[1:]

        # Replace each None (newaxis) with a full slice and insert the
        # corresponding broadcast dimension of size 1 into the shape.
        nidx = []
        nshp = list(x.shape)
        for k, i in enumerate(idx):
            if i is None:
                nidx.append(slice(None))
                nshp.insert(k, 1)
            else:
                nidx.append(i)

        x = x.reshape(nshp)

        # Move all array (fancy) indices to the front, mirroring numpy's
        # rule that separated advanced indices produce their result
        # dimensions first.  `p` counts the array indices moved so far.
        narrays = 0
        transp = list(range(x.ndim))
        p = 0
        # ap gives the position of the array in case there is only one.
        # if there are more than one (narray > 1) it should be ignored.
        ap = 0
        for k, i in enumerate(list(nidx)):
            if (isinstance(i, numpy.ndarray) and
                    i.ndim != 0):
                # NOTE(review): nidx is mutated (pop/insert) while k
                # still refers to positions in the pre-loop copy; this
                # appears intentional but the invariant is subtle —
                # confirm against the upstream tests before touching.
                transp.remove(k)
                transp.insert(p, k)
                ap += k
                i = nidx.pop(k)
                nidx.insert(p, i)
                p += 1
                narrays += 1
            else:
                if narrays == 0:
                    try:
                        # Only integer-like scalars pass __index__();
                        # slices raise and fall through to `pass`.
                        i.__index__()
                        # We shift back the position of the array by the
                        # number of dimensions that are removed by
                        # indexing. If ap is bigger than 0 it means we
                        # have encountered at least one array.
                        if ap >= 0:
                            ap -= 1
                        # If this index is before the first array then
                        # we will not move the array back to its
                        # position. Mark this by faking that there
                        # are more than two arrays. This is crazy
                        # numpy behaviour so blame them.
                        narrays = 2
                    except Exception:
                        pass

        # Apply the gathered permutation, then resolve all non-array
        # indices (scalars/slices) with a plain __getitem__; the first
        # `p` dimensions (the array-indexed ones) are kept whole.
        x = x.transpose(*transp)
        idx_ = ([slice(None)] * p + nidx[p:])
        x = x.__getitem__(idx_)

        # flatten the array-indexed dimensions
        shape = ((numpy.prod(x.shape[0: p]),) +
                 x.shape[p:])
        input_flat = x.reshape(shape)

        # Build C-order strides over the first `p` dims so that the
        # broadcast sum of (index * stride) yields flat positions.
        strides = [1]
        for i in range(p - 1, 0, -1):
            stride = x.shape[i] * strides[-1]
            strides.insert(0, stride)

        # Combine the index arrays (they broadcast against each other,
        # as in numpy) into one flat index, then gather on the device.
        take_idx = sum((i * s for i, s in zip(nidx, strides)))
        out_flat = input_flat.take1(pygpu.asarray(take_idx.flatten(),
                                                  context=x.context))

        # Restore the broadcast index shape followed by the untouched
        # trailing dimensions.
        out_flat_shp = take_idx.shape + x.shape[p:]
        o = out_flat.reshape(out_flat_shp)

        # If there was only one array we need to move the indexed
        # dimension(s) back to the position of the array, which is
        # stored in ap. Note that ap is invalid if narrays != 1.
        if narrays == 1:
            ntransp = list(range(take_idx.ndim, o.ndim))
            ntransp[ap:ap] = list(range(take_idx.ndim))
            o = o.transpose(*ntransp)

        out[0] = o
class GpuAdvancedIncSubtensor1(Op): class GpuAdvancedIncSubtensor1(Op):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
......
...@@ -10,6 +10,7 @@ from ..basic_ops import HostFromGpu, GpuFromHost ...@@ -10,6 +10,7 @@ from ..basic_ops import HostFromGpu, GpuFromHost
from ..elemwise import GpuDimShuffle from ..elemwise import GpuDimShuffle
from ..subtensor import (GpuIncSubtensor, GpuSubtensor, from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedSubtensor,
GpuAdvancedIncSubtensor1) GpuAdvancedIncSubtensor1)
from ..type import gpuarray_shared_constructor from ..type import gpuarray_shared_constructor
...@@ -40,7 +41,7 @@ class G_subtensor(test_subtensor.T_subtensor): ...@@ -40,7 +41,7 @@ class G_subtensor(test_subtensor.T_subtensor):
def test_advinc_subtensor1(): def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """ # Test the second case in the opt local_gpu_advanced_incsubtensor1
for shp in [(3, 3), (3, 3, 3)]: for shp in [(3, 3), (3, 3, 3)]:
shared = gpuarray_shared_constructor shared = gpuarray_shared_constructor
xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1 xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
...@@ -87,3 +88,41 @@ def test_incsub_f16(): ...@@ -87,3 +88,41 @@ def test_incsub_f16():
rep = xval.copy() rep = xval.copy()
rep[1:] += yval rep[1:] += yval
assert numpy.allclose(rval, rep) assert numpy.allclose(rval, rep)
class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor):
    """Run the AdvancedSubtensor test suite against the GPU back-end."""

    def shortDescription(self):
        # Return None so the runner prints the test name instead of
        # the first line of the docstring.
        return None

    def __init__(self, name):
        super(G_advancedsubtensor, self).__init__(
            name,
            shared=gpuarray_shared_constructor,
            sub=GpuAdvancedSubtensor,
            mode=mode_with_gpu,
            # avoid errors with limited devices
            dtype='float32',
            ignore_topo=(HostFromGpu, GpuFromHost,
                         DeepCopyOp))
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuAdvancedSubtensor
def test_adv_subtensor():
    # Advanced indexing mixing vectors, newaxis and a slice must be
    # lifted to exactly one GpuAdvancedSubtensor node, and the result
    # must agree with numpy fancy indexing.
    shape = (2, 3, 4)
    data = numpy.arange(numpy.prod(shape),
                        dtype=theano.config.floatX).reshape(shape)
    idx1, idx2 = tensor.ivectors('idx1', 'idx2')
    index_list = [idx1, None, slice(0, 2, 1), idx2, None]
    x = gpuarray_shared_constructor(data, name='x')
    f = theano.function([idx1, idx2], x[index_list], mode=mode_with_gpu)
    n_gpu_nodes = sum(isinstance(node.op, GpuAdvancedSubtensor)
                      for node in f.maker.fgraph.toposort())
    assert n_gpu_nodes == 1
    i1_val = [0, 1]
    i2_val = [0, 1]
    result = f(i1_val, i2_val)
    expected = data[i1_val, None, slice(0, 2, 1), i2_val, None]
    assert numpy.allclose(result, expected)
...@@ -1009,23 +1009,20 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor): ...@@ -1009,23 +1009,20 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
def shortDescription(self): def shortDescription(self):
return None return None
shared = staticmethod(cuda.shared_constructor)
sub = cuda.GpuSubtensor
inc_sub = cuda.GpuIncSubtensor
adv_sub1 = cuda.GpuAdvancedSubtensor1
adv_incsub1 = cuda.GpuAdvancedIncSubtensor1
dimshuffle = cuda.GpuDimShuffle
mode = mode_with_gpu
dtype = 'float32'
type = tcn.CudaNdarrayType
ignore_topo = (B.HostFromGpu, B.GpuFromHost, theano.compile.DeepCopyOp)
fast_compile = False
ops = (cuda.GpuSubtensor, cuda.GpuIncSubtensor,
cuda.GpuAdvancedSubtensor1, cuda.GpuAdvancedIncSubtensor1)
def __init__(self, name): def __init__(self, name):
return super(theano.tensor.tests.test_subtensor.T_subtensor, super(T_subtensor, self).__init__(
self).__init__(name) name,
shared=cuda.shared_constructor,
sub=cuda.GpuSubtensor,
inc_sub=cuda.GpuIncSubtensor,
adv_sub1=cuda.GpuAdvancedSubtensor1,
adv_incsub1=cuda.GpuAdvancedIncSubtensor1,
dimshuffle=cuda.GpuDimShuffle,
mode=mode_with_gpu,
dtype='float32',
type=tcn.CudaNdarrayType,
ignore_topo=(B.HostFromGpu, B.GpuFromHost, theano.compile.DeepCopyOp))
self.fast_compile = False
def test_adv_sub1_fast(self): def test_adv_sub1_fast(self):
"""We check that the special cases of advanced indexing that """We check that the special cases of advanced indexing that
......
...@@ -20,7 +20,7 @@ from theano.tensor.basic import alloc ...@@ -20,7 +20,7 @@ from theano.tensor.basic import alloc
from theano.tensor.basic import (addbroadcast, clip, get_scalar_constant_value, from theano.tensor.basic import (addbroadcast, clip, get_scalar_constant_value,
ARange, TensorType, NotScalarConstantError) ARange, TensorType, NotScalarConstantError)
from theano.tensor.elemwise import DimShuffle from theano.tensor.elemwise import DimShuffle
from theano.tensor.type_other import NoneConst, SliceType, make_slice from theano.tensor.type_other import NoneConst, SliceType, NoneTypeT, make_slice
from theano import config from theano import config
inplace_increment = None inplace_increment = None
...@@ -2077,6 +2077,8 @@ def as_index_variable(idx): ...@@ -2077,6 +2077,8 @@ def as_index_variable(idx):
return make_slice(idx) return make_slice(idx)
if isinstance(idx, gof.Variable) and isinstance(idx.type, SliceType): if isinstance(idx, gof.Variable) and isinstance(idx.type, SliceType):
return idx return idx
if isinstance(idx, gof.Variable) and isinstance(idx.type, NoneTypeT):
return idx
idx = theano.tensor.as_tensor_variable(idx) idx = theano.tensor.as_tensor_variable(idx)
if idx.type.dtype[:3] not in ('int', 'uin'): if idx.type.dtype[:3] not in ('int', 'uin'):
raise TypeError('index must be integers') raise TypeError('index must be integers')
...@@ -2165,17 +2167,8 @@ class AdvancedSubtensor(Op): ...@@ -2165,17 +2167,8 @@ class AdvancedSubtensor(Op):
# TODO: in general, we need to re-pack the inputs into a valid # TODO: in general, we need to re-pack the inputs into a valid
# index, just like subtensor # index, just like subtensor
out[0] = inputs[0].__getitem__(inputs[1:]) out[0] = inputs[0].__getitem__(inputs[1:])
if (numpy.__version__ <= '1.6.1' and
out[0].size != numpy.uint32(out[0].size)):
warnings.warn(
'Numpy versions 1.6.1 and below have a bug preventing '
'advanced indexing from correctly filling arrays that '
'are too big (>= 2^32 elements). It is possible that '
'out[0] (%s), with shape %s, is not correctly filled.'
% (out[0], out[0].shape))
def connection_pattern(self, node): def connection_pattern(self, node):
rval = [[True]] rval = [[True]]
for ipt in node.inputs[1:]: for ipt in node.inputs[1:]:
......
...@@ -6692,14 +6692,11 @@ class test_arithmetic_cast(unittest.TestCase): ...@@ -6692,14 +6692,11 @@ class test_arithmetic_cast(unittest.TestCase):
config.int_division == 'floatX'): config.int_division == 'floatX'):
assert theano_dtype == config.floatX assert theano_dtype == config.floatX
continue continue
numpy_version = [int(v) for v in
numpy.__version__.split('.')[:2]]
if (cfg == 'numpy+floatX' and if (cfg == 'numpy+floatX' and
a_type == 'complex128' and a_type == 'complex128' and
(b_type == 'float32' or (b_type == 'float32' or
b_type == 'float16') and b_type == 'float16') and
combo == ('scalar', 'array') and combo == ('scalar', 'array') and
bool(numpy_version >= [1, 6]) and
theano_dtype == 'complex128' and theano_dtype == 'complex128' and
numpy_dtype == 'complex64'): numpy_dtype == 'complex64'):
# In numpy 1.6.x adding a complex128 with # In numpy 1.6.x adding a complex128 with
...@@ -6707,7 +6704,7 @@ class test_arithmetic_cast(unittest.TestCase): ...@@ -6707,7 +6704,7 @@ class test_arithmetic_cast(unittest.TestCase):
# of 1.9.2. this is still the case so it is # of 1.9.2. this is still the case so it is
# probably by design # probably by design
raise SkipTest("Known issue with" raise SkipTest("Known issue with"
"numpy >= 1.6.x see #761") "numpy see #761")
# In any other situation: something wrong is # In any other situation: something wrong is
# going on! # going on!
assert False assert False
......
...@@ -20,8 +20,8 @@ from theano.compile import DeepCopyOp ...@@ -20,8 +20,8 @@ from theano.compile import DeepCopyOp
from theano.tensor import (MakeSlice, NotScalarConstantError, _shared, from theano.tensor import (MakeSlice, NotScalarConstantError, _shared,
as_tensor_variable, cscalar, ctensor3, dmatrix, as_tensor_variable, cscalar, ctensor3, dmatrix,
dscalar, dtensor4, dvector, fmatrix, fscalar, dscalar, dtensor4, dvector, fmatrix, fscalar,
fvector, iscalar, lmatrix, lrow, lvector, matrix, fvector, ftensor4, iscalar, lmatrix, lrow, lvector,
vector) matrix, vector)
from theano.tensor.basic import DimShuffle from theano.tensor.basic import DimShuffle
from theano.tensor.subtensor import (AdvancedIncSubtensor, from theano.tensor.subtensor import (AdvancedIncSubtensor,
AdvancedIncSubtensor1, AdvancedSubtensor, AdvancedIncSubtensor1, AdvancedSubtensor,
...@@ -55,6 +55,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -55,6 +55,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
inc_sub=tensor.IncSubtensor, inc_sub=tensor.IncSubtensor,
adv_sub1=tensor.AdvancedSubtensor1, adv_sub1=tensor.AdvancedSubtensor1,
adv_incsub1=tensor.AdvancedIncSubtensor1, adv_incsub1=tensor.AdvancedIncSubtensor1,
adv_sub=tensor.AdvancedSubtensor,
mode=None, mode=None,
dtype=theano.config.floatX, dtype=theano.config.floatX,
type=tensor.TensorType, type=tensor.TensorType,
...@@ -65,6 +66,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -65,6 +66,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.inc_sub = inc_sub self.inc_sub = inc_sub
self.adv_sub1 = adv_sub1 self.adv_sub1 = adv_sub1
self.adv_incsub1 = adv_incsub1 self.adv_incsub1 = adv_incsub1
self.adv_sub = adv_sub
self.dimshuffle = dimshuffle self.dimshuffle = dimshuffle
if mode is None: if mode is None:
mode = theano.compile.mode.get_default_mode() mode = theano.compile.mode.get_default_mode()
...@@ -354,13 +356,9 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -354,13 +356,9 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
(3, DimShuffle, self.dimshuffle, (3, DimShuffle, self.dimshuffle,
numpy.index_exp[..., [0, 2, 3]]), numpy.index_exp[..., [0, 2, 3]]),
(1, DimShuffle, self.dimshuffle, (1, DimShuffle, self.dimshuffle,
numpy.index_exp[numpy.newaxis, ...])] numpy.index_exp[numpy.newaxis, ...]),
# The following test case is not supported by numpy before 1.9 (1, AdvancedSubtensor, self.adv_sub,
numpy_version = [int(v) for v in numpy.version.version.split('.')[0:2]] numpy.index_exp[..., numpy.newaxis, [1, 2]])]
if numpy_version >= [1, 9]:
test_cases.append(
(1, AdvancedSubtensor, AdvancedSubtensor,
numpy.index_exp[..., numpy.newaxis, [1, 2]]))
for length, op_type, op_type_opt, slice_ in test_cases: for length, op_type, op_type_opt, slice_ in test_cases:
numpy_tval = numpy_n[slice_] numpy_tval = numpy_n[slice_]
...@@ -1351,6 +1349,7 @@ class TestAdvancedSubtensor(unittest.TestCase): ...@@ -1351,6 +1349,7 @@ class TestAdvancedSubtensor(unittest.TestCase):
self.v = fvector() self.v = fvector()
self.m = dmatrix() self.m = dmatrix()
self.t = ctensor3() self.t = ctensor3()
self.ft4 = ftensor4()
self.ix1 = lvector() # advanced 1d query self.ix1 = lvector() # advanced 1d query
self.ix12 = lvector() self.ix12 = lvector()
...@@ -1421,11 +1420,57 @@ class TestAdvancedSubtensor(unittest.TestCase): ...@@ -1421,11 +1420,57 @@ class TestAdvancedSubtensor(unittest.TestCase):
a = inc_subtensor(subt, subt) a = inc_subtensor(subt, subt)
assert a.type == self.v.type, (a.type, self.v.type) assert a.type == self.v.type, (a.type, self.v.type)
f = theano.function([self.v, self.ix2], a, allow_input_downcast=True) f = theano.function([self.v, self.ix2], a, allow_input_downcast=True,
mode=self.mode)
aval = f([.4, .9, .1], [[1, 2], aval = f([.4, .9, .1], [[1, 2],
[1, 2]]) [1, 2]])
assert numpy.allclose(aval, [.4, .9 * 3, .1 * 3]) assert numpy.allclose(aval, [.4, .9 * 3, .1 * 3])
def test_adv_subtensor_w_int_and_matrix(self):
    # A scalar index combined with a matrix of indices must match
    # numpy's fancy indexing.
    expr = self.ft4[0, :, self.ix2, :]
    fn = theano.function([self.ft4, self.ix2], expr, mode=self.mode)
    data = numpy.random.random((2, 3, 4, 5)).astype('float32')
    indices = numpy.asarray([[0, 1], [1, 0]])
    got = fn(data, indices)
    expected = data[0, :, indices, :]
    utt.assert_allclose(expected, got)
def test_adv_subtensor_w_none_and_matrix(self):
    # A newaxis (None) before a matrix index must match numpy's
    # fancy indexing.
    expr = self.ft4[:, None, :, self.ix2, :]
    fn = theano.function([self.ft4, self.ix2], expr, mode=self.mode)
    data = numpy.random.random((2, 3, 4, 5)).astype('float32')
    indices = numpy.asarray([[0, 1], [1, 0]])
    got = fn(data, indices)
    expected = data[:, None, :, indices, :]
    utt.assert_allclose(expected, got)
def test_adv_subtensor_w_slice_and_matrix(self):
    # A non-trivial slice before a matrix index must match numpy's
    # fancy indexing.
    expr = self.ft4[:, 0:1, self.ix2, :]
    fn = theano.function([self.ft4, self.ix2], expr, mode=self.mode)
    data = numpy.random.random((2, 3, 4, 5)).astype('float32')
    indices = numpy.asarray([[0, 1], [1, 0]])
    got = fn(data, indices)
    expected = data[:, 0:1, indices, :]
    utt.assert_allclose(expected, got)
def test_adv_subtensor_w_matrix_and_int(self):
    # A matrix index followed by a scalar index must match numpy's
    # fancy indexing.
    expr = self.ft4[:, :, self.ix2, 0]
    fn = theano.function([self.ft4, self.ix2], expr, mode=self.mode)
    data = numpy.random.random((2, 3, 4, 5)).astype('float32')
    indices = numpy.asarray([[0, 1], [1, 0]])
    got = fn(data, indices)
    expected = data[:, :, indices, 0]
    utt.assert_allclose(expected, got)
def test_adv_subtensor_w_matrix_and_none(self):
    # A matrix index followed by a newaxis (None) must match numpy's
    # fancy indexing.
    expr = self.ft4[:, :, self.ix2, None, :]
    fn = theano.function([self.ft4, self.ix2], expr, mode=self.mode)
    data = numpy.random.random((2, 3, 4, 5)).astype('float32')
    indices = numpy.asarray([[0, 1], [1, 0]])
    got = fn(data, indices)
    expected = data[:, :, indices, None, :]
    utt.assert_allclose(expected, got)
def test_inc_adv_subtensor_w_2vec(self): def test_inc_adv_subtensor_w_2vec(self):
if inplace_increment is None: if inplace_increment is None:
raise inplace_increment_missing raise inplace_increment_missing
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论