提交 2f0ab791 authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #4763 from abergeron/gpuadvsub

Gpuadvsub
...@@ -52,6 +52,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx, ...@@ -52,6 +52,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY, gpu_ca_reduce_cuda) GpuCAReduceCPY, gpu_ca_reduce_cuda)
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
...@@ -971,10 +972,17 @@ def local_gpua_inc_subtensor(op, context_name, inputs, outputs): ...@@ -971,10 +972,17 @@ def local_gpua_inc_subtensor(op, context_name, inputs, outputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1]) @op_lifter([tensor.AdvancedSubtensor1])
@register_opt2([tensor.AdvancedSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedSubtensor1], 'fast_compile')
def local_gpua_advanced_subtensor(op, context_name, inputs, outputs): def local_gpua_advanced_subtensor1(op, context_name, inputs, outputs):
return GpuAdvancedSubtensor1() return GpuAdvancedSubtensor1()
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor])
@register_opt2([tensor.AdvancedSubtensor], 'fast_compile')
def local_gpua_advanced_subtensor(op, context_name, inputs, outputs):
    # Lift the host AdvancedSubtensor op to its GPU counterpart.
    # The lifter machinery moves the inputs to the GPU context; we only
    # need to return the replacement op instance.
    return GpuAdvancedSubtensor()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
......
...@@ -472,6 +472,107 @@ if (err != GA_NO_ERROR) { ...@@ -472,6 +472,107 @@ if (err != GA_NO_ERROR) {
return (0,) return (0,)
class GpuAdvancedSubtensor(HideC, tensor.AdvancedSubtensor):
    """
    AdvancedSubtensor on the GPU.

    Reuses the host op's shape/broadcast inference (via
    ``tensor.AdvancedSubtensor.make_node``) but produces a
    ``GpuArrayType`` output and performs the indexing on the device
    with ``GpuArray.take1`` after flattening the array-indexed
    dimensions.
    """

    def make_node(self, x, *inputs):
        ctx_name = infer_context_name(x)
        # Let the CPU op compute the output dtype/broadcastable pattern,
        # then rebuild the Apply node with a GPU output type and a
        # GPU-resident `x`.  The index inputs are reused unchanged.
        rval = tensor.AdvancedSubtensor.make_node(self, x, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
                             broadcastable=rval.outputs[0].type.broadcastable,
                             context_name=ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])

    def perform(self, node, inputs, out_):
        # `x` is a device array (it exposes .take1 and .context below);
        # the remaining inputs are the index objects (numpy arrays,
        # scalars, slices or None).
        out, = out_
        x = inputs[0]
        idx = inputs[1:]

        # Replace each None (newaxis) with a full slice and insert the
        # corresponding broadcast dimension of size 1 into the shape.
        nidx = []
        nshp = list(x.shape)
        for k, i in enumerate(idx):
            if i is None:
                nidx.append(slice(None))
                nshp.insert(k, 1)
            else:
                nidx.append(i)

        x = x.reshape(nshp)

        # Move all array (fancy) indices to the front, mirroring numpy's
        # rule that separated advanced indices produce their result
        # dimensions first.  `p` counts the array indices moved so far.
        narrays = 0
        transp = list(range(x.ndim))
        p = 0
        # ap gives the position of the array in case there is only one.
        # if there are more than one (narray > 1) it should be ignored.
        ap = 0
        for k, i in enumerate(list(nidx)):
            if (isinstance(i, numpy.ndarray) and
                    i.ndim != 0):
                # NOTE(review): nidx is mutated (pop/insert) while k
                # still refers to positions in the pre-loop copy; this
                # appears intentional but the invariant is subtle —
                # confirm against the upstream tests before touching.
                transp.remove(k)
                transp.insert(p, k)
                ap += k
                i = nidx.pop(k)
                nidx.insert(p, i)
                p += 1
                narrays += 1
            else:
                if narrays == 0:
                    try:
                        # Only integer-like scalars pass __index__();
                        # slices raise and fall through to `pass`.
                        i.__index__()
                        # We shift back the position of the array by the
                        # number of dimensions that are removed by
                        # indexing. If ap is bigger than 0 it means we
                        # have encountered at least one array.
                        if ap >= 0:
                            ap -= 1
                        # If this index is before the first array then
                        # we will not move the array back to its
                        # position. Mark this by faking that there
                        # are more than two arrays. This is crazy
                        # numpy behaviour so blame them.
                        narrays = 2
                    except Exception:
                        pass

        # Apply the gathered permutation, then resolve all non-array
        # indices (scalars/slices) with a plain __getitem__; the first
        # `p` dimensions (the array-indexed ones) are kept whole.
        x = x.transpose(*transp)
        idx_ = ([slice(None)] * p + nidx[p:])
        x = x.__getitem__(idx_)

        # flatten the array-indexed dimensions
        shape = ((numpy.prod(x.shape[0: p]),) +
                 x.shape[p:])
        input_flat = x.reshape(shape)

        # Build C-order strides over the first `p` dims so that the
        # broadcast sum of (index * stride) yields flat positions.
        strides = [1]
        for i in range(p - 1, 0, -1):
            stride = x.shape[i] * strides[-1]
            strides.insert(0, stride)

        # Combine the index arrays (they broadcast against each other,
        # as in numpy) into one flat index, then gather on the device.
        take_idx = sum((i * s for i, s in zip(nidx, strides)))
        out_flat = input_flat.take1(pygpu.asarray(take_idx.flatten(),
                                                  context=x.context))

        # Restore the broadcast index shape followed by the untouched
        # trailing dimensions.
        out_flat_shp = take_idx.shape + x.shape[p:]
        o = out_flat.reshape(out_flat_shp)

        # If there was only one array we need to move the indexed
        # dimension(s) back to the position of the array, which is
        # stored in ap. Note that ap is invalid if narrays != 1.
        if narrays == 1:
            ntransp = list(range(take_idx.ndim, o.ndim))
            ntransp[ap:ap] = list(range(take_idx.ndim))
            o = o.transpose(*ntransp)

        out[0] = o
class GpuAdvancedIncSubtensor1(Op): class GpuAdvancedIncSubtensor1(Op):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
......
...@@ -10,6 +10,7 @@ from ..basic_ops import HostFromGpu, GpuFromHost ...@@ -10,6 +10,7 @@ from ..basic_ops import HostFromGpu, GpuFromHost
from ..elemwise import GpuDimShuffle from ..elemwise import GpuDimShuffle
from ..subtensor import (GpuIncSubtensor, GpuSubtensor, from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedSubtensor,
GpuAdvancedIncSubtensor1) GpuAdvancedIncSubtensor1)
from ..type import gpuarray_shared_constructor from ..type import gpuarray_shared_constructor
...@@ -40,7 +41,7 @@ class G_subtensor(test_subtensor.T_subtensor): ...@@ -40,7 +41,7 @@ class G_subtensor(test_subtensor.T_subtensor):
def test_advinc_subtensor1(): def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """ # Test the second case in the opt local_gpu_advanced_incsubtensor1
for shp in [(3, 3), (3, 3, 3)]: for shp in [(3, 3), (3, 3, 3)]:
shared = gpuarray_shared_constructor shared = gpuarray_shared_constructor
xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1 xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
...@@ -87,3 +88,41 @@ def test_incsub_f16(): ...@@ -87,3 +88,41 @@ def test_incsub_f16():
rep = xval.copy() rep = xval.copy()
rep[1:] += yval rep[1:] += yval
assert numpy.allclose(rval, rep) assert numpy.allclose(rval, rep)
class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor):
    """Run the AdvancedSubtensor test suite against the GPU back-end."""

    def shortDescription(self):
        # Return None so the runner prints the test name instead of
        # the first line of the docstring.
        return None

    def __init__(self, name):
        super(G_advancedsubtensor, self).__init__(
            name,
            shared=gpuarray_shared_constructor,
            sub=GpuAdvancedSubtensor,
            mode=mode_with_gpu,
            # avoid errors with limited devices
            dtype='float32',
            ignore_topo=(HostFromGpu, GpuFromHost,
                         DeepCopyOp))
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuAdvancedSubtensor
def test_adv_subtensor():
    # Advanced indexing mixing vectors, newaxis and a slice must be
    # lifted to exactly one GpuAdvancedSubtensor node, and the result
    # must agree with numpy fancy indexing.
    shape = (2, 3, 4)
    data = numpy.arange(numpy.prod(shape),
                        dtype=theano.config.floatX).reshape(shape)
    idx1, idx2 = tensor.ivectors('idx1', 'idx2')
    index_list = [idx1, None, slice(0, 2, 1), idx2, None]
    x = gpuarray_shared_constructor(data, name='x')
    f = theano.function([idx1, idx2], x[index_list], mode=mode_with_gpu)
    n_gpu_nodes = sum(isinstance(node.op, GpuAdvancedSubtensor)
                      for node in f.maker.fgraph.toposort())
    assert n_gpu_nodes == 1
    i1_val = [0, 1]
    i2_val = [0, 1]
    result = f(i1_val, i2_val)
    expected = data[i1_val, None, slice(0, 2, 1), i2_val, None]
    assert numpy.allclose(result, expected)
...@@ -1009,23 +1009,20 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor): ...@@ -1009,23 +1009,20 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
def shortDescription(self): def shortDescription(self):
return None return None
shared = staticmethod(cuda.shared_constructor)
sub = cuda.GpuSubtensor
inc_sub = cuda.GpuIncSubtensor
adv_sub1 = cuda.GpuAdvancedSubtensor1
adv_incsub1 = cuda.GpuAdvancedIncSubtensor1
dimshuffle = cuda.GpuDimShuffle
mode = mode_with_gpu
dtype = 'float32'
type = tcn.CudaNdarrayType
ignore_topo = (B.HostFromGpu, B.GpuFromHost, theano.compile.DeepCopyOp)
fast_compile = False
ops = (cuda.GpuSubtensor, cuda.GpuIncSubtensor,
cuda.GpuAdvancedSubtensor1, cuda.GpuAdvancedIncSubtensor1)
def __init__(self, name): def __init__(self, name):
return super(theano.tensor.tests.test_subtensor.T_subtensor, super(T_subtensor, self).__init__(
self).__init__(name) name,
shared=cuda.shared_constructor,
sub=cuda.GpuSubtensor,
inc_sub=cuda.GpuIncSubtensor,
adv_sub1=cuda.GpuAdvancedSubtensor1,
adv_incsub1=cuda.GpuAdvancedIncSubtensor1,
dimshuffle=cuda.GpuDimShuffle,
mode=mode_with_gpu,
dtype='float32',
type=tcn.CudaNdarrayType,
ignore_topo=(B.HostFromGpu, B.GpuFromHost, theano.compile.DeepCopyOp))
self.fast_compile = False
def test_adv_sub1_fast(self): def test_adv_sub1_fast(self):
"""We check that the special cases of advanced indexing that """We check that the special cases of advanced indexing that
......
...@@ -20,7 +20,7 @@ from theano.tensor.basic import alloc ...@@ -20,7 +20,7 @@ from theano.tensor.basic import alloc
from theano.tensor.basic import (addbroadcast, clip, get_scalar_constant_value, from theano.tensor.basic import (addbroadcast, clip, get_scalar_constant_value,
ARange, TensorType, NotScalarConstantError) ARange, TensorType, NotScalarConstantError)
from theano.tensor.elemwise import DimShuffle from theano.tensor.elemwise import DimShuffle
from theano.tensor.type_other import NoneConst, SliceType, make_slice from theano.tensor.type_other import NoneConst, SliceType, NoneTypeT, make_slice
from theano import config from theano import config
inplace_increment = None inplace_increment = None
...@@ -2077,6 +2077,8 @@ def as_index_variable(idx): ...@@ -2077,6 +2077,8 @@ def as_index_variable(idx):
return make_slice(idx) return make_slice(idx)
if isinstance(idx, gof.Variable) and isinstance(idx.type, SliceType): if isinstance(idx, gof.Variable) and isinstance(idx.type, SliceType):
return idx return idx
if isinstance(idx, gof.Variable) and isinstance(idx.type, NoneTypeT):
return idx
idx = theano.tensor.as_tensor_variable(idx) idx = theano.tensor.as_tensor_variable(idx)
if idx.type.dtype[:3] not in ('int', 'uin'): if idx.type.dtype[:3] not in ('int', 'uin'):
raise TypeError('index must be integers') raise TypeError('index must be integers')
...@@ -2165,17 +2167,8 @@ class AdvancedSubtensor(Op): ...@@ -2165,17 +2167,8 @@ class AdvancedSubtensor(Op):
# TODO: in general, we need to re-pack the inputs into a valid # TODO: in general, we need to re-pack the inputs into a valid
# index, just like subtensor # index, just like subtensor
out[0] = inputs[0].__getitem__(inputs[1:]) out[0] = inputs[0].__getitem__(inputs[1:])
if (numpy.__version__ <= '1.6.1' and
out[0].size != numpy.uint32(out[0].size)):
warnings.warn(
'Numpy versions 1.6.1 and below have a bug preventing '
'advanced indexing from correctly filling arrays that '
'are too big (>= 2^32 elements). It is possible that '
'out[0] (%s), with shape %s, is not correctly filled.'
% (out[0], out[0].shape))
def connection_pattern(self, node): def connection_pattern(self, node):
rval = [[True]] rval = [[True]]
for ipt in node.inputs[1:]: for ipt in node.inputs[1:]:
......
...@@ -6692,14 +6692,11 @@ class test_arithmetic_cast(unittest.TestCase): ...@@ -6692,14 +6692,11 @@ class test_arithmetic_cast(unittest.TestCase):
config.int_division == 'floatX'): config.int_division == 'floatX'):
assert theano_dtype == config.floatX assert theano_dtype == config.floatX
continue continue
numpy_version = [int(v) for v in
numpy.__version__.split('.')[:2]]
if (cfg == 'numpy+floatX' and if (cfg == 'numpy+floatX' and
a_type == 'complex128' and a_type == 'complex128' and
(b_type == 'float32' or (b_type == 'float32' or
b_type == 'float16') and b_type == 'float16') and
combo == ('scalar', 'array') and combo == ('scalar', 'array') and
bool(numpy_version >= [1, 6]) and
theano_dtype == 'complex128' and theano_dtype == 'complex128' and
numpy_dtype == 'complex64'): numpy_dtype == 'complex64'):
# In numpy 1.6.x adding a complex128 with # In numpy 1.6.x adding a complex128 with
...@@ -6707,7 +6704,7 @@ class test_arithmetic_cast(unittest.TestCase): ...@@ -6707,7 +6704,7 @@ class test_arithmetic_cast(unittest.TestCase):
# of 1.9.2. this is still the case so it is # of 1.9.2. this is still the case so it is
# probably by design # probably by design
raise SkipTest("Known issue with" raise SkipTest("Known issue with"
"numpy >= 1.6.x see #761") "numpy see #761")
# In any other situation: something wrong is # In any other situation: something wrong is
# going on! # going on!
assert False assert False
......
...@@ -20,8 +20,8 @@ from theano.compile import DeepCopyOp ...@@ -20,8 +20,8 @@ from theano.compile import DeepCopyOp
from theano.tensor import (MakeSlice, NotScalarConstantError, _shared, from theano.tensor import (MakeSlice, NotScalarConstantError, _shared,
as_tensor_variable, cscalar, ctensor3, dmatrix, as_tensor_variable, cscalar, ctensor3, dmatrix,
dscalar, dtensor4, dvector, fmatrix, fscalar, dscalar, dtensor4, dvector, fmatrix, fscalar,
fvector, iscalar, lmatrix, lrow, lvector, matrix, fvector, ftensor4, iscalar, lmatrix, lrow, lvector,
vector) matrix, vector)
from theano.tensor.basic import DimShuffle from theano.tensor.basic import DimShuffle
from theano.tensor.subtensor import (AdvancedIncSubtensor, from theano.tensor.subtensor import (AdvancedIncSubtensor,
AdvancedIncSubtensor1, AdvancedSubtensor, AdvancedIncSubtensor1, AdvancedSubtensor,
...@@ -55,6 +55,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -55,6 +55,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
inc_sub=tensor.IncSubtensor, inc_sub=tensor.IncSubtensor,
adv_sub1=tensor.AdvancedSubtensor1, adv_sub1=tensor.AdvancedSubtensor1,
adv_incsub1=tensor.AdvancedIncSubtensor1, adv_incsub1=tensor.AdvancedIncSubtensor1,
adv_sub=tensor.AdvancedSubtensor,
mode=None, mode=None,
dtype=theano.config.floatX, dtype=theano.config.floatX,
type=tensor.TensorType, type=tensor.TensorType,
...@@ -65,6 +66,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -65,6 +66,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.inc_sub = inc_sub self.inc_sub = inc_sub
self.adv_sub1 = adv_sub1 self.adv_sub1 = adv_sub1
self.adv_incsub1 = adv_incsub1 self.adv_incsub1 = adv_incsub1
self.adv_sub = adv_sub
self.dimshuffle = dimshuffle self.dimshuffle = dimshuffle
if mode is None: if mode is None:
mode = theano.compile.mode.get_default_mode() mode = theano.compile.mode.get_default_mode()
...@@ -354,13 +356,9 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -354,13 +356,9 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
(3, DimShuffle, self.dimshuffle, (3, DimShuffle, self.dimshuffle,
numpy.index_exp[..., [0, 2, 3]]), numpy.index_exp[..., [0, 2, 3]]),
(1, DimShuffle, self.dimshuffle, (1, DimShuffle, self.dimshuffle,
numpy.index_exp[numpy.newaxis, ...])] numpy.index_exp[numpy.newaxis, ...]),
# The following test case is not supported by numpy before 1.9 (1, AdvancedSubtensor, self.adv_sub,
numpy_version = [int(v) for v in numpy.version.version.split('.')[0:2]] numpy.index_exp[..., numpy.newaxis, [1, 2]])]
if numpy_version >= [1, 9]:
test_cases.append(
(1, AdvancedSubtensor, AdvancedSubtensor,
numpy.index_exp[..., numpy.newaxis, [1, 2]]))
for length, op_type, op_type_opt, slice_ in test_cases: for length, op_type, op_type_opt, slice_ in test_cases:
numpy_tval = numpy_n[slice_] numpy_tval = numpy_n[slice_]
...@@ -1351,6 +1349,7 @@ class TestAdvancedSubtensor(unittest.TestCase): ...@@ -1351,6 +1349,7 @@ class TestAdvancedSubtensor(unittest.TestCase):
self.v = fvector() self.v = fvector()
self.m = dmatrix() self.m = dmatrix()
self.t = ctensor3() self.t = ctensor3()
self.ft4 = ftensor4()
self.ix1 = lvector() # advanced 1d query self.ix1 = lvector() # advanced 1d query
self.ix12 = lvector() self.ix12 = lvector()
...@@ -1421,11 +1420,57 @@ class TestAdvancedSubtensor(unittest.TestCase): ...@@ -1421,11 +1420,57 @@ class TestAdvancedSubtensor(unittest.TestCase):
a = inc_subtensor(subt, subt) a = inc_subtensor(subt, subt)
assert a.type == self.v.type, (a.type, self.v.type) assert a.type == self.v.type, (a.type, self.v.type)
f = theano.function([self.v, self.ix2], a, allow_input_downcast=True) f = theano.function([self.v, self.ix2], a, allow_input_downcast=True,
mode=self.mode)
aval = f([.4, .9, .1], [[1, 2], aval = f([.4, .9, .1], [[1, 2],
[1, 2]]) [1, 2]])
assert numpy.allclose(aval, [.4, .9 * 3, .1 * 3]) assert numpy.allclose(aval, [.4, .9 * 3, .1 * 3])
def test_adv_subtensor_w_int_and_matrix(self):
    # A scalar index combined with a matrix of indices must match
    # numpy's fancy indexing.
    expr = self.ft4[0, :, self.ix2, :]
    fn = theano.function([self.ft4, self.ix2], expr, mode=self.mode)
    data = numpy.random.random((2, 3, 4, 5)).astype('float32')
    indices = numpy.asarray([[0, 1], [1, 0]])
    got = fn(data, indices)
    expected = data[0, :, indices, :]
    utt.assert_allclose(expected, got)
def test_adv_subtensor_w_none_and_matrix(self):
    # A newaxis (None) before a matrix index must match numpy's
    # fancy indexing.
    expr = self.ft4[:, None, :, self.ix2, :]
    fn = theano.function([self.ft4, self.ix2], expr, mode=self.mode)
    data = numpy.random.random((2, 3, 4, 5)).astype('float32')
    indices = numpy.asarray([[0, 1], [1, 0]])
    got = fn(data, indices)
    expected = data[:, None, :, indices, :]
    utt.assert_allclose(expected, got)
def test_adv_subtensor_w_slice_and_matrix(self):
    # A non-trivial slice before a matrix index must match numpy's
    # fancy indexing.
    expr = self.ft4[:, 0:1, self.ix2, :]
    fn = theano.function([self.ft4, self.ix2], expr, mode=self.mode)
    data = numpy.random.random((2, 3, 4, 5)).astype('float32')
    indices = numpy.asarray([[0, 1], [1, 0]])
    got = fn(data, indices)
    expected = data[:, 0:1, indices, :]
    utt.assert_allclose(expected, got)
def test_adv_subtensor_w_matrix_and_int(self):
    # A matrix index followed by a scalar index must match numpy's
    # fancy indexing.
    expr = self.ft4[:, :, self.ix2, 0]
    fn = theano.function([self.ft4, self.ix2], expr, mode=self.mode)
    data = numpy.random.random((2, 3, 4, 5)).astype('float32')
    indices = numpy.asarray([[0, 1], [1, 0]])
    got = fn(data, indices)
    expected = data[:, :, indices, 0]
    utt.assert_allclose(expected, got)
def test_adv_subtensor_w_matrix_and_none(self):
    # A matrix index followed by a newaxis (None) must match numpy's
    # fancy indexing.
    expr = self.ft4[:, :, self.ix2, None, :]
    fn = theano.function([self.ft4, self.ix2], expr, mode=self.mode)
    data = numpy.random.random((2, 3, 4, 5)).astype('float32')
    indices = numpy.asarray([[0, 1], [1, 0]])
    got = fn(data, indices)
    expected = data[:, :, indices, None, :]
    utt.assert_allclose(expected, got)
def test_inc_adv_subtensor_w_2vec(self): def test_inc_adv_subtensor_w_2vec(self):
if inplace_increment is None: if inplace_increment is None:
raise inplace_increment_missing raise inplace_increment_missing
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论