Implemented GpuAdvancedSubtensor1 and tested it.

c5727d8c · Frederic Bastien · cb15c1c6 · c5727d8c · c5727d8c · c5727d8c
--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -137,10 +137,12 @@ outdated!""")

    import basic_ops
    from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
-            GpuDimShuffle, GpuSum, GpuReshape, GpuContiguous,
-            GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape, GpuAlloc,
-            GpuJoin,fscalar, fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4
-                           , scalar, vector, matrix, row, col, tensor3, tensor4)
+                           GpuDimShuffle, GpuSum, GpuReshape, GpuContiguous,
+                           GpuSubtensor, GpuAdvancedSubtensor1, GpuIncSubtensor,
+                           GpuFlatten, GpuShape, GpuAlloc,
+                           GpuJoin,fscalar, fscalar, fvector, fmatrix, frow, fcol,
+                           ftensor3, ftensor4, scalar, vector, matrix, row, col,
+                           tensor3, tensor4)
    from basic_ops import host_from_gpu, gpu_from_host
    import opt
    import cuda_ndarray

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -1720,6 +1720,32 @@ class GpuSubtensor(tensor.Subtensor):
            cdata = cdata[0]
        out[0] = x.__getitem__(cdata)

+class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1):
+    def make_node(self, x, ilist):
+        x_ = as_cuda_ndarray_variable(x)
+        ilist_ = tensor.as_tensor_variable(ilist)
+        if ilist_.type.dtype[:3] not in ('int', 'uin'):
+            raise TypeError('index must be integers')
+        if ilist_.type.broadcastable != (False,):
+            raise TypeError('index must be vector')
+        if x_.type.ndim == 0:
+            raise TypeError('cannot index into a scalar')
+        if x_.type.broadcastable[0]:
+            # the caller should have made a copy of x len(ilist) times
+            raise TypeError('cannot index into a broadcastable dimension')
+
+        return Apply(self, [x_, ilist_], [x_.type()])
+
+    def perform(self, node, inp, out_):
+        # The call below doesn't work, as CudaNdarray_Subscript() doesn't support it.
+        #super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
+        x, idx = inp
+        out, = out_
+        o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((len(idx),)+x.shape[1:])
+        for (j,i) in enumerate(idx):
+            o[j] = x[i]
+        out[0] = o
+
 class GpuIncSubtensor(tensor.IncSubtensor):
    def make_node(self, x, y, *inputs):
        assert isinstance(x.type, CudaNdarrayType)

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -500,6 +500,23 @@ def local_gpu_subtensor(node):
            return [host_from_gpu(GpuSubtensor(node.op.idx_list)(gpu_x, *coords))]
    return False

+@register_opt()
+@local_optimizer([])
+def local_gpu_advanced_subtensor1(node):
+    if node.op == gpu_from_host:
+        host_input = node.inputs[0]
+        if host_input.owner and isinstance(host_input.owner.op, tensor.AdvancedSubtensor1):
+            x = host_input.owner.inputs[0]
+            coords = host_input.owner.inputs[1:]
+            return [GpuAdvancedSubtensor1()(gpu_from_host(x), *coords)]
+    if isinstance(node.op, tensor.AdvancedSubtensor1):
+        x  = node.inputs[0]
+        coords = node.inputs[1:]
+        if x.owner and x.owner.op == host_from_gpu:
+            gpu_x, = x.owner.inputs
+            return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))]
+    return False
+
 @register_opt()
 @local_optimizer([])
 def local_gpu_incsubtensor(node):

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -783,6 +783,18 @@ def test_gpualloc_output_to_gpu():
    assert numpy.allclose(numpy.ones(a.value.shape)+9,f_gpu(9))
    assert numpy.allclose(f(5),f_gpu(5))

+import theano.tensor.tests.test_basic
+# This is to avoid duplicating tests.
+# TODO: the inherited test class only tests Adv_subtensor1 on the gpu. All its other tests run only on the cpu!
+class T_Adv_subtensor1(theano.tensor.tests.test_basic.T_subtensor):
+    shared=staticmethod(cuda.shared_constructor)
+    adv_sub1=cuda.GpuAdvancedSubtensor1
+    mode=mode_with_gpu
+    dtype='float32'
+    ignore_topo=(B.HostFromGpu, B.GpuFromHost)
+    def __init__(self, name):
+        return super(theano.tensor.tests.test_basic.T_subtensor, self).__init__(name)
+
 def test_inc_subtensor():
    shared = cuda.shared_constructor
    #shared = tensor.shared

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -1372,6 +1372,17 @@ class T_min_max(unittest.TestCase):
        #check_grad_max(data,eval_outputs(grad(max_and_argmax(n,axis=1)[0],n)),axis=1)

 class T_subtensor(unittest.TestCase):
+    def __init__(self, name, shared=shared,
+                 adv_sub1=theano.tensor.basic.AdvancedSubtensor1, mode=None,
+                 dtype=theano.config.floatX,
+                 ignore_topo=()):
+        self.shared = shared
+        self.adv_sub1 = adv_sub1
+        self.mode = mode
+        self.dtype=dtype
+        self.ignore_topo=ignore_topo
+        return super(T_subtensor, self).__init__(name)
+
    def setUp(self):
        Subtensor.debug = False
        utt.seed_rng()
@@ -1582,47 +1593,56 @@ class T_subtensor(unittest.TestCase):
                          (numpy.random.rand(4,2,3), [0,3]),
                          (numpy.random.rand(4,2,3), [3,3,1,1,2,2,0,0]),
                          ]:
-            n = shared(data)
+            data = numpy.asarray(data, dtype=self.dtype)
+            n = self.shared(data)
            t = n[idx]
-            f = function([], t, mode=None)
+            f = function([], t, mode=self.mode)
            topo = f.maker.env.toposort()
-            assert len(topo) == 1
-            assert isinstance(topo[0].op, theano.tensor.basic.AdvancedSubtensor1)
+            topo_ = [node for node in topo if not isinstance(node.op, self.ignore_topo)]
+            assert len(topo_) == 1
+            assert isinstance(topo_[0].op, self.adv_sub1)
            val = f()
            good = data[idx]
            self.failUnless(val.ndim == data.ndim)
            self.failUnless(numpy.allclose(val, good), (val, good))

    def test_err_invalid_list(self):
-        n = shared(numpy.asarray(5))
+        n = self.shared(numpy.asarray(5, dtype=self.dtype))
        self.assertRaises(TypeError, n.__getitem__, [0,0])

    def test_err_invalid_2list(self):
        # TODO the error message is not clear
-        n = shared(numpy.ones((3,3))*5)
+        n = self.shared(numpy.ones((3,3), dtype=self.dtype)*5)
        self.assertRaises(TypeError, n.__getitem__, ([0,0],[1,1]))

    def test_err_bound_list(self):
-        n = shared(numpy.ones((2,3))*5)
+        n = self.shared(numpy.ones((2,3),dtype=self.dtype)*5)
        t = n[[0,4]]
-        self.failUnless(isinstance(t.owner.op, AdvancedSubtensor1))
-        self.assertRaises(IndexError, eval_outputs, [t])
+        # We test against AdvancedSubtensor1 here, as we transfer data to the cpu.
+        self.failUnless(isinstance(t.owner.op, theano.tensor.basic.AdvancedSubtensor1))
+        f = function([], t, mode=self.mode)
+        topo = f.maker.env.toposort()
+        topo_ = [node for node in topo if not isinstance(node.op, self.ignore_topo)]
+        assert len(topo_)==1
+        self.failUnless(isinstance(topo_[0].op, self.adv_sub1))
+        self.assertRaises(IndexError, f)

    def grad_list_(self, idxs, data):
-        n = shared(data)
+        n = self.shared(data)
        fast_compile = theano.config.mode == 'FAST_COMPILE'

        for idx in idxs:
+            # Should stay on the cpu.
            idx_ = shared(numpy.asarray(idx))
            t = n[idx_]
            gn = grad(sum(exp(t)), n)
-            f = function([], [gn, gn.shape], mode=None)
+            f = function([], [gn, gn.shape], mode=self.mode)
            topo = f.maker.env.toposort()
            if not fast_compile:
                assert any([isinstance(node.op, AdvancedIncSubtensor1) and node.op.inplace for node in topo])
            else:
                assert any([isinstance(node.op, AdvancedIncSubtensor1) for node in topo])
-            assert any([isinstance(node.op, AdvancedSubtensor1) for node in topo])
+            assert any([isinstance(node.op, self.adv_sub1) for node in topo])
            gval, gshape = f()
            good = numpy.zeros_like(data)
            # good[idx] += numpy.exp(data[idx]) don't work when the same index is used many time
@@ -1643,28 +1663,29 @@ class T_subtensor(unittest.TestCase):

            # Test shape of AdvancedIncSubtensor1 and AdvancedSubtensor1
            if idx is idxs[0]:
-                f = function([], [gn.shape, n[idx_].shape], mode=None)
+                f = function([], [gn.shape, n[idx_].shape], mode=self.mode)
                topo = f.maker.env.toposort()
                if not fast_compile:
                    self.failUnless(not any([isinstance(node.op, AdvancedIncSubtensor1) for node in topo]))
-                    self.failUnless(not any([isinstance(node.op, AdvancedSubtensor1) for node in topo]))
+                    self.failUnless(not any([isinstance(node.op, self.adv_sub1) for node in topo]))
                f()


    def test_grad_list(self):
        data = numpy.random.rand(4)
+        data = numpy.asarray(data, dtype=self.dtype)
        idxs = [[i] for i in range(data.shape[0])]
-        debug_mode = isinstance(theano.compile.mode.get_default_mode(),
-                                theano.compile.DebugMode)
        for i in range(data.shape[0]):
            for j in range(0,data.shape[0],2):
                idxs.append([i,j,(i+1)%data.shape[0]])
        self.grad_list_(idxs, data)

        data = numpy.random.rand(4,3)
+        data = numpy.asarray(data, dtype=self.dtype)
        self.grad_list_(idxs, data)

        data = numpy.random.rand(4,3,2)
+        data = numpy.asarray(data, dtype=self.dtype)
        self.grad_list_(idxs, data)

    def test_shape_list(self):
@@ -1674,7 +1695,8 @@ class T_subtensor(unittest.TestCase):
                          (numpy.random.rand(4,2,3), [0,3]),
                          (numpy.random.rand(4,2,3), [3,3,1,2,2,]),
                          ]:
-            n = shared(data)
+            data = numpy.asarray(data, dtype=self.dtype)
+            n = self.shared(data)
            t = n[idx]
            f = function([], t.shape, mode=None)
            topo = f.maker.env.toposort()