提交 6c955ecf authored 作者: Frederic's avatar Frederic

New version of GpuAdvancedSubtensor1 with gpu code when the input has up to 3 dimensions.

New version of GpuAdvancedSubtensor1 with gpu code when the input has up to 3 dimensions or is c_contiguous.
上级 6005ac2b
...@@ -1891,6 +1891,8 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1891,6 +1891,8 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
""" """
Implement AdvancedSubtensor1 on the gpu. Implement AdvancedSubtensor1 on the gpu.
""" """
assert_fast = None
def make_node(self, x, ilist): def make_node(self, x, ilist):
x_ = as_cuda_ndarray_variable(x) x_ = as_cuda_ndarray_variable(x)
ilist_ = tensor.as_tensor_variable(ilist) ilist_ = tensor.as_tensor_variable(ilist)
...@@ -1908,11 +1910,35 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1908,11 +1910,35 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
def perform(self, node, inp, out_):
    """Take rows of `x` selected by the index vector `idx` on the GPU.

    Parameters
    ----------
    node : Apply
        The apply node (unused directly; standard Theano perform signature).
    inp : (CudaNdarray, ndarray)
        ``x`` (the data) and ``idx`` (1-d integer indices into axis 0).
    out_ : ((CudaNdarray or None,),)
        Output storage; a previous output may be reused as destination.

    Notes
    -----
    Two code paths exist:
    * fast: ``x.ndim <= 3`` (or >3 dims but c_contiguous, in which case we
      flatten the trailing dims first) — delegates to the gpu ``take`` code.
    * slow: copies row by row on the gpu.
    ``self.assert_fast`` (set by tests) asserts which path was taken.
    """
    # super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
    x, idx = inp
    out, = out_
    # TODO: if more than 3 dims and not contiguous, we could still try
    # harder to reach the fast path.
    x_orig = x
    if x.ndim > 3 and x.is_c_contiguous():
        # Collapse the trailing dimensions so the gpu take code (which
        # supports at most 3 dims) can be used; undone after the take.
        x = x.reshape((x.shape[0], numpy.prod(x.shape[1:])))
    if x.ndim <= 3:
        if self.assert_fast is not None:
            assert self.assert_fast is True, (
                "GpuAdvancedSubtensor1 used the fast version")
        # The gpu take code supports x with 1, 2 or 3 dimensions only.
        # Indices are passed as a float32 CudaNdarray (gpu storage format).
        o = x.take(
            cuda_ndarray.cuda_ndarray.CudaNdarray(idx.astype("float32")),
            0, out_[0][0])  # idx, axis, return[, clipmode]
        if x is not x_orig:
            # Restore the original trailing dimensions.
            o = o.reshape((len(idx),) + x_orig.shape[1:])
        out[0] = o
    else:
        if self.assert_fast is not None:
            assert self.assert_fast is False, (
                "GpuAdvancedSubtensor1 didn't use the fast version")
        out_shape = (len(idx),) + x.shape[1:]
        # Reuse the previous output buffer when its shape still matches.
        if out_[0][0] is None or out_[0][0].shape != out_shape:
            o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(out_shape)
        else:
            o = out_[0][0]
        # Slow fallback: one gpu-side row copy per index.
        for j, i in enumerate(idx):
            o[j] = x[i]
        out[0] = o
class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
......
...@@ -338,6 +338,12 @@ DllExport int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray * A); ...@@ -338,6 +338,12 @@ DllExport int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A); DllExport int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern); DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern);
//PyObject* PyArray_TakeFrom(PyArrayObject* self, PyObject* indices, int axis, PyArrayObject* ret, NPY_CLIPMODE clipmode)
//PyObject*
//CudaNdarray_TakeFrom(CudaNdarray* self, PyObject* indices, int axis,
// PyArrayObject* ret, NPY_CLIPMODE clipmode);
// GPU analogue of numpy's PyArray_TakeFrom (signature kept above for
// reference). Takes a Python args tuple instead of explicit parameters;
// presumably (indices, axis[, ret[, clipmode]]) — confirm against the
// implementation in the .cu file.
PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args);
static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self); static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
......
import sys, time, unittest import copy
import sys
import time
import unittest
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
from theano import tensor from theano import tensor
...@@ -846,6 +849,47 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -846,6 +849,47 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
return super(theano.tensor.tests.test_basic.T_subtensor, return super(theano.tensor.tests.test_basic.T_subtensor,
self).__init__(name) self).__init__(name)
def test_adv_sub1_fast(self):
    """Check that AdvancedSubtensor1 on the gpu takes the fast code path
    exactly when it is expected to."""
    rand = numpy.random.rand
    cases = [
        (rand(70000), range(70000), True),
        (rand(70000, 5), range(70000), True),
        (rand(70000, 2, 3), range(70000), True),
        (rand(4, 5), [2, 3], True),
        (rand(4, 2, 3), [0, 3], True),
        (rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0], True),
        (rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0, -1, -2, -3, -4], True),
        # With 4 dims the gpu code uses another algorithm, which is not
        # as optimized for that case, so the strided run is slow.
        (rand(4, 4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0, -1, -2, -3, -4], False),
    ]
    for data, idx, fast in cases:
        data = numpy.asarray(data, dtype=self.dtype)
        shared_x = self.shared(data)

        # c_contiguous input: always fast (>3 dims get reshaped first).
        t = self.adv_sub1()(shared_x, idx)
        t.owner.op.assert_fast = True
        val = numpy.asarray(self.eval_output_and_check(t, list=True))
        expected = data[idx]
        self.assertTrue(val.ndim == data.ndim)
        self.assertTrue(numpy.allclose(val, expected), (val, expected))

        # Strided input: fast only when the case allows it.
        t = self.adv_sub1()(shared_x[::-1], idx)
        t.owner.op.assert_fast = fast
        val = numpy.asarray(theano.function([], t, mode=self.mode)())
        expected = data[::-1][idx]
        self.assertTrue(val.ndim == data.ndim)
        self.assertTrue(numpy.allclose(val, expected), (val, expected))
def test_advinc_subtensor1(): def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """ """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论