Commit 572bf565, authored by James Bergstra

Merge pull request #715 from nouiz/GpuAdvSub1

New version of GpuAdvancedSubtensor1 with gpu code.
......@@ -1891,6 +1891,10 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
"""
Implement AdvancedSubtensor1 on the gpu.
"""
# Testing hook: if True or False, perform() asserts that the fast
# take-based code path was (or was not) used; if None, the best
# applicable path is chosen automatically. Users should not set this.
perform_using_take = None
# Build the Apply node: move `x` onto the GPU and keep the index list
# as a host tensor variable. NOTE(review): this definition is truncated
# by the diff hunk below — the remainder is not visible here.
def make_node(self, x, ilist):
x_ = as_cuda_ndarray_variable(x)
ilist_ = tensor.as_tensor_variable(ilist)
......@@ -1908,11 +1912,44 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
def perform(self, node, inp, out_):
    """Select rows of the CudaNdarray `x` given the index vector `idx`.

    For ndim <= 3 inputs (after an optional flattening reshape) this
    uses the fast `CudaNdarray.take` GPU code path; otherwise it falls
    back to a Python loop copying one row at a time.

    Parameters (via Theano's Op.perform convention):
    - node: the Apply node.
    - inp: (x, idx) — the GPU array and the host integer index array.
    - out_: single-element output storage list; out_[0][0] may hold a
      previously allocated output that can be reused.
    """
    #super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
    x, idx = inp
    out, = out_
    x_orig = x
    # TODO: if more than 3 dims, reshape the inputs even if not all
    # dimensions are c contiguous
    if x.ndim > 3 and x.is_c_contiguous():
        # Flatten trailing dims so take() (which supports ndim <= 3)
        # can be used; the result is reshaped back below.
        x = x.reshape((x.shape[0], numpy.prod(x.shape[1:])))
    out_shape = (len(idx),) + x_orig.shape[1:]
    if x.ndim <= 3:
        # CudaNdarray.take only supports ndim <= 3
        if self.perform_using_take is not None:
            assert self.perform_using_take == True, (
                "GpuAdvancedSubtensor1 used the fast version")
        if idx.dtype != numpy.int64:
            # Fixed typo: `numpyt.int16` -> `numpy.int16` (was a NameError).
            if idx.dtype in [numpy.int8, numpy.int16, numpy.int32,
                             numpy.int64, numpy.uint8, numpy.uint16,
                             numpy.uint32]:
                idx = idx.astype(numpy.int64)
        if not idx.flags.c_contiguous:
            idx = numpy.ascontiguousarray(idx)
        # Reinterpret the int64 index bits as float32 so they can be
        # wrapped in a CudaNdarray (presumably float32-only storage;
        # take() decodes the bits on the device — confirm against
        # CudaNdarray_TakeFrom).
        idx = idx.view("float32")
        idx = cuda_ndarray.cuda_ndarray.CudaNdarray(idx)
        o = x.take(idx,
                   0,  # axis
                   out_[0][0])  # reuse previous output if compatible
        if x is not x_orig:
            # Undo the flattening reshape applied above.
            o = o.reshape(out_shape)
        out[0] = o
    else:
        # Slow fallback: row-by-row copy on the host side.
        if self.perform_using_take is not None:
            assert self.perform_using_take == False, (
                "GpuAdvancedSubtensor1 didn't use the fast version")
        if out_[0][0] is None or out_[0][0].shape != out_shape:
            o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(out_shape)
        else:
            o = out_[0][0]
        for (j, i) in enumerate(idx):
            o[j] = x[i]
        out[0] = o
class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
......
......@@ -338,6 +338,8 @@ DllExport int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern);
DllExport PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args);
static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
......
import sys, time, unittest
import copy
import sys
import time
import unittest
from theano.compile.pfunc import pfunc
from theano import tensor
......@@ -846,6 +849,58 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
return super(theano.tensor.tests.test_basic.T_subtensor,
self).__init__(name)
def test_adv_sub1_fast(self):
    """We check that the special cases of advanced indexing that
    use CudaNdarrayTakeFrom are handled correctly
    """
    rand = numpy.random.rand
    # The variable fast is used to set the member perform_using_take of
    # the Op. It is only useful for testing that we use the fast
    # version when we should. Users should not use it.
    cases = [
        (rand(70000), range(70000), True),
        (rand(70000, 5), range(70000), True),
        (rand(70000, 2, 3), range(70000), True),
        (rand(1025, 1025), [5, 10], True),
        (rand(3, 1025, 1026), [1, 2], True),
        (rand(1025, 67000), [5, 10], True),
        (rand(3, 10, 68000), [1, 2], True),
        (rand(3, 69000, 11), [1, 2], True),
        (rand(4, 5), [2, 3], True),
        (rand(4, 2, 3), [0, 3], True),
        (rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0], True),
        (rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0, -1, -2, -3, -4], True),
        # Test 4 dims as gpu. code use another algo
        # in that case. This new algo is not as much
        # optimized for that case.
        (rand(4, 4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0, -1, -2, -3, -4],
         False),
    ]
    for data, idx, fast in cases:
        data = numpy.asarray(data, dtype=self.dtype)
        n = self.shared(data)

        # Test with c_contiguous input: the fast take path must be used
        # (the op reshapes >3d c-contiguous inputs down to 2d).
        t = self.adv_sub1()(n, idx)
        t.owner.op.perform_using_take = True
        val = numpy.asarray(self.eval_output_and_check(t, list=True))
        expected = data[idx]
        self.assertTrue(val.ndim == data.ndim)
        self.assertTrue(numpy.allclose(val, expected), (val, expected))

        # Test with a strided (reversed) input: whether the fast path
        # applies depends on the case, as recorded in `fast`.
        t = self.adv_sub1()(n[::-1], idx)
        t.owner.op.perform_using_take = fast
        val = numpy.asarray(theano.function([], t, mode=self.mode)())
        expected = data[::-1][idx]
        self.assertTrue(val.ndim == data.ndim)
        self.assertTrue(numpy.allclose(val, expected), (val, expected))
def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment