提交 975e0d2b — 作者: Frédéric Bastien

Merge pull request #3227 from abergeron/gpua_advsub1

Implement GpuAdvancedSubtensor1 for gpuarray
......@@ -4,6 +4,8 @@
#include <string.h>
#include <gpuarray_api.h>
#include <numpy_compat.h>
#include <gpuarray/util.h>
static int theano_size_check(PyGpuArrayObject *a, unsigned int nd,
const size_t *dims, int typecode) {
......@@ -42,9 +44,14 @@ static PyGpuArrayObject *theano_try_copy(PyGpuArrayObject *out,
return out;
}
/* This is guaranteed to work and return the raw CUDA/OpenCL object on
* all recent (as of June 2015) version of libgpuarray. This is also
* promised to keep working in future versions. */
#define PyGpuArray_DEV_DATA(ary) (*(void **)((ary)->ga.data))
static inline void *PyGpuArray_DEV_DATA(PyGpuArrayObject *a) {
  /* Return the raw device pointer for array `a`, adjusted by its
   * element offset.
   *
   * Dereferencing ga.data yields the underlying CUDA/OpenCL object;
   * this layout is guaranteed on all recent (as of June 2015)
   * versions of libgpuarray and is promised to keep working in
   * future versions. */
  char *base = *(char **)a->ga.data;
  /* Adding ga.offset only works on CUDA, where the object is a real
   * pointer. */
  return (void *)(base + a->ga.offset);
}
#endif
......@@ -35,6 +35,7 @@ from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY)
from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
......@@ -488,6 +489,12 @@ def local_gpua_incsubtensor(node):
node.op.destroyhandler_tolerate_aliased)
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1])
def local_gpua_advanced_subtensor(node):
    """Lift ``AdvancedSubtensor1`` to its gpuarray counterpart.

    Returns the GPU op; ``op_lifter`` takes care of rebuilding the
    node with the lifted inputs.
    """
    gpu_op = GpuAdvancedSubtensor1()
    return gpu_op
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node):
......@@ -496,7 +503,16 @@ def local_gpua_advanced_incsubtensor(node):
if pygpu.get_default_context().kind != "cuda":
return None
x, y = node.inputs[0:2]
x, y, ilist = node.inputs
# Gpu Ops needs both inputs to have the same dtype
if (x.type.dtype != y.type.dtype):
dtype = scalar.upcast(x.type.dtype, y.type.dtype)
if x.type.dtype != dtype:
x = tensor.cast(x, dtype)
if y.type.dtype != dtype:
y = tensor.cast(y, dtype)
set_instead_of_inc = node.op.set_instead_of_inc
active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties
......@@ -504,11 +520,11 @@ def local_gpua_advanced_incsubtensor(node):
compute_capability = device_properties(active_device_no)['major']
if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
return GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)
return [GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)(x, y, ilist)]
else:
return GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc)
return [GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc)(x, y, ilist)]
@register_opt('fast_compile')
......
......@@ -7,6 +7,7 @@ from theano.tensor.tests import test_subtensor
from ..basic_ops import HostFromGpu, GpuFromHost
from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1)
from ..type import gpuarray_shared_constructor
......@@ -24,6 +25,7 @@ class G_subtensor(test_subtensor.T_subtensor):
shared=gpuarray_shared_constructor,
sub=GpuSubtensor,
inc_sub=GpuIncSubtensor,
adv_sub1=GpuAdvancedSubtensor1,
adv_incsub1=GpuAdvancedIncSubtensor1,
mode=mode_with_gpu,
# avoid errors with limited devices
......
......@@ -515,8 +515,8 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertRaises(IndexError, g, shp)
def test_adv_sub1_broadcast(self):
ones = numpy.ones((1, 3), dtype=self.dtype)
n = self.shared(ones * 5, broadcastable=(True, False))
v = numpy.arange(3, dtype=self.dtype).reshape((1, 3))
n = self.shared(v*5, broadcastable=(True, False))
idx = tensor.lvector()
t = n[idx]
self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1))
......@@ -529,10 +529,10 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(isinstance(topo_[0].op, self.adv_sub1))
f_0 = f([0])
self.assertTrue(f_0.shape == (1, 3))
self.assertTrue(numpy.allclose(f_0, ones[0] * 5))
self.assertTrue(numpy.allclose(f_0, v*5))
f_00 = f([0, 0])
self.assertTrue(f_00.shape == (2, 3))
self.assertTrue(numpy.allclose(f_00, 5))
self.assertTrue(numpy.allclose(f_00, v*5))
self.assertRaises(IndexError, f, [0, 1])
# Test the gradient
......
......@@ -160,7 +160,6 @@ whitelist_flake8 = [
"sandbox/linalg/tests/test_linalg.py",
"sandbox/gpuarray/basic_ops.py",
"sandbox/gpuarray/nnet.py",
"sandbox/gpuarray/subtensor.py",
"sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py",
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论