Make GpuAdvancedSubtensor1 use int64 for indices so that every index value is supported.

894d6665 · Frederic · e6b2160a · 894d6665 · 894d6665
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -1891,6 +1891,8 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
    """
    Implement AdvancedSubtensor1 on the gpu.
    """
+    #If True or False, we assert that we use the take version or not
+    #If None, we choose the best one applicable
    perform_using_take = None

    def make_node(self, x, ilist):
@@ -1910,8 +1912,9 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
        #super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
        x, idx = inp
        out, = out_
-        #TODO: if more then 3 dims, reshape the inputs if it is contiguous.
        x_orig = x
+        #TODO: if more than 3 dims, reshape the inputs even if not all
+        #dimensions are C-contiguous
        if x.ndim > 3 and x.is_c_contiguous():
            x = x.reshape((x.shape[0], numpy.prod(x.shape[1:])))
        out_shape = (len(idx),) + x_orig.shape[1:]
@@ -1920,8 +1923,17 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
            if self.perform_using_take is not None:
                assert self.perform_using_take == True, (
                    "GpuAdvancedSubtensor1 used the fast version")
-
-            o = x.take(cuda_ndarray.cuda_ndarray.CudaNdarray(idx.astype("float32")),  # idx
+            if idx.dtype != numpy.int64:
+                if idx.dtype in [numpy.int8, numpyt.int16, numpy.int32,
+                                 numpy.int64, numpy.uint8, numpy.uint16,
+                                 numpy.uint32]:
+                    idx = idx.astype(numpy.int64)
+            if not idx.flags.c_contiguous:
+                idx = numpy.ascontiguousarray(idx)
+
+            idx = idx.view("float32")
+            idx = cuda_ndarray.cuda_ndarray.CudaNdarray(idx)
+            o = x.take(idx,
                       0,  # axis
                       out_[0][0])  # return
            if x is not x_orig:

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -701,14 +701,14 @@ enum operator_t
 */
 template <int operator_num>
 __global__ void k_take_3(const int d0, const int d1, const int d2,
-                         const float* indices,
+                         const npy_int64* indices,
                         float* a,
                         const int sA0, const int sA1, const int sA2,
                         const float* b, const int dB0,
                         const int sB0, const int sB1, const int sB2,
                         int* err){
    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
-        int idx = (int)indices[i0];
+        npy_int64 idx = indices[i0];
        if (idx<0)
            idx += dB0; // To allow negative indexing.
        if ((idx < 0) || (idx >= dB0))
@@ -737,8 +737,9 @@ static int* err_var = NULL;
 // We try to be similat to the PyArray_TakeFrom function
 //http://docs.scipy.org/doc/numpy/reference/c-api.array.html
 //TODO: support other clip mode then raise(clip, wrap)
-//TODO: what if the indices take more then 32 bits?
 //self is the input that we copy data from.
+//The indices that we receive MUST be a CudaNdarray(float32)
+//    that is in fact a view of int64 indices
 PyObject*
 CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
    int verbose = 0;
@@ -761,7 +762,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
        if (verbose) printf("cudandarray indices\n");
        indices = (CudaNdarray*) indices_obj;
        Py_INCREF(indices);
-    } else if (PyArray_Check(indices_obj)) {
+    } else if (0 && PyArray_Check(indices_obj)) {
        PyErr_SetString(PyExc_NotImplementedError, "CudaNdarray_TakeFrom: The indices must cudandarray with float32 value.");
        return NULL;

@@ -800,9 +801,10 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
            return NULL;
        }
        Py_DECREF(indices_float32);
-
    } else {
-        PyErr_SetString(PyExc_TypeError, "CudaNdarray_TakeFrom: need a CudaNdarray for indices");
+        PyErr_SetString(PyExc_TypeError,
+                        "CudaNdarray_TakeFrom: need a CudaNdarray(float32) that"
+                        " is a view from int64 data for indices");
        return NULL;
    }

@@ -815,11 +817,12 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
    }
    if (verbose) printf("after print of object\n");
    if(!CudaNdarray_is_c_contiguous(indices) != 0) {
-        PyErr_SetString(PyExc_NotImplementedError, "CudaNdarray_TakeFrom: The indices must be contiguous in memory.");
+        PyErr_SetString(PyExc_NotImplementedError,
+                        "CudaNdarray_TakeFrom: The indices must be contiguous in memory.");
        Py_DECREF(indices_obj);
        return NULL;
    }
-    int nb_indices = CudaNdarray_SIZE((CudaNdarray *)indices);
+    int nb_indices = CudaNdarray_SIZE((CudaNdarray *)indices) / 2;// int64 are 8 bytes, float32 are 4 bytes

    //Check argument axis
    //TODO: implement the default and other axis
@@ -885,7 +888,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
        Py_DECREF(clipmode_obj);
    }
    void (*k3)(const int, const int, const int,
-               const float*,
+               const npy_int64*,
               float*, const int, const int, const int,
               const float*, const int,
               const int, const int, const int,
@@ -923,7 +926,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
                        dims[0],
                        1,
                        1,
-                        CudaNdarray_DEV_DATA(indices),
+                        (npy_int64*) CudaNdarray_DEV_DATA(indices),
                        CudaNdarray_DEV_DATA(out),
                        CudaNdarray_HOST_STRIDES(out)[0], //strides
                        1,
@@ -947,7 +950,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
                        dims[0], //dimensions
                        dims[1],
                        1,
-                        CudaNdarray_DEV_DATA(indices),
+                        (npy_int64*) CudaNdarray_DEV_DATA(indices),
                        CudaNdarray_DEV_DATA(out),
                        CudaNdarray_HOST_STRIDES(out)[0], //strides
                        CudaNdarray_HOST_STRIDES(out)[1],
@@ -973,7 +976,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
                        dims[0], //dimensions
                        dims[1],
                        dims[2],
-                        CudaNdarray_DEV_DATA(indices),
+                        (npy_int64*) CudaNdarray_DEV_DATA(indices),
                        CudaNdarray_DEV_DATA(out),
                        CudaNdarray_HOST_STRIDES(out)[0], //strides
                        CudaNdarray_HOST_STRIDES(out)[1],