提交 c2888726 authored 作者: Frederic's avatar Frederic

Allow creating a Fortran-order memory region on the GPU.

上级 19c0dbcb
...@@ -5093,7 +5093,7 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) ...@@ -5093,7 +5093,7 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
int CudaNdarray_prep_output(CudaNdarray ** arr, int nd, int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims) const int * dims, int fortran)
{ {
bool allocated = false; bool allocated = false;
if (*arr == NULL) if (*arr == NULL)
...@@ -5105,7 +5105,7 @@ int CudaNdarray_prep_output(CudaNdarray ** arr, int nd, ...@@ -5105,7 +5105,7 @@ int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
allocated = true; allocated = true;
} }
if (CudaNdarray_alloc_contiguous(*arr, nd, dims)) if (CudaNdarray_alloc_contiguous(*arr, nd, dims, fortran))
{ {
if (allocated) if (allocated)
{ {
......
...@@ -326,10 +326,13 @@ CudaNdarray_set_nd(CudaNdarray * self, const int nd) ...@@ -326,10 +326,13 @@ CudaNdarray_set_nd(CudaNdarray * self, const int nd)
* Allocate storage space for a tensor of rank 'nd' and given dimensions. * Allocate storage space for a tensor of rank 'nd' and given dimensions.
* (No-op if self already has a contiguous tensor of the right dimensions) * (No-op if self already has a contiguous tensor of the right dimensions)
* *
* If fortran is non-zero, Fortran (column-major) order is used; otherwise C (row-major) order. *
*
* Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions * Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions
*/ */
template<typename inttype> template<typename inttype>
static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype * dim) static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
const inttype * dim, int fortran=0)
{ {
// allocate an empty ndarray with c_contiguous access // allocate an empty ndarray with c_contiguous access
// return 0 on success // return 0 on success
...@@ -342,11 +345,23 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i ...@@ -342,11 +345,23 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
{ {
return -1; return -1;
} }
for (int i = nd-1; i >= 0; --i) if (fortran)
{
for (int i = 0; i < nd; i++)
{
CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
CudaNdarray_set_dim(self, i, dim[i]);
size = size * dim[i];
}
}
else
{ {
CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size); for (int i = nd-1; i >= 0; --i)
CudaNdarray_set_dim(self, i, dim[i]); {
size = size * dim[i]; CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
CudaNdarray_set_dim(self, i, dim[i]);
size = size * dim[i];
}
} }
// If the allocated buffer is already of the right size, we don't need to // If the allocated buffer is already of the right size, we don't need to
...@@ -525,8 +540,9 @@ DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_othe ...@@ -525,8 +540,9 @@ DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_othe
// *arr may initially be NULL, a pointer to an ndarray of the wrong size, // *arr may initially be NULL, a pointer to an ndarray of the wrong size,
// or a pointer to an ndarray of the right size. In the last case it will // or a pointer to an ndarray of the right size. In the last case it will
// not change. // not change.
// If fortran is non-zero, Fortran (column-major) order is expected/created; otherwise C order.
DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd, DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims); const int * dims, int fortran = 0);
DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus err){ DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus err){
if(CUBLAS_STATUS_SUCCESS == err) if(CUBLAS_STATUS_SUCCESS == err)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论