Merge pull request #1788 from nouiz/cuda_fortran

Cuda fortran order

Merge pull request #1788 from nouiz/cuda_fortran
7194cb2b · abergeron · 19c0dbcb · 757a2c5d · 7194cb2b · 7194cb2b
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -5093,7 +5093,7 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)


 int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
-        const int * dims)
+                            const int * dims, int fortran)
 {
    bool allocated = false;
    if (*arr == NULL)
@@ -5105,7 +5105,7 @@ int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
        allocated = true;
    }

-    if (CudaNdarray_alloc_contiguous(*arr, nd, dims))
+    if (CudaNdarray_alloc_contiguous(*arr, nd, dims, fortran))
    {
        if (allocated)
        {

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -160,6 +160,12 @@ CudaNdarray_CheckExact(const PyObject * ob);
 DllExport bool
 CudaNdarray_is_c_contiguous(const CudaNdarray * self);

+/**
+ * Return true for an F-contiguous CudaNdarray, else false
+ */
+DllExport bool
+CudaNdarray_is_f_contiguous(const CudaNdarray * self);
+
 /****
 * Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions.
 */
@@ -326,10 +332,13 @@ CudaNdarray_set_nd(CudaNdarray * self, const int nd)
 * Allocate storage space for a tensor of rank 'nd' and given dimensions.
 * (No-op if self already has a contiguous tensor of the right dimensions)
 *
+ * If fortran is non-zero, the strides are set in Fortran order; otherwise they are set in C order.
+ *
 * Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions
 */
 template<typename inttype>
-static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype * dim)
+static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
+                                        const inttype * dim, int fortran=0)
 {
    // allocate an empty ndarray with c_contiguous access
    // return 0 on success
@@ -342,12 +351,24 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
    {
        return -1;
    }
+    if (fortran)
+    {
+        for (int i = 0; i < nd; i++)
+        {
+            CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
+            CudaNdarray_set_dim(self, i, dim[i]);
+            size = size * dim[i];
+        }
+    }
+    else
+    {
        for (int i = nd-1; i >= 0; --i)
        {
            CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
            CudaNdarray_set_dim(self, i, dim[i]);
            size = size * dim[i];
        }
+    }

    // If the allocated buffer is already of the right size, we don't need to
    // do anything else.
@@ -497,6 +518,27 @@ CudaNdarray_is_c_contiguous(const CudaNdarray * self)
    return c_contiguous;
 }

+/**
+ * True iff the host strides look like [1, dim[0], dim[0]*dim[1], ...]; the stride of a dimension of size 1 is not checked.
+ */
+DllExport inline bool ALWAYS_INLINE
+CudaNdarray_is_f_contiguous(const CudaNdarray * self)
+{
+    bool f_contiguous = true;
+    int size = 1;  // stride expected for the next checked dimension
+    for (int i = 0; (i < self->nd) && f_contiguous; i++)  // stops at the first mismatch
+    {
+        if (CudaNdarray_HOST_DIMS(self)[i] == 1)
+            continue;  // size-1 dimension: any stride is accepted
+        if (CudaNdarray_HOST_STRIDES(self)[i] != size)
+        {
+            f_contiguous = false;
+        }
+        size = size * CudaNdarray_HOST_DIMS(self)[i];  // running product of the dimensions seen so far
+    }
+    return f_contiguous;
+}
+
 DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);

 DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
@@ -525,8 +567,9 @@ DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_othe
 // *arr may initially be NULL, a pointer to an ndarray of the wrong size,
 // or a pointer to an ndarray of the right size. In the last case it will
 // not change.
+// If fortran is non-zero, a fortran order is expected/created
 DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
-        const int * dims);
+                                      const int * dims, int fortran = 0);

 DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus err){
    if(CUBLAS_STATUS_SUCCESS == err)