Fix GpuGemv with negative strides

2c447bb9 · Pascal Lamblin · 554a55d9 · 2c447bb9 · 2c447bb9
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -305,7 +305,7 @@ class GpuGemv(Op):
        return Apply(self, [z, a, x, y, b], [z.type()])

    def c_code_cache_version(self):
-        return (1,)
+        return (2,)

    def c_code(self, node, name, inputs, outputs, sub):
        #z_out = alpha * dot(x,y) + beta * z_in
@@ -313,44 +313,46 @@ class GpuGemv(Op):
        #not inplace version, we copy z_in to z_out.
        z_in, a, x, y, b = inputs
        z_out, = outputs
+        inplace = int(self.inplace)
        fail = sub['fail']
        sio = StringIO.StringIO()

        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
        float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0];
-        """
-        if self.inplace:
-            print >> sio, """
+
+        if (%(inplace)s
+            && ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] > 0)
+                || ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 0)
+                    && (CudaNdarray_HOST_DIMS(%(z_in)s)[0] == 1))))
+        {
+            // Work inplace on the input
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
-            """
-        else:
-            print >> sio, """
-            if (!%(z_out)s
-                || (%(z_out)s->nd != 1)
-                || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
-                )
+        }
+        else if (%(z_out)s
+                && ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] > 0)
+                    || ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 0)
+                        && (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == 1))))
+        {
+            // Work on the output
+            if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
            {
-                Py_XDECREF(%(z_out)s);
-                %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
-                if (!%(z_out)s)
-                {
-                    %(fail)s;
-                }
+                %(fail)s;
            }
-            else
+        }
+        else
+        {
+            // Copy
+            Py_XDECREF(%(z_out)s);
+            %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
+            if (!%(z_out)s)
            {
-                if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
-                {
-                    %(fail)s;
-                }
+                %(fail)s;
            }
-            """
-
+        }

-        print >> sio, """
        if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s))
        {
            %(fail)s;

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -3029,8 +3029,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
    char N = 'N';
    char T = 'T';
    //std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n';
-    //TODO: recognize the negative stride and make a copy of the offending argument,
-    //rather than aborting
+    // There should be no negative stride at that point
 #define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
    if (sx == 0){sx = 1;}\
    if (sy == 0){sy = 1;}\
@@ -3038,7 +3037,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
    if ((sx > 0) && (sy > 0) && (sz > 0)) { \
        cublasSgemm(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz); \
    } else { \
-        PyErr_SetString(PyExc_NotImplementedError, "negative stride to sGemm");\
+        PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
        Py_XDECREF(A);\
        Py_XDECREF(B);\
        return -1; \
@@ -3093,22 +3092,53 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
        return -1;
    }

-    // a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place
-    // TODO: make a copy instead of returning in error
-    if (((CudaNdarray_HOST_DIMS(A)[0] > 1) && (CudaNdarray_HOST_STRIDES(A)[0] != 1)) && ((CudaNdarray_HOST_DIMS(A)[1] > 1) && (CudaNdarray_HOST_STRIDES(A)[1] != 1)))
-    { PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemv arg"); return -1; }
+    // If matrix A has non-unit size and non-unit stride in both
+    // dimensions, or negative strides, we cannot operate, but we can
+    // make a copy.
+    if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
+         && (CudaNdarray_HOST_STRIDES(A)[0] != 1)
+         && (CudaNdarray_HOST_DIMS(A)[1] > 1)
+         && (CudaNdarray_HOST_STRIDES(A)[1] != 1))
+        || (CudaNdarray_HOST_STRIDES(A)[0] < 0)
+        || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
+    {
+        const CudaNdarray* A_new = (CudaNdarray*) CudaNdarray_Copy(A);
+        if (!A_new)
+            return -1;
+        A = A_new;
+    }
+    else
+    {
+        // Incref A, so we can decref it at the end in all cases
+        Py_INCREF(A);
+    }
+
+    // If vector B as a negative stride, we also have to make a copy.
+    if (CudaNdarray_HOST_STRIDES(B)[0] < 0)
+    {
+        const CudaNdarray* B_new = (CudaNdarray*) CudaNdarray_Copy(B);
+        if (!B_new)
+        {
+            Py_XDECREF(A);
+            return -1;
+        }
+        B = B_new;
+    }
+    else
+    {
+        // Incref B, so we can decref it at the end in all cases
+        Py_INCREF(B);
+    }

    // I don't know if cudablas handles negative strides
    if (   (CudaNdarray_HOST_STRIDES(A)[0] < 0)
-        || (CudaNdarray_HOST_STRIDES(A)[1] < 0)
-        || (CudaNdarray_HOST_STRIDES(B)[0] < 0)
-        || (CudaNdarray_HOST_STRIDES(C)[0] < 0))
+        || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
    {
-        PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)x(%i)->(%i)",
+        PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)",
                CudaNdarray_HOST_STRIDES(A)[0],
-                CudaNdarray_HOST_STRIDES(A)[1],
-                CudaNdarray_HOST_STRIDES(B)[0],
-                CudaNdarray_HOST_STRIDES(C)[0]);
+                CudaNdarray_HOST_STRIDES(A)[1]);
+        Py_XDECREF(A);
+        Py_XDECREF(B);
        return -1;
    }

@@ -3120,32 +3150,46 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
    int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1;
    int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1;

-    if (sa_0 == 1)
-    {
-        cublasSgemv('N',
-                CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
-                alpha,
-                CudaNdarray_DEV_DATA(A), sa_1,
-                CudaNdarray_DEV_DATA(B), sb_0,
-                beta,
-                CudaNdarray_DEV_DATA(C), sc_0);
-    }
-    else if (sa_1 == 1)
-    {
-        cublasSgemv('T',
-                CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
-                alpha,
-                CudaNdarray_DEV_DATA(A), sa_0,
-                CudaNdarray_DEV_DATA(B), sb_0,
-                beta,
-                CudaNdarray_DEV_DATA(C), sc_0);
-    }
-    else
-    {
-        PyErr_SetString(PyExc_NotImplementedError, "too many strides strides in sgemv");
-        return -1;
+    if (CudaNdarray_SIZE(C)) {
+        if ((sa_0 == 1) || (sa_0 == 0))
+        {
+            cublasSgemv('N',
+                    CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
+                    alpha,
+                    CudaNdarray_DEV_DATA(A), sa_1,
+                    CudaNdarray_DEV_DATA(B), sb_0,
+                    beta,
+                    CudaNdarray_DEV_DATA(C), sc_0);
+        }
+        else if ((sa_1 == 1) || (sa_1 == 0))
+        {
+            cublasSgemv('T',
+                    CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
+                    alpha,
+                    CudaNdarray_DEV_DATA(A), sa_0,
+                    CudaNdarray_DEV_DATA(B), sb_0,
+                    beta,
+                    CudaNdarray_DEV_DATA(C), sc_0);
+        }
+        else
+        {
+            PyErr_Format(PyExc_AssertionError,
+                         "Unexpected stride pattern in gemv: (%i, %i) x %i -> %i",
+                         sa_0, sa_1, sb_0, sc_0);
+            printf("shapes are: (%i, %i) x %i -> %i\n",
+                   CudaNdarray_HOST_DIMS(A)[0],
+                   CudaNdarray_HOST_DIMS(A)[1],
+                   CudaNdarray_HOST_DIMS(B)[0],
+                   CudaNdarray_HOST_DIMS(C)[0]);
+            Py_XDECREF(A);
+            Py_XDECREF(B);
+            return -1;
+        }
    }
+
    CNDA_THREAD_SYNC;
+    Py_XDECREF(A);
+    Py_XDECREF(B);
    cudaError_t err = cudaGetLastError();
    if (CUBLAS_STATUS_SUCCESS != err)
    {