提交 1f6b0c73 authored 作者: nouiz's avatar nouiz

Merge pull request #437 from lamblin/blas_double_strides

Make blas functions work with all stride patterns
......@@ -192,7 +192,7 @@ class GpuGemm(GpuOp):
return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self):
return (3,)
return (4,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
......@@ -200,6 +200,7 @@ class GpuGemm(GpuOp):
#not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs
z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail']
sio = StringIO.StringIO()
......@@ -215,39 +216,50 @@ class GpuGemm(GpuOp):
: (REAL)(((double*)%(b)s->data)[0]);
#undef REAL
"""
if self.inplace:
print >> sio, """
if (%(inplace)s
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
"""
else:
print >> sio, """
if (!%(z_out)s
|| (%(z_out)s->nd != 2)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
)
}
else if (%(z_out)s
&& (%(z_out)s->nd == 2)
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
== CudaNdarray_HOST_DIMS(%(z_in)s)[0])
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
== CudaNdarray_HOST_DIMS(%(z_in)s)[1])
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
{
// The existing output has an appropriate layout,
// copy the input data into it, then work inplace
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
%(fail)s;
}
else
}
else
{
// Copy the input, use the copy as output
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
%(fail)s;
}
%(fail)s;
}
"""
}
print >> sio, """
if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
{
%(fail)s;
......@@ -294,7 +306,7 @@ class GpuGemv(GpuOp):
return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self):
return (1,)
return (2,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
......@@ -302,44 +314,46 @@ class GpuGemv(GpuOp):
#not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs
z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail']
sio = StringIO.StringIO()
print >> sio, """
float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0];
"""
if self.inplace:
print >> sio, """
if (%(inplace)s
&& ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] > 0)
|| ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 0)
&& (CudaNdarray_HOST_DIMS(%(z_in)s)[0] == 1))))
{
// Work inplace on the input
Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
"""
else:
print >> sio, """
if (!%(z_out)s
|| (%(z_out)s->nd != 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
)
}
else if (%(z_out)s
&& ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] > 0)
|| ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 0)
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == 1))))
{
// Work on the output
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
%(fail)s;
}
else
}
else
{
// Copy
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
%(fail)s;
}
%(fail)s;
}
"""
}
print >> sio, """
if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s))
{
%(fail)s;
......@@ -385,7 +399,7 @@ class GpuGer(GpuOp):
return Apply(self, [z, a, x, y], [z.type()])
def c_code_cache_version(self):
return (1,)
return (2,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
......@@ -393,44 +407,57 @@ class GpuGer(GpuOp):
#not inplace version, we copy z_in to z_out.
z_in, a, x, y = inputs
z_out, = outputs
inplace = int(self.inplace)
fail = sub['fail']
sio = StringIO.StringIO()
print >> sio, """
float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
"""
if self.inplace:
print >> sio, """
if (%(inplace)s
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
{
// The input has an appropriate layout, we work inplace
Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
"""
else:
print >> sio, """
if (!%(z_out)s
|| (%(z_out)s->nd != 2)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
)
}
else if (%(z_out)s
&& (%(z_out)s->nd == 2)
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
== CudaNdarray_HOST_DIMS(%(z_in)s)[0])
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
== CudaNdarray_HOST_DIMS(%(z_in)s)[1])
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
&& (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
&& ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
|| (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
{
// The existing output has an appropriate layout,
// copy the input data into it, then work inplace
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
%(fail)s;
}
else
}
else
{
// Copy the input, use the copy as output
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
%(fail)s;
}
%(fail)s;
}
"""
}
print >> sio, """
if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s))
{
%(fail)s;
......
......@@ -453,7 +453,7 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape)
PyObject * CudaNdarray_Copy(CudaNdarray * self)
PyObject * CudaNdarray_Copy(const CudaNdarray * self)
{
PyObject * rval = CudaNdarray_New();
if ((!rval) || (-1 == self->nd))
......@@ -1095,7 +1095,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
k3<<<n_blocks, n_threads>>>(
1, //d0
1, //d1
1, //d2
1, //d2
CudaNdarray_DEV_DATA(self),
1, //strides
1,
......@@ -2777,7 +2777,7 @@ static __global__ void k_copy_1d(const int N, const float * x, const int sx, flo
}
//copy from other into self
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, bool unbroadcast)
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast)
{
int verbose = 0;
if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n");
......@@ -2856,7 +2856,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
// call worker routine
unsigned int n_blocks = std::min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
unsigned int threads_per_block = std::min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
CudaNdarray * cuda_dims = other;
const CudaNdarray * cuda_dims = other;
if(unbroadcast)
cuda_dims = self;
//copy from other into self
......@@ -2882,9 +2882,21 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
{
if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; }
if (B->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; }
if (C->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemm"); return -1; }
if (A->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "non-matrix arg A to gemm");
return -1;
}
if (B->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "non-matrix arg B to gemm");
return -1;
}
if (C->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "non-matrix arg C to gemm");
return -1;
}
// We must allow dimensions to be zeros.
if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
......@@ -2901,14 +2913,72 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return -1;
}
// a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place
// TODO: make a copy instead of returning in error
if (((CudaNdarray_HOST_DIMS(A)[0] > 1) && (CudaNdarray_HOST_STRIDES(A)[0] != 1)) && ((CudaNdarray_HOST_DIMS(A)[1] > 1) && (CudaNdarray_HOST_STRIDES(A)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemm arg"); return -1; }
if (((CudaNdarray_HOST_DIMS(B)[0] > 1) && (CudaNdarray_HOST_STRIDES(B)[0] != 1)) && ((CudaNdarray_HOST_DIMS(B)[1] > 1) && (CudaNdarray_HOST_STRIDES(B)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemm arg"); return -1; }
if (((CudaNdarray_HOST_DIMS(C)[0] > 1) && (CudaNdarray_HOST_STRIDES(C)[0] != 1)) && ((CudaNdarray_HOST_DIMS(C)[1] > 1) && (CudaNdarray_HOST_STRIDES(C)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemm arg"); return -1; }
// If matrix A or B has non-unit size and non-unit stride in both
// dimensions, we can make a copy.
if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] != 1)
&& (CudaNdarray_HOST_DIMS(A)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0))
{
const CudaNdarray* A_new = (CudaNdarray*) CudaNdarray_Copy(A);
if (!A_new)
return -1;
A = A_new;
}
else
{
// In the case above, we will need to decref A_new at the end.
// To make things simpler, we incref A here, so we can always
// decref A.
Py_INCREF(A);
}
if (((CudaNdarray_HOST_DIMS(B)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(B)[0] != 1)
&& (CudaNdarray_HOST_DIMS(B)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(B)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[1] < 0))
{
const CudaNdarray* B_new = (CudaNdarray*) CudaNdarray_Copy(B);
if (!B_new)
{
Py_XDECREF(A);
return -1;
}
B = B_new;
}
else
{
// In the case above, we will need to decref B_new at the end.
// To make things simpler, we incref B here, so we can always
// decref B.
Py_INCREF(B);
}
// If matrix C has non-unit size and non-unit stride in both
// dimensions, or negative strides, we can't operate. We cannot copy
// C either, because the calling code will expect the result to be
// in the original C container.
if (((CudaNdarray_HOST_DIMS(C)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(C)[0] != 1)
&& (CudaNdarray_HOST_DIMS(C)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(C)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[1] < 0))
{
PyErr_Format(PyExc_AssertionError,
"non-unit or negative stride in gemm arg C (%i,%i) of shape (%i,%i)",
CudaNdarray_HOST_STRIDES(C)[0],
CudaNdarray_HOST_STRIDES(C)[1],
CudaNdarray_HOST_DIMS(C)[0],
CudaNdarray_HOST_DIMS(C)[1]);
Py_XDECREF(A);
Py_XDECREF(B);
return -1;
}
// the unit integer is divided logically into three fields of 4 bits
// the lowermost 4 bits encode the stride pattern of the output
......@@ -2920,47 +2990,31 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
// a stride of 0 implies a dimension of 1 - so we can actually define
// a stride of 0 as a 'unit' stride because gemm will never use it.
// If a dimension is 0, its stride will not be used either, so we can
// consider it a 'unit' stride too.
int unit = 0;
if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_STRIDES(A)[1] == 0) {
if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_DIMS(A)[1] <= 1) {
unit |= (0x0 << 8);
} else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_STRIDES(A)[0] == 0) {
} else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_DIMS(A)[0] <= 1) {
unit |= (0x1 << 8);
} else {
unit |= (0x2 << 8);
}
if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_STRIDES(B)[1] == 0) {
if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_DIMS(B)[1] <= 1) {
unit |= (0x0 << 4);
} else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_STRIDES(B)[0] == 0) {
} else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_DIMS(B)[0] <= 1) {
unit |= (0x1 << 4);
} else {
unit |= (0x2 << 4);
}
if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_STRIDES(C)[1] == 0) {
if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_DIMS(C)[1] <= 1) {
unit |= (0x0 << 0);
} else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_STRIDES(C)[0] == 0) {
} else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_DIMS(C)[0] <= 1) {
unit |= (0x1 << 0);
} else {
unit |= (0x2 << 0);
}
// I don't know if cudablas handles negative strides
if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[1] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[1] < 0))
{
PyErr_Format(PyExc_ValueError, "illegal strides in args to gemm (%i,%i)x(%i,%i)->(%i,%i)",
CudaNdarray_HOST_STRIDES(A)[0],
CudaNdarray_HOST_STRIDES(A)[1],
CudaNdarray_HOST_STRIDES(B)[0],
CudaNdarray_HOST_STRIDES(B)[1],
CudaNdarray_HOST_STRIDES(C)[0],
CudaNdarray_HOST_STRIDES(C)[1]);
return -1;
}
/* create appropriate strides for malformed matrices that are row or column
* vectors
*/
......@@ -2977,8 +3031,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
char N = 'N';
char T = 'T';
//std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n';
//TODO: recognize the negative stride and make a copy of the offending argument,
//rather than aborting
// There should be no negative stride at that point
#define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
if (sx == 0){sx = 1;}\
if (sy == 0){sy = 1;}\
......@@ -2986,7 +3039,9 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
if ((sx > 0) && (sy > 0) && (sz > 0)) { \
cublasSgemm(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz); \
} else { \
PyErr_SetString(PyExc_NotImplementedError, "negative stride to sGemm");\
PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
Py_XDECREF(A);\
Py_XDECREF(B);\
return -1; \
}
......@@ -3000,14 +3055,19 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
case 0x101: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_0, beta, c, sc_1); break;
case 0x011: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_1, beta, c, sc_1); break;
case 0x111: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_1, beta, c, sc_1); break;
default: PyErr_Format(PyExc_ValueError, "some matrix has no unit stride (unit=%i)", unit);
default: PyErr_Format(PyExc_ValueError, "some matrix has no unit stride (unit=%x)", unit);
return -1;
};
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
Py_XDECREF(A);
Py_XDECREF(B);
cublasStatus err = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError, "cublassGemm failed (%s)",cudaGetErrorString(err));
PyErr_Format(PyExc_RuntimeError,
"cublasSgemm failed (%i)",
err);
return -1;
}
return 0;
......@@ -3037,22 +3097,53 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return -1;
}
// a matrix has non-unit size and non-unit stride in both directions, we can't operate in-place
// TODO: make a copy instead of returning in error
if (((CudaNdarray_HOST_DIMS(A)[0] > 1) && (CudaNdarray_HOST_STRIDES(A)[0] != 1)) && ((CudaNdarray_HOST_DIMS(A)[1] > 1) && (CudaNdarray_HOST_STRIDES(A)[1] != 1)))
{ PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemv arg"); return -1; }
// If matrix A has non-unit size and non-unit stride in both
// dimensions, or negative strides, we cannot operate, but we can
// make a copy.
if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] != 1)
&& (CudaNdarray_HOST_DIMS(A)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] != 1))
|| (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0))
{
const CudaNdarray* A_new = (CudaNdarray*) CudaNdarray_Copy(A);
if (!A_new)
return -1;
A = A_new;
}
else
{
// Incref A, so we can decref it at the end in all cases
Py_INCREF(A);
}
// If vector B as a negative stride, we also have to make a copy.
if (CudaNdarray_HOST_STRIDES(B)[0] < 0)
{
const CudaNdarray* B_new = (CudaNdarray*) CudaNdarray_Copy(B);
if (!B_new)
{
Py_XDECREF(A);
return -1;
}
B = B_new;
}
else
{
// Incref B, so we can decref it at the end in all cases
Py_INCREF(B);
}
// I don't know if cudablas handles negative strides
// cudablas does not handle negative strides as expected
if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0)
|| (CudaNdarray_HOST_STRIDES(B)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(C)[0] < 0))
|| (CudaNdarray_HOST_STRIDES(A)[1] < 0))
{
PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)x(%i)->(%i)",
PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)",
CudaNdarray_HOST_STRIDES(A)[0],
CudaNdarray_HOST_STRIDES(A)[1],
CudaNdarray_HOST_STRIDES(B)[0],
CudaNdarray_HOST_STRIDES(C)[0]);
CudaNdarray_HOST_STRIDES(A)[1]);
Py_XDECREF(A);
Py_XDECREF(B);
return -1;
}
......@@ -3064,42 +3155,71 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1;
int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1;
if (sa_0 == 1)
{
cublasSgemv('N',
CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
alpha,
CudaNdarray_DEV_DATA(A), sa_1,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else if (sa_1 == 1)
{
cublasSgemv('T',
CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
alpha,
CudaNdarray_DEV_DATA(A), sa_0,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else
{
PyErr_SetString(PyExc_NotImplementedError, "too many strides strides in sgemv");
return -1;
if (sa_0 == 0)
sa_0 = 1;
if (sa_1 == 0)
sa_1 = 1;
if (CudaNdarray_SIZE(C)) {
if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
{
cublasSgemv('N',
CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
alpha,
CudaNdarray_DEV_DATA(A), sa_1,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
{
cublasSgemv('T',
CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
alpha,
CudaNdarray_DEV_DATA(A), sa_0,
CudaNdarray_DEV_DATA(B), sb_0,
beta,
CudaNdarray_DEV_DATA(C), sc_0);
}
else
{
PyErr_Format(PyExc_AssertionError,
"Unexpected stride pattern in gemv: (%i, %i) x %i -> %i.\n"
"Shapes are: (%i, %i) x %i -> %i\n",
CudaNdarray_HOST_STRIDES(A)[0],
CudaNdarray_HOST_STRIDES(A)[1],
CudaNdarray_HOST_STRIDES(B)[0],
CudaNdarray_HOST_STRIDES(C)[0],
CudaNdarray_HOST_DIMS(A)[0],
CudaNdarray_HOST_DIMS(A)[1],
CudaNdarray_HOST_DIMS(B)[0],
CudaNdarray_HOST_DIMS(C)[0]);
Py_XDECREF(A);
Py_XDECREF(B);
return -1;
}
}
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
Py_XDECREF(A);
Py_XDECREF(B);
cublasStatus err = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError, "cublassGemv failed (%s)",cudaGetErrorString(err));
PyErr_Format(PyExc_RuntimeError,
"cublasSgemv failed (%i)",
err);
return -1;
}
return 0;
}
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) {
int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray * A) {
if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }
......@@ -3115,14 +3235,8 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
return -1;
}
// Maybe this could work, but be safe for now
if (!CudaNdarray_is_c_contiguous(A)) {
PyErr_SetString(PyExc_NotImplementedError, "non-c continugous A in sger");
return -1;
}
// Since Sger expects A in col-major, we invert x and y to fake this.
int x_strides = CudaNdarray_HOST_STRIDES(x)[0];
CudaNdarray * x_ = x;
const CudaNdarray * x_ = x;
if(x_strides == 0){
if(CudaNdarray_HOST_DIMS(x)[0] != 1){
PyErr_Format(PyExc_RuntimeError,
......@@ -3131,14 +3245,14 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
" that have more then 1 elements!");
return -1;
}
x_strides = 4;
x_strides = 1;
} else if(x_strides < 0){
x_ = (CudaNdarray*)CudaNdarray_Copy(x);
x_strides = CudaNdarray_HOST_STRIDES(x_)[0];
}
int y_strides = CudaNdarray_HOST_STRIDES(y)[0];
CudaNdarray * y_ = y;
const CudaNdarray * y_ = y;
if(y_strides == 0){
if(CudaNdarray_HOST_DIMS(y)[0] != 1){
PyErr_Format(PyExc_RuntimeError,
......@@ -3147,17 +3261,46 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
" that have more then 1 elements!");
return -1;
}
y_strides = 4;
y_strides = 1;
} else if(y_strides < 0){
y_ = (CudaNdarray*)CudaNdarray_Copy(y);
y_strides = CudaNdarray_HOST_STRIDES(y_)[0];
}
// Create appropriate strides if A is a row or column vector
int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0]
: CudaNdarray_HOST_DIMS(A)[1];
int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1]
: CudaNdarray_HOST_DIMS(A)[0];
if(CudaNdarray_SIZE(A)){
cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
CudaNdarray_DEV_DATA(y_), y_strides,
CudaNdarray_DEV_DATA(x_), x_strides,
CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]);
// If A is in col-major
if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
{
cublasSger(CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], alpha,
CudaNdarray_DEV_DATA(x_), x_strides,
CudaNdarray_DEV_DATA(y_), y_strides,
CudaNdarray_DEV_DATA(A), sa_1);
}
// Since Sger expects A in col-major, we invert x and y to fake this.
else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
|| ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
&& (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
{
cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
CudaNdarray_DEV_DATA(y_), y_strides,
CudaNdarray_DEV_DATA(x_), x_strides,
CudaNdarray_DEV_DATA(A), sa_0);
}
// A has to be either c- or f-contiguous, with no negative strides
else
{
PyErr_SetString(PyExc_NotImplementedError,
"non-contiguous A, or negative strides, in sger");
return -1;
}
}
CNDA_THREAD_SYNC;
if(x_ != x)
......@@ -3165,12 +3308,15 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
if(y_ != y)
Py_DECREF(y_);
cudaError_t err = cudaGetLastError();
cublasStatus err = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError, "cublasSger failed (%s)",cudaGetErrorString(err));
PyErr_Format(PyExc_RuntimeError,
"cublasSger failed (%i)",
err);
return -1;
}
return 0;
}
......@@ -3816,7 +3962,7 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
int
cnda_copy_structure_to_device(CudaNdarray * self)
cnda_copy_structure_to_device(const CudaNdarray * self)
{
cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
CNDA_THREAD_SYNC;
......@@ -3830,7 +3976,7 @@ cnda_copy_structure_to_device(CudaNdarray * self)
}
const int *
CudaNdarray_DEV_DIMS(CudaNdarray * self)
CudaNdarray_DEV_DIMS(const CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
......@@ -3840,7 +3986,7 @@ CudaNdarray_DEV_DIMS(CudaNdarray * self)
return self->dev_structure;
}
const int *
CudaNdarray_DEV_STRIDES(CudaNdarray * self)
CudaNdarray_DEV_STRIDES(const CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
......@@ -3850,7 +3996,7 @@ CudaNdarray_DEV_STRIDES(CudaNdarray * self)
return self->dev_structure + self->nd;
}
const int *
CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
......@@ -3991,4 +4137,4 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=79 :
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
......@@ -81,7 +81,7 @@ struct CudaNdarray
//device pointers (allocated by cudaMalloc)
int dev_structure_fresh;
mutable int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be synchronized
int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0].
......@@ -154,11 +154,11 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s);
*
* This means: recalculate the log2dims and transfer structure to the card
*/
DllExport int cnda_copy_structure_to_device(CudaNdarray * self);
DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_DIMS(CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self);
/**
......@@ -229,13 +229,22 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
return -1;
}
assert(size>0);
if (size < 0)
{
PyErr_Format(PyExc_AssertionError,
"size (%i) < 0",
size);
return -1;
}
self->devdata = (float*)device_malloc(size*sizeof(real));
if (!self->devdata)
if (size && !self->devdata)
{
CudaNdarray_set_nd(self,-1);
CudaNdarray_set_nd(self, -1);
self->data_allocated = 0;
self->devdata = 0;
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate memory on device");
return -1;
}
if (0)
......@@ -283,7 +292,7 @@ DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo);
/**
* Return an independent copy of self
*/
DllExport PyObject * CudaNdarray_Copy(CudaNdarray * self);
DllExport PyObject * CudaNdarray_Copy(const CudaNdarray * self);
/**
* Return a new object obtained by summing over the dimensions for which there is a 1 in the mask.
......@@ -302,7 +311,7 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
*
* self is reallocated to have the correct dimensions if necessary.
*/
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, bool unbroadcast = false);
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast = false);
/**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
......@@ -321,7 +330,7 @@ DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
......@@ -343,4 +352,4 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=79 :
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
import itertools
from unittest import TestCase
from theano.compile.pfunc import pfunc
......@@ -17,7 +18,7 @@ import theano.sandbox.cuda as tcn
from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad
import theano.compile.mode
from theano.tensor.tests.test_blas import BaseGemv, TestGer
from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
......@@ -32,20 +33,31 @@ else:
def my_rand(*shape):
return theano._asarray(numpy.random.rand(*shape),dtype='float32')
def transpose(cuda_mat):
# The easiest way to transpose a cuda matrix for now
return tcn.dimshuffle(cuda_mat, [1, 0])
def test_dot22():
def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a')
a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, tensor.dot(a,b))], mode=mode_with_gpu)
a0 = a.get_value() * 1.0
bval = my_rand(*b_shp)
f(bval)
assert numpy.allclose(numpy.dot(a0, bval), a.get_value())
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval)
cmp((3,4),(4,5))
cmp((0,4),(4,5))
cmp((3,4),(4,0))
......@@ -90,7 +102,8 @@ def test_dot22scalar():
def test_gemm():
def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a')
a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
b = tensor.fmatrix('b')
c = tensor.fmatrix('c')
......@@ -98,12 +111,19 @@ def test_gemm():
f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu)
assert any([node.op == tcn.blas.gpu_gemm_inplace for node in f.maker.env.toposort()])
a0 = a.get_value() * 1.0
bval = my_rand(*b_shp)
cval = my_rand(a_shp[0],b_shp[1])
f(bval,cval)
assert numpy.allclose(numpy.dot(a0, bval)+numpy.exp(cval), a.get_value())
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval, cval)
cmp((3,4),(4,5))
cmp((0,4),(4,5))
cmp((3,4),(4,0))
......@@ -114,7 +134,8 @@ def test_gemm():
def test_gemm_no_inplace():
def cmp(a_shp, b_shp):
a = tcn.shared_constructor(my_rand(*a_shp), 'a')
a0 = my_rand(*a_shp)
a = tcn.shared_constructor(a0, 'a')
cval = my_rand(a_shp[0], b_shp[1])
c = tcn.shared_constructor(cval.copy(), 'c')
......@@ -123,7 +144,6 @@ def test_gemm_no_inplace():
f = pfunc([b,b2], [tensor.dot(a,b2) + c], updates=[(a, tensor.dot(a,b) + c)], mode=mode_with_gpu)
a0 = a.get_value() * 1.0
assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()])
bval = my_rand(*b_shp)
bval2 = my_rand(*b_shp)
......@@ -132,6 +152,13 @@ def test_gemm_no_inplace():
assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value())
assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval)
# Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0)
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval, bval2)
cmp((3,4),(4,5))
cmp((0,4),(4,5))
cmp((3,4),(4,0))
......@@ -139,6 +166,13 @@ def test_gemm_no_inplace():
cmp((0,4),(4,0))
cmp((0,0),(0,0))
class TestBlasStridesGpu(TestBlasStrides):
    """Run the stride-pattern BLAS tests on the GPU backend.

    Reuses every test from TestBlasStrides, only swapping in the GPU
    shared constructor, the GPU compilation mode, and float32 (the only
    dtype supported on the GPU).
    """
    mode = mode_with_gpu
    shared = staticmethod(tcn.shared_constructor)
    dtype = 'float32'
def test_outer():
x = tcn.shared_constructor(my_rand(8,), 'x')
y = tcn.shared_constructor(my_rand(6,), 'y')
......@@ -260,6 +294,23 @@ class TestGpuGemv(TestCase, BaseGemv,
gemv = gpu_gemv_inplace
gemv_inplace = gpu_gemv_inplace
class TestGpuGemvNoTransfer(TestCase, BaseGemv,
                            unittest_tools.TestOptimizationMixin):
    """Gemv tests where the inputs are not necessarily moved to the GPU."""
    mode = mode_with_gpu
    dtype = 'float32'

    @staticmethod
    def shared(val):
        # Mimic shared constructors registry: prefer a GPU shared
        # variable, fall back to a regular one when the value is not
        # supported on the GPU.
        try:
            shared_var = tcn.shared_constructor(val)
        except TypeError:
            shared_var = theano.shared(val)
        return shared_var

    # Since inputs are not always transferred to the GPU in this test,
    # the non-inplace gemv may be selected.
    gemv = gpu_gemv_no_inplace
    gemv_inplace = gpu_gemv_inplace
class TestVectorMatrixDot(TestCase):
### Tolerance factor used in this tests
......@@ -286,6 +337,14 @@ class TestVectorMatrixDot(TestCase):
assert sum([node.op is gpu_gemv_inplace for node in
gpu_f2.maker.env.toposort() ]) == 1
# Check double-strided m
m.set_value(
m.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol)
assert numpy.allclose(no_gpu_f(), gpu_f2(), atol=self.atol)
def test_dot_mv(self):
''' Test matrix dot vector '''
v = theano.shared( numpy.array(numpy.random.rand(2), dtype='float32'))
......@@ -365,6 +424,26 @@ class TestGpuGer(TestGer):
self.ger = gpu_ger_inplace
self.gemm = tcn.blas.gpu_gemm_inplace
class TestGpuGerNoTransfer(TestGer):
    """Ger tests where the inputs are not necessarily moved to the GPU."""

    @staticmethod
    def shared(val):
        # Prefer a GPU shared variable; fall back to a regular shared
        # variable when the value is not supported on the GPU.
        try:
            return tcn.shared_constructor(val)
        except TypeError:
            return theano.shared(val)

    def setUp(self):
        self.mode = mode_with_gpu
        # The optimization isn't dtype-dependent, so float32 is enough.
        self.dtype = 'float32'
        float_dtype = self.dtype
        self.A = tensor.tensor(dtype=float_dtype,
                               broadcastable=(False, False))
        self.a = tensor.tensor(dtype=float_dtype, broadcastable=())
        self.x = tensor.tensor(dtype=float_dtype, broadcastable=(False,))
        self.y = tensor.tensor(dtype=float_dtype, broadcastable=(False,))
        # Data on the GPU makes the op always inplace.
        self.ger = gpu_ger_inplace
        self.ger_destructive = gpu_ger_inplace
        self.gemm = tcn.blas.gpu_gemm_inplace
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
def setUp(self):
......
......@@ -496,7 +496,9 @@ class GemmRelated(Op):
if ((Sx[0] < 1) || (Sx[1] < 1) || (Sx[0] MOD type_size) || (Sx[1] MOD type_size)
|| ((Sx[0] != type_size) && (Sx[1] != type_size)))
{
PyArrayObject * _x_copy = PyArray_GETCONTIGUOUS(%(_x)s);
PyArrayObject * _x_copy = (PyArrayObject *) PyArray_Copy(%(_x)s);
if (!_x_copy)
%(fail)s
Py_XDECREF(%(_x)s);
%(_x)s = _x_copy;
Sx = %(_x)s->strides;
......@@ -505,7 +507,9 @@ class GemmRelated(Op):
if ((Sy[0] < 1) || (Sy[1] < 1) || (Sy[0] MOD type_size) || (Sy[1] MOD type_size)
|| ((Sy[0] != type_size) && (Sy[1] != type_size)))
{
PyArrayObject * _y_copy = PyArray_GETCONTIGUOUS(%(_y)s);
PyArrayObject * _y_copy = (PyArrayObject *) PyArray_Copy(%(_y)s);
if (!_y_copy)
%(fail)s
Py_XDECREF(%(_y)s);
%(_y)s = _y_copy;
Sy = %(_y)s->strides;
......@@ -514,7 +518,9 @@ class GemmRelated(Op):
if ((Sz[0] < 1) || (Sz[1] < 1) || (Sz[0] MOD type_size) || (Sz[1] MOD type_size)
|| ((Sz[0] != type_size) && (Sz[1] != type_size)))
{
PyArrayObject * _z_copy = PyArray_GETCONTIGUOUS(%(_zout)s);
PyArrayObject * _z_copy = (PyArrayObject *) PyArray_Copy(%(_zout)s);
if (!_z_copy)
%(fail)s
Py_XDECREF(%(_zout)s);
%(_zout)s = _z_copy;
Sz = %(_zout)s->strides;
......@@ -649,7 +655,7 @@ class GemmRelated(Op):
self.end_switch_typenum), '')
def build_gemm_version(self):
return (10,)
return (12,)
class Gemm(GemmRelated):
"""In-place version of matrix-matrix multiplication (with accumulation):
......
......@@ -58,9 +58,10 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
// copy A if !self.destructive or A is fully strided
if (!%(destructive)s
|| (%(A)s->strides[0] < 0)
|| (%(A)s->strides[1] < 0)
|| ((%(A)s->strides[0] != elemsize)
&&
(%(A)s->strides[1] != elemsize)))
&& (%(A)s->strides[1] != elemsize)))
{
npy_intp dims[2];
dims[0] = %(A)s->dimensions[0];
......@@ -68,7 +69,11 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
if ((NULL == %(Z)s)
|| (%(Z)s->dimensions[0] != %(A)s->dimensions[0])
|| (%(Z)s->dimensions[1] != %(A)s->dimensions[1]))
|| (%(Z)s->dimensions[1] != %(A)s->dimensions[1])
|| (%(Z)s->strides[0] < 0)
|| (%(Z)s->strides[1] < 0)
|| ((%(Z)s->strides[0] != elemsize)
&& (%(Z)s->strides[1] != elemsize)))
{
if (%(Z)s) Py_XDECREF(%(Z)s);
%(Z)s = (PyArrayObject*)PyArray_SimpleNew(2, dims, PyArray_TYPE(%(A)s));
......@@ -77,7 +82,11 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
%(fail)s
}
}
assert (%(Z)s != %(A)s);
if (%(Z)s == %(A)s)
{
PyErr_SetString(PyExc_AssertionError, "%(Z)s != %(A)s");
%(fail)s
}
if (%(Z)s->descr->type_num == PyArray_FLOAT)
{
float * zoutdata = (float*)%(Z)s->data;
......@@ -142,8 +151,16 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
int Sz0 = (Nz0 > 1) ? (%(Z)s->strides[0] / elemsize) : (Nz1 + 1);
int Sz1 = (Nz1 > 1) ? (%(Z)s->strides[1] / elemsize) : (Nz0 + 1);
if (1)
{
dtype_%(x)s* x_data = (dtype_%(x)s*) %(x)s->data;
dtype_%(y)s* y_data = (dtype_%(y)s*) %(y)s->data;
// gemv expects pointers to the beginning of memory arrays,
// but numpy provides provides a pointer to the first element,
// so when the stride is negative, we need to get the last one.
if (Sx < 0)
x_data += (Nz0 - 1) * Sx;
if (Sy < 0)
y_data += (Nz1 - 1) * Sy;
if (%(Z)s->strides[0] == elemsize)
{
if (%(Z)s->descr->type_num == PyArray_FLOAT)
......@@ -151,19 +168,22 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
//fprintf(stderr, "A\\n");
float alpha = ((dtype_%(a)s*)%(a)s->data)[0];
sger_(&Nz0, &Nz1, &alpha,
(float*)(%(x)s->data), &Sx,
(float*)(%(y)s->data), &Sy,
(float*)x_data, &Sx,
(float*)y_data, &Sy,
(float*)(%(Z)s->data), &Sz1);
}
else if (%(Z)s->descr->type_num == PyArray_DOUBLE)
{
double alpha = ((dtype_%(a)s*)%(a)s->data)[0];
dger_(&Nz0, &Nz1, &alpha,
(double*)(%(x)s->data), &Sx,
(double*)(%(y)s->data), &Sy,
(double*)x_data, &Sx,
(double*)y_data, &Sy,
(double*)(%(Z)s->data), &Sz1);
}
else { assert(0); }
else {
PyErr_SetString(PyExc_NotImplementedError, "not float nor double");
%(fail)s
}
}
else if (%(Z)s->strides[1] == elemsize)
{
......@@ -174,21 +194,30 @@ def ger_c_code(A, a, x, y, Z, destructive, fail):
//fprintf(stderr, "alpha=%%f\\n", alpha);
//fprintf(stderr, "sx sy %%i %%i\\n", Sx, Sy);
sger_(&Nz1, &Nz0, &alpha,
(float*)(%(y)s->data), &Sy,
(float*)(%(x)s->data), &Sx,
(float*)y_data, &Sy,
(float*)x_data, &Sx,
(float*)(%(Z)s->data), &Sz0);
}
else if (%(Z)s->descr->type_num == PyArray_DOUBLE)
{
double alpha = ((dtype_%(a)s*)%(a)s->data)[0];
dger_(&Nz1, &Nz0, &alpha,
(double*)(%(y)s->data), &Sy,
(double*)(%(x)s->data), &Sx,
(double*)y_data, &Sy,
(double*)x_data, &Sx,
(double*)(%(Z)s->data), &Sz0);
}
else { assert(0); }
else
{
PyErr_SetString(PyExc_NotImplementedError, "not float nor double");
%(fail)s
}
}
else { assert(0); }
else
{
PyErr_SetString(PyExc_AssertionError,
"A is a double-strided matrix, and should have been copied "
"into a memory-contiguous one.");
%(fail)s
}
}
......@@ -204,7 +233,7 @@ class CGer(BaseBLAS, Ger):
return code
def c_code_cache_version(self):
return (4,)
return (8,)
@local_optimizer([ger, ger_destructive])
......@@ -282,7 +311,11 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
%(fail)s
}
}
assert (%(zz)s != %(aa)s);
if (%(zz)s == %(aa)s)
{
PyErr_SetString(PyExc_AssertionError, "%(zz)s != %(aa)s");
%(fail)s
}
if (dbeta != 0)
{
if (%(zz)s->descr->type_num == PyArray_FLOAT)
......@@ -330,13 +363,52 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
char NOTRANS = 'N';
int Nx0 = %(xx)s->dimensions[0];
int Nx1 = %(xx)s->dimensions[1];
int Sx0 = %(xx)s->strides[0] / elemsize;
int Sx1 = %(xx)s->strides[1] / elemsize;
/* This formula is needed in the case where xx is actually a row or
* column matrix, because BLAS sometimes insists that the strides:
* - are not smaller than the number of elements in the array
* - are not 0.
*/
int Sx0 = (Nx0 > 1) ? (%(xx)s->strides[0] / elemsize) : (Nx1 + 1);
int Sx1 = (Nx1 > 1) ? (%(xx)s->strides[1] / elemsize) : (Nx0 + 1);
int Sz = %(zz)s->strides[0] / elemsize;
int Sy = %(yy)s->strides[0] / elemsize;
dtype_%(yy)s* yy_data = (dtype_%(yy)s*) %(yy)s->data;
dtype_%(zz)s* zz_data = (dtype_%(zz)s*) %(zz)s->data;
// gemv expects pointers to the beginning of memory arrays,
// but numpy provides a pointer to the first element,
// so when the stride is negative, we need to get the last one.
if (Sy < 0)
yy_data += (Nx1 - 1) * Sy;
if (Sz < 0)
zz_data += (Nx0 - 1) * Sz;
if (Nx0 * Nx1)
{
// If xx is neither C- nor F-contiguous, we make a copy.
// TODO:
// - if one stride is equal to "- elemsize", we can still call
// gemv on reversed matrix and vectors
// - if the copy is too long, maybe call vector/vector dot on
// each row instead
if ((%(xx)s->strides[0] < 0)
|| (%(xx)s->strides[1] < 0)
|| ((%(xx)s->strides[0] != elemsize)
&& (%(xx)s->strides[1] != elemsize)))
{
npy_intp dims[2];
dims[0] = Nx0;
dims[1] = Nx1;
PyArrayObject * xx_copy = (PyArrayObject *) PyArray_Copy(%(xx)s);
if (!xx_copy)
%(fail)s
Py_XDECREF(%(xx)s);
%(xx)s = xx_copy;
Sx0 = (Nx0 > 1) ? (%(xx)s->strides[0] / elemsize) : (Nx1 + 1);
Sx1 = (Nx1 > 1) ? (%(xx)s->strides[1] / elemsize) : (Nx0 + 1);
}
if (%(xx)s->strides[0] == elemsize)
{
if (%(xx)s->descr->type_num == PyArray_FLOAT)
......@@ -346,9 +418,9 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
sgemv_(&NOTRANS, &Nx0, &Nx1,
&alpha,
(float*)(%(xx)s->data), &Sx1,
(float*)(%(yy)s->data), &Sy,
(float*)yy_data, &Sy,
&fbeta,
(float*)(%(zz)s->data), &Sz);
(float*)zz_data, &Sz);
}
else if (%(xx)s->descr->type_num == PyArray_DOUBLE)
{
......@@ -356,13 +428,14 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
dgemv_(&NOTRANS, &Nx0, &Nx1,
&alpha,
(double*)(%(xx)s->data), &Sx1,
(double*)(%(yy)s->data), &Sy,
(double*)yy_data, &Sy,
&dbeta,
(double*)(%(zz)s->data), &Sz);
(double*)zz_data, &Sz);
}
else
{
assert(0);
PyErr_SetString(PyExc_AssertionError, "neither float nor double dtype");
%(fail)s
}
}
else if (%(xx)s->strides[1] == elemsize)
......@@ -376,9 +449,9 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
sgemv_(&TRANS, &Nx1, &Nx0,
&alpha,
(float*)(%(xx)s->data), &Sx0,
(float*)(%(yy)s->data), &Sy,
(float*)yy_data, &Sy,
&fbeta,
(float*)(%(zz)s->data), &Sz);
(float*)zz_data, &Sz);
}
else if (%(xx)s->descr->type_num == PyArray_DOUBLE)
{
......@@ -386,20 +459,22 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
dgemv_(&TRANS, &Nx1, &Nx0,
&alpha,
(double*)(%(xx)s->data), &Sx0,
(double*)(%(yy)s->data), &Sy,
(double*)yy_data, &Sy,
&dbeta,
(double*)(%(zz)s->data), &Sz);
(double*)zz_data, &Sz);
}
else
{
assert(0);
PyErr_SetString(PyExc_AssertionError, "neither float nor double dtype");
%(fail)s
}
}
else
{
// if xx is strided in both directions, then just do the gemv with a
// pair of for loops.
assert (0);
PyErr_SetString(PyExc_AssertionError,
"xx is a double-strided matrix, and should have been copied "
"into a memory-contiguous one.");
%(fail)s
}
}
else if (dbeta != 1.0)
......@@ -429,7 +504,7 @@ class CGemv(BaseBLAS, Gemv):
return code
def c_code_cache_version(self):
return (2,)
return (9,)
@local_optimizer([gemv_inplace, gemv_no_inplace])
......
......@@ -44,7 +44,12 @@ class ScipyGer(Ger):
# N.B. some versions of scipy (e.g. mine) don't actually work
# in-place on a, even when I tell it to.
A = cA[0]
if A.flags['C_CONTIGUOUS']:
if A.size == 0:
# We don't have to do anything, A is empty.
# We need this special case because Numpy considers it
# C-contiguous, which is confusing.
pass
elif A.flags['C_CONTIGUOUS']:
A = local_ger(calpha[0], cy[0], cx[0], a=A.T,
overwrite_a=int(self.destructive)).T
else:
......
#from nose.plugins.skip import SkipTest
#import traceback
import sys
import itertools, sys
import theano.tensor as T
from theano import tensor
#from theano.gof import Env
from theano.printing import pp
......@@ -878,6 +879,11 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
# Assert they produce the same output
assert numpy.allclose(f(), numpy.dot(v.get_value(), m.get_value()))
# Assert it works when m has no contiguous dimension
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
assert numpy.allclose(f(), numpy.dot(v.get_value(), m.get_value()))
def test_dot_mv(self):
......@@ -894,6 +900,11 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
# Assert they produce the same output
assert numpy.allclose(f(), numpy.dot(m.get_value(), v.get_value()))
# Assert it works when m has no contiguous dimension
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
assert numpy.allclose(f(), numpy.dot(m.get_value(), v.get_value()))
@staticmethod
def t_gemv1(m_shp):
......@@ -915,19 +926,30 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
assert topo[0].op.inplace==False
#test the inplace version
f = theano.function([], [], updates={v2:v2+theano.dot(m,v1)}
g = theano.function([], [], updates={v2:v2+theano.dot(m,v1)}
, mode = mode_blas_opt)
# Assert they produce the same output
f()
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
topo = f.maker.env.toposort()
topo = g.maker.env.toposort()
assert len(topo)==1
assert isinstance(topo[0].op, Gemv)
if config.mode != 'FAST_COMPILE':
assert topo[0].op.inplace==True
# Do the same tests with a matrix with strides in both dimensions
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
v2.set_value(v2_orig)
assert numpy.allclose(f(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
def test_gemv1(self):
self.t_gemv1((3,2))
self.t_gemv1((0,2))
......@@ -952,18 +974,29 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
assert topo[-1].op.inplace==False
#test the inplace version
f = theano.function([], [], updates={v2:v2+theano.dot(v1,m)}
g = theano.function([], [], updates={v2:v2+theano.dot(v1,m)}
, mode = mode_blas_opt)
# Assert they produce the same output
f()
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(v1.get_value(), m.get_value()) + v2_orig)
topo = f.maker.env.toposort()
topo = g.maker.env.toposort()
assert sum(isinstance(node.op, Gemv) for node in topo)==1
if config.mode != 'FAST_COMPILE':
assert topo[-1].op.inplace==True
# Do the same tests with a matrix with strides in both dimensions
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
v2.set_value(v2_orig)
assert numpy.allclose(f(),
numpy.dot(v1.get_value(), m.get_value()) + v2.get_value())
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(v1.get_value(), m.get_value()) + v2_orig)
def test_gemv_dimensions(self):
A = T.matrix('A')
x, y = T.vectors('x', 'y')
......@@ -984,6 +1017,7 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
ones_6 = numpy.ones(6, dtype=config.floatX)
f(A_val, ones_3, ones_5)
f(A_val[::-1, ::-1], ones_3, ones_5)
self.assertRaises(ValueError, f, A_val, ones_4, ones_5)
self.assertRaises(ValueError, f, A_val, ones_3, ones_6)
self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
......@@ -1017,6 +1051,7 @@ def matrixmultiply(a, b):
class BaseGemv(object):
mode = mode_blas_opt # can be overridden with self.mode
shared = staticmethod(theano.shared)
def get_data(self,x_stride=1,y_stride=1):
rng = numpy.random.RandomState(unittest_tools.fetch_seed())
......@@ -1031,7 +1066,7 @@ class BaseGemv(object):
return alpha,beta,a,x,y
def test_simple(self):
alpha, beta, a, x, y = [ shared(value) for value in self.get_data() ]
alpha, beta, a, x, y = [ self.shared(value) for value in self.get_data() ]
desired_oy = alpha.get_value() * matrixmultiply(a.get_value(),x.get_value()) + beta.get_value() * y.get_value()
oy = alpha * T.dot(a,x) + beta * y
......@@ -1049,8 +1084,8 @@ class BaseGemv(object):
vs = self.get_data()
alpha_v, beta_v, a_v, x_v, y_v = vs
a = shared(a_v)
x = shared(x_v)
a = self.shared(a_v)
x = self.shared(x_v)
desired_oy = matrixmultiply(a_v, x_v)
......@@ -1067,7 +1102,7 @@ class BaseGemv(object):
def test_simple_transpose(self):
vs = self.get_data()
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ]
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v)+beta_v*y_v
......@@ -1083,7 +1118,7 @@ class BaseGemv(object):
def test_x_stride(self):
vs = self.get_data(x_stride = 2)
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ]
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(a_v,x_v[::2])+beta_v*y_v
......@@ -1099,7 +1134,7 @@ class BaseGemv(object):
def test_x_stride_transpose(self):
vs = self.get_data(x_stride = 2)
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ]
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v[::2])+beta_v*y_v
......@@ -1115,7 +1150,7 @@ class BaseGemv(object):
def test_y_stride(self):
vs = self.get_data(y_stride = 2)
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ]
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(a_v,x_v)+beta_v*y_v[::2]
......@@ -1131,7 +1166,7 @@ class BaseGemv(object):
def test_y_stride_transpose(self):
vs = self.get_data(y_stride = 2)
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ shared(v) for v in vs ]
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v)+beta_v*y_v[::2]
......@@ -1144,6 +1179,46 @@ class BaseGemv(object):
oy_v = oy_func()
assert_array_almost_equal(desired_oy, oy_v)
def test_a_strides(self):
vs = self.get_data()
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
a_v = a_v[::-1, ::-1]
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
desired_oy = alpha_v * matrixmultiply(a_v,x_v)+beta_v*y_v
oy = alpha * T.dot(a,x)+beta*y
oy_func = theano.function([], oy, mode=self.mode)
self.assertFunctionContains1(oy_func, self.gemv)
oy_v = oy_func()
assert_array_almost_equal(desired_oy, oy_v)
def test_a_strides_transpose(self):
vs = self.get_data()
alpha_v, beta_v, a_v, x_v, y_v = vs
alpha, beta, a, x, y = [ self.shared(v) for v in vs ]
a_v = a_v[::-1, ::-1]
a.set_value(
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
desired_oy = alpha_v * matrixmultiply(transpose(a_v),x_v)+beta_v*y_v
oy = alpha * T.dot(a.T,x)+beta*y
oy_func = theano.function([], oy, mode=self.mode)
self.assertFunctionContains1(oy_func, self.gemv)
oy_v = oy_func()
assert_array_almost_equal(desired_oy, oy_v)
def test_upcasting_scalar_nogemv(self):
# Test that the optimization does not crash when the scale has
# an incorrect dtype, and forces upcasting of the result
......@@ -1281,6 +1356,7 @@ class TestGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
shared = staticmethod(theano.shared)
def setUp(self):
self.mode = theano.compile.get_default_mode().including('fast_run')
......@@ -1332,6 +1408,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f(numpy.random.rand(5, 4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
f(numpy.random.rand(5, 4).astype(self.dtype)[::-1, ::-1],
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
def test_A_plus_scaled_outer(self):
f = self.function([self.A, self.x, self.y],
......@@ -1340,6 +1419,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f(numpy.random.rand(5, 4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
f(numpy.random.rand(5, 4).astype(self.dtype)[::-1, ::-1],
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
def test_scaled_A_plus_scaled_outer(self):
f = self.function([self.A, self.x, self.y],
......@@ -1352,6 +1434,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f(numpy.random.rand(5, 4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
f(numpy.random.rand(5, 4).astype(self.dtype)[::-1, ::-1],
numpy.random.rand(5).astype(self.dtype),
numpy.random.rand(4).astype(self.dtype))
def given_dtype(self, dtype, M, N):
""" test corner case shape and dtype"""
......@@ -1362,6 +1447,9 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
f(numpy.random.rand(M, N).astype(self.dtype),
numpy.random.rand(M).astype(self.dtype),
numpy.random.rand(N).astype(self.dtype))
f(numpy.random.rand(M, N).astype(self.dtype)[::-1, ::-1],
numpy.random.rand(M).astype(self.dtype),
numpy.random.rand(N).astype(self.dtype))
def test_f32_0_0(self):
return self.given_dtype('float32', 0, 0)
......@@ -1394,10 +1482,393 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
return self.given_dtype('complex128', 1, 9)
def test_inplace(self):
A = theano.shared(numpy.random.rand(4, 5).astype(self.dtype))
A = self.shared(numpy.random.rand(4, 5).astype(self.dtype))
f = self.function([self.x, self.y], [],
updates={A: A + T.constant(0.1, dtype=self.dtype) *
T.outer(self.x, self.y)})
self.assertFunctionContains(f, self.ger_destructive)
f(numpy.random.rand(4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype))
A.set_value(
A.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
borrow=True)
f(numpy.random.rand(4).astype(self.dtype),
numpy.random.rand(5).astype(self.dtype))
class TestBlasStrides(TestCase):
dtype = 'float64'
shared = staticmethod(tensor._shared)
mode = theano.compile.get_default_mode()
mode = mode.including('fast_run').excluding('gpu', 'c_blas', 'scipy_blas')
rng = numpy.random.RandomState(seed=unittest_tools.fetch_seed())
def rand(self, *shape):
return theano._asarray(self.rng.rand(*shape), dtype=self.dtype)
def cmp_dot22(self, b_shp, c_shp):
av = numpy.zeros((0, 0), dtype=self.dtype)
bv = self.rand(*b_shp)
cv = self.rand(*c_shp)
a = self.shared(av, 'a')
b = self.shared(bv, 'b')
c = self.shared(cv, 'c')
b_t = self.shared(bv.T, 'b.T')
c_t = self.shared(cv.T, 'c.T')
b_dev = b.get_value(borrow=False, return_internal_type=True)
c_dev = c.get_value(borrow=False, return_internal_type=True)
bt_dev = b_t.get_value(borrow=False, return_internal_type=True)
ct_dev = c_t.get_value(borrow=False, return_internal_type=True)
f_nn = theano.function([], [], updates={a: tensor.dot(b, c)},
mode=self.mode)
print 'class name:', self.__class__.__name__
theano.printing.debugprint(f_nn)
f_nt = theano.function([], [], updates={a: tensor.dot(b, c_t.T)},
mode=self.mode)
f_tn = theano.function([], [], updates={a: tensor.dot(b_t.T, c)},
mode=self.mode)
f_tt = theano.function([], [], updates={a: tensor.dot(b_t.T, c_t.T)},
mode=self.mode)
# Try with all stride patterns, and all transposed pattern
for step_signs in itertools.product((-1, 1), repeat=4):
for step in (1, 2):
b_step1, b_step2, c_step1, c_step2 = (s * step
for s in step_signs)
b.set_value(b_dev.copy()[::b_step1, ::b_step2], borrow=True)
c.set_value(c_dev.copy()[::c_step1, ::c_step2], borrow=True)
b_t.set_value(bt_dev.copy()[::b_step2, ::b_step1], borrow=True)
c_t.set_value(ct_dev.copy()[::c_step2, ::c_step1], borrow=True)
# Numpy result
a_n = numpy.dot(bv[::b_step1, ::b_step2],
cv[::c_step1, ::c_step2])
f_nn()
assert numpy.allclose(a.get_value(), a_n)
f_nt()
assert numpy.allclose(a.get_value(), a_n)
f_tn()
assert numpy.allclose(a.get_value(), a_n)
f_tt()
assert numpy.allclose(a.get_value(), a_n)
def test_dot22(self):
self.cmp_dot22((3, 4), (4, 5))
self.cmp_dot22((1, 4), (4, 5))
self.cmp_dot22((3, 4), (4, 1))
self.cmp_dot22((3, 1), (1, 1))
self.cmp_dot22((1, 4), (4, 1))
self.cmp_dot22((3, 1), (1, 5))
self.cmp_dot22((0, 4), (4, 5))
self.cmp_dot22((0, 4), (4, 1))
self.cmp_dot22((0, 1), (1, 5))
self.cmp_dot22((3, 4), (4, 0))
self.cmp_dot22((3, 0), (0, 5))
self.cmp_dot22((0, 4), (4, 0))
self.cmp_dot22((0, 0), (0, 0))
def cmp_dot22scalar(self, b_shp, c_shp):
av = numpy.zeros((0, 0), dtype=self.dtype)
bv = self.rand(*b_shp)
cv = self.rand(*c_shp)
l = numpy.float32(0.2)
a = self.shared(av, 'a')
b = self.shared(bv, 'b')
c = self.shared(cv, 'c')
b_t = self.shared(bv.T, 'b.T')
c_t = self.shared(cv.T, 'c.T')
b_dev = b.get_value(borrow=False, return_internal_type=True)
c_dev = c.get_value(borrow=False, return_internal_type=True)
bt_dev = b_t.get_value(borrow=False, return_internal_type=True)
ct_dev = c_t.get_value(borrow=False, return_internal_type=True)
f_nn = theano.function([], [], updates={a: l * tensor.dot(b, c)},
mode=self.mode)
f_nt = theano.function([], [], updates={a: l * tensor.dot(b, c_t.T)},
mode=self.mode)
f_tn = theano.function([], [], updates={a: l * tensor.dot(b_t.T, c)},
mode=self.mode)
f_tt = theano.function([], [],
updates={a: l * tensor.dot(b_t.T, c_t.T)},
mode=self.mode)
# Try with all stride patterns, and all transposed pattern
for step_signs in itertools.product((-1, 1), repeat=4):
for step in (1, 2):
b_step1, b_step2, c_step1, c_step2 = (s * step
for s in step_signs)
b.set_value(b_dev.copy()[::b_step1, ::b_step2], borrow=True)
c.set_value(c_dev.copy()[::c_step1, ::c_step2], borrow=True)
b_t.set_value(bt_dev.copy()[::b_step2, ::b_step1], borrow=True)
c_t.set_value(ct_dev.copy()[::c_step2, ::c_step1], borrow=True)
# Numpy result
a_n = l * numpy.dot(bv[::b_step1, ::b_step2],
cv[::c_step1, ::c_step2])
f_nn()
assert numpy.allclose(a.get_value(), a_n)
f_nt()
assert numpy.allclose(a.get_value(), a_n)
f_tn()
assert numpy.allclose(a.get_value(), a_n)
f_tt()
assert numpy.allclose(a.get_value(), a_n)
def test_dot22scalar(self):
self.cmp_dot22scalar((3, 4), (4, 5))
self.cmp_dot22scalar((1, 4), (4, 5))
self.cmp_dot22scalar((3, 4), (4, 1))
self.cmp_dot22scalar((3, 1), (1, 1))
self.cmp_dot22scalar((1, 4), (4, 1))
self.cmp_dot22scalar((3, 1), (1, 5))
self.cmp_dot22scalar((0, 4), (4, 5))
self.cmp_dot22scalar((0, 4), (4, 1))
self.cmp_dot22scalar((0, 1), (1, 5))
self.cmp_dot22scalar((3, 4), (4, 0))
self.cmp_dot22scalar((3, 0), (0, 5))
self.cmp_dot22scalar((0, 4), (4, 0))
self.cmp_dot22scalar((0, 0), (0, 0))
def cmp_gemm(self, a_shp, b_shp, c_shp):
av = self.rand(*a_shp)
bv = self.rand(*b_shp)
cv = self.rand(*c_shp)
l = numpy.float32(0.2)
a = self.shared(av, 'a')
b = self.shared(bv, 'b')
c = self.shared(cv, 'c')
a_t = self.shared(av.T, 'a.T')
b_t = self.shared(bv.T, 'b.T')
c_t = self.shared(cv.T, 'c.T')
a_dev = a.get_value(borrow=False, return_internal_type=True)
b_dev = b.get_value(borrow=False, return_internal_type=True)
c_dev = c.get_value(borrow=False, return_internal_type=True)
bt_dev = b_t.get_value(borrow=False, return_internal_type=True)
ct_dev = c_t.get_value(borrow=False, return_internal_type=True)
f_nnn = theano.function([], [],
updates={a: (l * a + tensor.dot(b, c))},
mode=self.mode)
f_nnt = theano.function([], [],
updates={a: (l * a + tensor.dot(b, c_t.T))},
mode=self.mode)
f_ntn = theano.function([], [],
updates={a: (l * a + tensor.dot(b_t.T, c))},
mode=self.mode)
f_ntt = theano.function([], [],
updates={a: (l * a + tensor.dot(b_t.T, c_t.T))},
mode=self.mode)
f_tnn = theano.function([], [],
updates={a_t: (l * a_t + tensor.dot(b, c).T)},
mode=self.mode)
f_tnt = theano.function([], [],
updates={a_t: (l * a_t + tensor.dot(b, c_t.T).T)},
mode=self.mode)
f_ttn = theano.function([], [],
updates={a_t: (l * a_t + tensor.dot(b_t.T, c).T)},
mode=self.mode)
f_ttt = theano.function([], [],
updates={a_t: (l * a_t + tensor.dot(b_t.T, c_t.T).T)},
mode=self.mode)
# Try with all stride patterns, and all transposed pattern
for step_signs in itertools.product((-1, 1), repeat=6):
for step in (1, 2):
a_step1, a_step2, b_step1, b_step2, c_step1, c_step2 = \
(s * step for s in step_signs)
b.set_value(b_dev.copy()[::b_step1, ::b_step2], borrow=True)
c.set_value(c_dev.copy()[::c_step1, ::c_step2], borrow=True)
b_t.set_value(bt_dev.copy()[::b_step2, ::b_step1], borrow=True)
c_t.set_value(ct_dev.copy()[::c_step2, ::c_step1], borrow=True)
# Numpy results
a_n = (l * av[::a_step1, ::a_step2]
+ numpy.dot(bv[::b_step1, ::b_step2],
cv[::c_step1, ::c_step2]))
at_n = (l * av[::a_step1, ::a_step2].T
+ numpy.dot(bv[::b_step1, ::b_step2],
cv[::c_step1, ::c_step2]).T)
# a's value is updated, so we need to reinitialize it each time
a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
f_nnn()
assert numpy.allclose(a.get_value(), a_n)
a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
f_nnt()
assert numpy.allclose(a.get_value(), a_n)
a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
f_ntn()
assert numpy.allclose(a.get_value(), a_n)
a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
f_ntt()
assert numpy.allclose(a.get_value(), a_n)
a_t.set_value(transpose(a_dev.copy())[::a_step2, ::a_step1],
borrow=True)
f_tnn()
assert numpy.allclose(a_t.get_value(), at_n)
a_t.set_value(transpose(a_dev.copy())[::a_step2, ::a_step1],
borrow=True)
f_tnt()
assert numpy.allclose(a_t.get_value(), at_n)
a_t.set_value(transpose(a_dev.copy())[::a_step2, ::a_step1],
borrow=True)
f_ttn()
assert numpy.allclose(a_t.get_value(), at_n)
a_t.set_value(transpose(a_dev.copy())[::a_step2, ::a_step1],
borrow=True)
f_ttt()
assert numpy.allclose(a_t.get_value(), at_n)
def test_gemm(self):
self.cmp_gemm((3, 5), (3, 4), (4, 5))
self.cmp_gemm((1, 5), (1, 4), (4, 5))
self.cmp_gemm((3, 1), (3, 4), (4, 1))
self.cmp_gemm((3, 1), (3, 1), (1, 1))
self.cmp_gemm((1, 1), (1, 4), (4, 1))
self.cmp_gemm((3, 5), (3, 1), (1, 5))
self.cmp_gemm((0, 5), (0, 4), (4, 5))
self.cmp_gemm((0, 1), (0, 4), (4, 1))
self.cmp_gemm((0, 5), (0, 1), (1, 5))
self.cmp_gemm((3, 0), (3, 4), (4, 0))
self.cmp_gemm((3, 5), (3, 0), (0, 5))
self.cmp_gemm((0, 0), (0, 4), (4, 0))
self.cmp_gemm((0, 0), (0, 0), (0, 0))
def cmp_gemv(self, a_shp, b_shp, c_shp):
av = self.rand(a_shp)
bv = self.rand(*b_shp)
cv = self.rand(c_shp)
l = numpy.float32(0.2)
a = self.shared(av, 'a')
b = self.shared(bv, 'b')
c = self.shared(cv, 'c')
b_t = self.shared(bv.T, 'b.T')
a_dev = a.get_value(borrow=False, return_internal_type=True)
b_dev = b.get_value(borrow=False, return_internal_type=True)
c_dev = c.get_value(borrow=False, return_internal_type=True)
f_n = theano.function([], [], updates={a: (a + l * tensor.dot(b, c))},
mode=self.mode)
f_t = theano.function([], [],
updates={a: (a + l * tensor.dot(b_t.T, c))},
mode=self.mode)
# Try with all stride patterns, and all transposed pattern
for step_signs in itertools.product((1, -1), repeat=4):
for step in (1, 2):
a_step, b_step1, b_step2, c_step = (s * step
for s in step_signs)
a.set_value(a_dev.copy()[::a_step], borrow=True)
b.set_value(b_dev.copy()[::b_step1, ::b_step2],
borrow=True)
b_t.set_value(transpose(b_dev.copy())[::b_step2, ::b_step1],
borrow=True)
c.set_value(c_dev.copy()[::c_step], borrow=True)
a_n = (av[::a_step]
+ l * numpy.dot(bv[::b_step1, ::b_step2], cv[::c_step]))
f_n()
assert numpy.allclose(a.get_value(), a_n), (a.get_value(), a_n)
a.set_value(a_dev.copy()[::a_step], borrow=True)
f_t()
assert numpy.allclose(a.get_value(), a_n), (a.get_value(), a_n)
def test_gemv(self):
self.cmp_gemv(3, (3, 5), 5)
self.cmp_gemv(1, (1, 5), 5)
self.cmp_gemv(3, (3, 1), 1)
self.cmp_gemv(0, (0, 5), 5)
self.cmp_gemv(3, (3, 0), 0)
self.cmp_gemv(0, (0, 1), 1)
self.cmp_gemv(1, (1, 0), 0)
self.cmp_gemv(0, (0, 0), 0)
def cmp_ger(self, a_shp, b_shp, c_shp):
av = self.rand(*a_shp)
bv = self.rand(b_shp)
cv = self.rand(c_shp)
l = numpy.float32(0.2)
a = self.shared(av, 'a')
b = self.shared(bv, 'b')
c = self.shared(cv, 'c')
a_t = self.shared(av.T, 'a.T')
a_dev = a.get_value(borrow=False, return_internal_type=True)
b_dev = b.get_value(borrow=False, return_internal_type=True)
c_dev = c.get_value(borrow=False, return_internal_type=True)
f_n = theano.function([], [],
updates={a: (a + l * tensor.outer(b, c))},
mode=self.mode)
f_t = theano.function([], [],
updates={a_t: (a_t + l * tensor.outer(b, c).T)},
mode=self.mode)
# Try with all stride patterns, and all transposed patterns
for step_signs in itertools.product((1, -1), repeat=4):
for step in (1, 2):
a_step1, a_step2, b_step, c_step = (s * step
for s in step_signs)
a.set_value(a_dev.copy()[::a_step1, ::a_step2], borrow=True)
a_t.set_value(transpose(a_dev.copy())[::a_step1, ::a_step2],
borrow=True)
b.set_value(b_dev.copy()[::b_step], borrow=True)
c.set_value(c_dev.copy()[::c_step], borrow=True)
f_n()
n_n = (av[::a_step1, ::a_step2]
+ l * numpy.outer(bv[::b_step], cv[::c_step]))
assert numpy.allclose(a.get_value(), n_n), (a.get_value(), n_n)
f_t()
n_t = (av.T[::a_step1, ::a_step2]
+ l * numpy.outer(bv[::b_step], cv[::c_step]).T)
assert numpy.allclose(a_t.get_value(), n_t),\
(a_t.get_value(), n_t)
def test_ger_strides(self):
    """Run cmp_ger over a grid of matrix/vector shapes, including
    size-1 and size-0 dimensions."""
    # Each entry is (matrix shape, len(b), len(c)); the vector lengths
    # always match the matrix dimensions.
    for rows, cols in [(3, 5), (1, 5), (3, 1), (0, 5),
                       (3, 0), (0, 1), (1, 0), (0, 0)]:
        self.cmp_ger((rows, cols), rows, cols)
import sys
import numpy
from unittest import TestCase
import theano
import theano.tensor as tensor
......@@ -14,8 +16,7 @@ from theano.tensor.blas import Gemv
from theano.tests import unittest_tools
from theano.tests.unittest_tools import TestOptimizationMixin
from test_blas import TestCase
from test_blas import BaseGemv
from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides
mode_blas_opt = theano.compile.get_default_mode().including(
'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas')
......@@ -41,7 +42,8 @@ class TestCGer(TestCase, TestOptimizationMixin):
)
def run_f(self, f):
    """Invoke *f* on the stored test operands twice: once with the
    contiguous A matrix, and once with A reversed along both axes so
    that it has negative strides in both dimensions.

    Returns None; callers use this only for its side effects on *f*.
    """
    # NOTE(review): the text previously had `return f(...)` before
    # these calls, which made the negative-stride call unreachable
    # dead code; the return value was not meaningful, so the return
    # is dropped and both layouts are exercised.
    f(self.Aval, self.xval, self.yval)
    f(self.Aval[::-1, ::-1], self.xval, self.yval)
def b(self, bval):
    """Wrap *bval* in a symbolic constant of this test case's dtype."""
    host_value = numpy.asarray(bval, dtype=self.dtype)
    return tensor.as_tensor_variable(host_value)
......@@ -132,6 +134,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert numpy.allclose(f(self.xval, self.Aval),
numpy.dot(self.xval, self.Aval))
# Test with negative strides on 2 dims
assert numpy.allclose(f(self.xval, self.Aval[::-1, ::-1]),
numpy.dot(self.xval, self.Aval[::-1, ::-1]))
def test_optimizations_mv(self):
''' Test matrix dot vector '''
f = theano.function([self.A, self.y],
......@@ -145,6 +151,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
# Assert they produce the same output
assert numpy.allclose(f(self.Aval, self.yval),
numpy.dot(self.Aval, self.yval))
# Test with negative strides on 2 dims
assert numpy.allclose(f(self.Aval[::-1, ::-1], self.yval),
numpy.dot(self.Aval[::-1, ::-1], self.yval))
def t_gemv1(self, m_shp):
''' test vector2 + dot(matrix, vector1) '''
......@@ -164,17 +174,28 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert topo == [CGemv(inplace=False)], topo
#test the inplace version
f = theano.function([], [],
g = theano.function([], [],
updates={v2:v2+theano.dot(m,v1)},
mode=self.mode)
# Assert they produce the same output
f()
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
topo = [n.op for n in f.maker.env.toposort()]
topo = [n.op for n in g.maker.env.toposort()]
assert topo == [CGemv(inplace=True)]
# Do the same tests with a matrix with strides in both dimensions
m.set_value(
m.get_value(borrow=True)[::-1, ::-1],
borrow=True)
v2.set_value(v2_orig)
assert numpy.allclose(f(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
g()
assert numpy.allclose(v2.get_value(),
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
def test_gemv1(self):
self.t_gemv1((3,2))
self.t_gemv1((0,2))
......@@ -200,6 +221,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
ones_6 = numpy.ones(6, dtype=dtype)
f(A_val, ones_3, ones_5)
f(A_val[::-1, ::-1], ones_3, ones_5)
self.assertRaises(ValueError, f, A_val, ones_4, ones_5)
self.assertRaises(ValueError, f, A_val, ones_3, ones_6)
self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
......@@ -217,3 +239,6 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
dtype = 'float64'
gemv = CGemv(inplace=False)
gemv_inplace = CGemv(inplace=True)
class TestBlasStridesC(TestBlasStrides):
    # Re-run the shared stride-pattern test suite with the C-blas
    # optimization mode, so the c_blas code paths are exercised.
    mode = mode_blas_opt
......@@ -4,7 +4,7 @@ import theano
import theano.tensor as tensor
from theano.tensor.blas_scipy import ScipyGer
from test_blas import TestCase, gemm_no_inplace
from test_blas import TestCase, gemm_no_inplace, TestBlasStrides
from theano.tests.unittest_tools import TestOptimizationMixin
class TestScipyGer(TestCase, TestOptimizationMixin):
......@@ -30,6 +30,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
def run_f(self, f):
    """Invoke *f* twice: first on the contiguous stored operands,
    then with all three operands reversed (negative strides)."""
    argument_sets = (
        (self.Aval, self.xval, self.yval),
        (self.Aval[::-1, ::-1], self.xval[::-1], self.yval[::-1]),
    )
    for A, x, y in argument_sets:
        f(A, x, y)
def b(self, bval):
    """Return *bval* as a symbolic constant cast to this test's dtype."""
    cast_value = numpy.asarray(bval, dtype=self.dtype)
    return tensor.as_tensor_variable(cast_value)
......@@ -55,3 +56,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
0.2 * self.A + 0.1 * tensor.outer(self.x, self.y))
self.assertFunctionContains(f, gemm_no_inplace)
self.run_f(f) #DebugMode tests correctness
class TestBlasStridesScipy(TestBlasStrides):
    # Re-run the shared stride-pattern tests on CPU: fast_run enabled,
    # with gpu and c_blas excluded so the scipy implementations run.
    mode = theano.compile.get_default_mode().including(
        'fast_run').excluding('gpu', 'c_blas')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论