Merge pull request #437 from lamblin/blas_double_strides

Make blas functions work with all stride patterns

Merge pull request #437 from lamblin/blas_double_strides
1f6b0c73 · nouiz · b96c7d5c · 3316ba1e · 1f6b0c73 · 1f6b0c73
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -192,7 +192,7 @@ class GpuGemm(GpuOp):
        return Apply(self, [z, a, x, y, b], [z.type()])

    def c_code_cache_version(self):
-        return (3,)
+        return (4,)

    def c_code(self, node, name, inputs, outputs, sub):
        #z_out = alpha * dot(x,y) + beta * z_in
@@ -200,6 +200,7 @@ class GpuGemm(GpuOp):
        #not inplace version, we copy z_in to z_out.
        z_in, a, x, y, b = inputs
        z_out, = outputs
+        inplace = int(self.inplace)
        fail = sub['fail']
        sio = StringIO.StringIO()

@@ -215,39 +216,50 @@ class GpuGemm(GpuOp):
        : (REAL)(((double*)%(b)s->data)[0]);
        #undef REAL

-        """
-        if self.inplace:
-            print >> sio, """
+        if (%(inplace)s
+            && (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
+            && (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
+            && ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
+                || (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
+                || (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
+                || (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
+        {
+            // The input has an appropriate layout, we work inplace
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
-            """
-        else:
-            print >> sio, """
-            if (!%(z_out)s
-                || (%(z_out)s->nd != 2)
-                || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
-                || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
-                )
+        }
+        else if (%(z_out)s
+                && (%(z_out)s->nd == 2)
+                && (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
+                    == CudaNdarray_HOST_DIMS(%(z_in)s)[0])
+                && (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
+                    == CudaNdarray_HOST_DIMS(%(z_in)s)[1])
+                && (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
+                && (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
+                && ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
+                    || (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
+                    || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
+                    || (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
+        {
+            // The existing output has an appropriate layout,
+            // copy the input data into it, then work inplace
+            if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
            {
-                Py_XDECREF(%(z_out)s);
-                %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
-                if (!%(z_out)s)
-                {
-                    %(fail)s;
-                }
+                %(fail)s;
            }
-            else
+        }
+        else
+        {
+            // Copy the input, use the copy as output
+            Py_XDECREF(%(z_out)s);
+            %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
+            if (!%(z_out)s)
            {
-                if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
-                {
-                    %(fail)s;
-                }
+                %(fail)s;
            }
-            """
-
+        }

-        print >> sio, """
        if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
        {
            %(fail)s;
@@ -294,7 +306,7 @@ class GpuGemv(GpuOp):
        return Apply(self, [z, a, x, y, b], [z.type()])

    def c_code_cache_version(self):
-        return (1,)
+        return (2,)

    def c_code(self, node, name, inputs, outputs, sub):
        #z_out = alpha * dot(x,y) + beta * z_in
@@ -302,44 +314,46 @@ class GpuGemv(GpuOp):
        #not inplace version, we copy z_in to z_out.
        z_in, a, x, y, b = inputs
        z_out, = outputs
+        inplace = int(self.inplace)
        fail = sub['fail']
        sio = StringIO.StringIO()

        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
        float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0];
-        """
-        if self.inplace:
-            print >> sio, """
+
+        if (%(inplace)s
+            && ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] > 0)
+                || ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 0)
+                    && (CudaNdarray_HOST_DIMS(%(z_in)s)[0] == 1))))
+        {
+            // Work inplace on the input
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
-            """
-        else:
-            print >> sio, """
-            if (!%(z_out)s
-                || (%(z_out)s->nd != 1)
-                || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
-                )
+        }
+        else if (%(z_out)s
+                && ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] > 0)
+                    || ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 0)
+                        && (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == 1))))
+        {
+            // Work on the output
+            if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
            {
-                Py_XDECREF(%(z_out)s);
-                %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
-                if (!%(z_out)s)
-                {
-                    %(fail)s;
-                }
+                %(fail)s;
            }
-            else
+        }
+        else
+        {
+            // Copy
+            Py_XDECREF(%(z_out)s);
+            %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
+            if (!%(z_out)s)
            {
-                if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
-                {
-                    %(fail)s;
-                }
+                %(fail)s;
            }
-            """
-
+        }

-        print >> sio, """
        if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s))
        {
            %(fail)s;
@@ -385,7 +399,7 @@ class GpuGer(GpuOp):
        return Apply(self, [z, a, x, y], [z.type()])

    def c_code_cache_version(self):
-        return (1,)
+        return (2,)

    def c_code(self, node, name, inputs, outputs, sub):
        #z_out = alpha * dot(x,y) + beta * z_in
@@ -393,44 +407,57 @@ class GpuGer(GpuOp):
        #not inplace version, we copy z_in to z_out.
        z_in, a, x, y = inputs
        z_out, = outputs
+        inplace = int(self.inplace)
        fail = sub['fail']
        sio = StringIO.StringIO()

        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
-        """
-        if self.inplace:
-            print >> sio, """
+
+        if (%(inplace)s
+            && (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
+            && (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
+            && ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
+                || (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
+                || (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
+                || (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
+        {
+            // The input has an appropriate layout, we work inplace
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
-            """
-        else:
-            print >> sio, """
-            if (!%(z_out)s
-                || (%(z_out)s->nd != 2)
-                || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
-                || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
-                )
+        }
+        else if (%(z_out)s
+                && (%(z_out)s->nd == 2)
+                && (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
+                    == CudaNdarray_HOST_DIMS(%(z_in)s)[0])
+                && (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
+                    == CudaNdarray_HOST_DIMS(%(z_in)s)[1])
+                && (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
+                && (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
+                && ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
+                    || (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
+                    || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
+                    || (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
+        {
+            // The existing output has an appropriate layout,
+            // copy the input data into it, then work inplace
+            if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
            {
-                Py_XDECREF(%(z_out)s);
-                %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
-                if (!%(z_out)s)
-                {
-                    %(fail)s;
-                }
+                %(fail)s;
            }
-            else
+        }
+        else
+        {
+            // Copy the input, use the copy as output
+            Py_XDECREF(%(z_out)s);
+            %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
+            if (!%(z_out)s)
            {
-                if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
-                {
-                    %(fail)s;
-                }
+                %(fail)s;
            }
-            """
-
+        }

-        print >> sio, """
        if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s))
        {
            %(fail)s;

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -81,7 +81,7 @@ struct CudaNdarray


    //device pointers (allocated by cudaMalloc)
-    int dev_structure_fresh;
+    mutable int dev_structure_fresh;
    //dev_structure should be accessed via macros, otherwise may not be synchronized
    int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
    real* devdata; //pointer to data element [0,..,0].
@@ -154,11 +154,11 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s);
 *
 *  This means: recalculate the log2dims and transfer structure to the card
 */
-DllExport int cnda_copy_structure_to_device(CudaNdarray * self);
+DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);

-DllExport const int *CudaNdarray_DEV_DIMS(CudaNdarray * self);
-DllExport const int *CudaNdarray_DEV_STRIDES(CudaNdarray * self);
-DllExport const int *CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self);
+DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
+DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
+DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
 DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self);

 /**
@@ -229,13 +229,22 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
        return -1;
    }

-    assert(size>0);
+    if (size < 0)
+    {
+        PyErr_Format(PyExc_AssertionError,
+                     "size (%i) < 0",
+                     size);
+        return -1;
+    }
+
    self->devdata = (float*)device_malloc(size*sizeof(real));
-    if (!self->devdata)
+    if (size && !self->devdata)
    {
-        CudaNdarray_set_nd(self,-1);
+        CudaNdarray_set_nd(self, -1);
        self->data_allocated = 0;
        self->devdata = 0;
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Could not allocate memory on device");
        return -1;
    }
    if (0)
@@ -283,7 +292,7 @@ DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo);
 /**
 * Return an independent copy of self
 */
-DllExport PyObject * CudaNdarray_Copy(CudaNdarray * self);
+DllExport PyObject * CudaNdarray_Copy(const CudaNdarray * self);

 /**
 * Return a new object obtained by summing over the dimensions for which there is a 1 in the mask.
@@ -302,7 +311,7 @@ DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
 *
 * self is reallocated to have the correct dimensions if necessary.
 */
-DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, bool unbroadcast = false);
+DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast = false);

 /**
 * Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
@@ -321,7 +330,7 @@ DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);

 DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
 DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
-DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
+DllExport int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray* A);

 DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
 DllExport int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
@@ -343,4 +352,4 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
  fill-column:79
  End:
 */
-// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=79 :
+// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
--- a/theano/sandbox/cuda/tests/test_blas.py
+++ b/theano/sandbox/cuda/tests/test_blas.py
+import itertools
 from unittest import TestCase

 from theano.compile.pfunc import pfunc
@@ -17,7 +18,7 @@ import theano.sandbox.cuda as tcn
 from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad

 import theano.compile.mode
-from theano.tensor.tests.test_blas import BaseGemv, TestGer
+from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
 from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
 from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace

@@ -32,20 +33,31 @@ else:
 def my_rand(*shape):
    return theano._asarray(numpy.random.rand(*shape),dtype='float32')

+def transpose(cuda_mat):
+    # The easiest way to transpose a cuda matrix for now
+    return tcn.dimshuffle(cuda_mat, [1, 0])
+
 def test_dot22():
    def cmp(a_shp, b_shp):
-        a = tcn.shared_constructor(my_rand(*a_shp), 'a')
+        a0 = my_rand(*a_shp)
+        a = tcn.shared_constructor(a0, 'a')

        b = tensor.fmatrix()

        f = pfunc([b], [], updates=[(a, tensor.dot(a,b))], mode=mode_with_gpu)

-        a0 = a.get_value() * 1.0
        bval = my_rand(*b_shp)
        f(bval)

        assert numpy.allclose(numpy.dot(a0, bval), a.get_value())

+        # Try with a matrix equal to a0, but with strides in both dims
+        a.set_value(a0)
+        a.set_value(
+                a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
+                borrow=True)
+        f(bval)
+
    cmp((3,4),(4,5))
    cmp((0,4),(4,5))
    cmp((3,4),(4,0))
@@ -90,7 +102,8 @@ def test_dot22scalar():

 def test_gemm():
    def cmp(a_shp, b_shp):
-        a = tcn.shared_constructor(my_rand(*a_shp), 'a')
+        a0 = my_rand(*a_shp)
+        a = tcn.shared_constructor(a0, 'a')

        b = tensor.fmatrix('b')
        c = tensor.fmatrix('c')
@@ -98,12 +111,19 @@ def test_gemm():
        f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu)
        assert any([node.op == tcn.blas.gpu_gemm_inplace for node in f.maker.env.toposort()])

-        a0 = a.get_value() * 1.0
        bval = my_rand(*b_shp)
        cval = my_rand(a_shp[0],b_shp[1])
        f(bval,cval)

        assert numpy.allclose(numpy.dot(a0, bval)+numpy.exp(cval), a.get_value())
+
+        # Try with a matrix equal to a0, but with strides in both dims
+        a.set_value(a0)
+        a.set_value(
+                a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
+                borrow=True)
+        f(bval, cval)
+
    cmp((3,4),(4,5))
    cmp((0,4),(4,5))
    cmp((3,4),(4,0))
@@ -114,7 +134,8 @@ def test_gemm():
 def test_gemm_no_inplace():

    def cmp(a_shp, b_shp):
-        a = tcn.shared_constructor(my_rand(*a_shp), 'a')
+        a0 = my_rand(*a_shp)
+        a = tcn.shared_constructor(a0, 'a')
        cval = my_rand(a_shp[0], b_shp[1])
        c = tcn.shared_constructor(cval.copy(), 'c')

@@ -123,7 +144,6 @@ def test_gemm_no_inplace():

        f = pfunc([b,b2], [tensor.dot(a,b2) + c], updates=[(a, tensor.dot(a,b) + c)], mode=mode_with_gpu)

-        a0 = a.get_value() * 1.0
        assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()])
        bval = my_rand(*b_shp)
        bval2 = my_rand(*b_shp)
@@ -132,6 +152,13 @@ def test_gemm_no_inplace():
        assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value())
        assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval)

+        # Try with a matrix equal to a0, but with strides in both dims
+        a.set_value(a0)
+        a.set_value(
+                a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
+                borrow=True)
+        f(bval, bval2)
+
    cmp((3,4),(4,5))
    cmp((0,4),(4,5))
    cmp((3,4),(4,0))
@@ -139,6 +166,13 @@ def test_gemm_no_inplace():
    cmp((0,4),(4,0))
    cmp((0,0),(0,0))

+
+class TestBlasStridesGpu(TestBlasStrides):
+    dtype = 'float32'
+    shared = staticmethod(tcn.shared_constructor)
+    mode = mode_with_gpu
+
+
 def test_outer():
    x = tcn.shared_constructor(my_rand(8,), 'x')
    y = tcn.shared_constructor(my_rand(6,), 'y')
@@ -260,6 +294,23 @@ class TestGpuGemv(TestCase, BaseGemv,
    gemv = gpu_gemv_inplace
    gemv_inplace = gpu_gemv_inplace

+class TestGpuGemvNoTransfer(TestCase, BaseGemv,
+                  unittest_tools.TestOptimizationMixin):
+    mode = mode_with_gpu
+    dtype = 'float32'
+
+    # Mimic shared constructors registry
+    @staticmethod
+    def shared(val):
+        try:
+            return tcn.shared_constructor(val)
+        except TypeError:
+            return theano.shared(val)
+
+    # In this test, inputs are not always transferred to GPU
+    gemv = gpu_gemv_no_inplace
+    gemv_inplace = gpu_gemv_inplace
+

 class TestVectorMatrixDot(TestCase):
    ### Tolerance factor used in this tests
@@ -286,6 +337,14 @@ class TestVectorMatrixDot(TestCase):
        assert sum([node.op is gpu_gemv_inplace for node in
                    gpu_f2.maker.env.toposort() ]) == 1

+        # Check double-strided m
+        m.set_value(
+                m.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
+                borrow=True)
+        assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol)
+        assert numpy.allclose(no_gpu_f(), gpu_f2(), atol=self.atol)
+
+
    def test_dot_mv(self):
        ''' Test matrix dot vector '''
        v = theano.shared( numpy.array(numpy.random.rand(2), dtype='float32'))
@@ -365,6 +424,26 @@ class TestGpuGer(TestGer):
        self.ger = gpu_ger_inplace
        self.gemm = tcn.blas.gpu_gemm_inplace

+class TestGpuGerNoTransfer(TestGer):
+    @staticmethod
+    def shared(val):
+        try:
+            return tcn.shared_constructor(val)
+        except TypeError:
+            return theano.shared(val)
+
+    def setUp(self):
+        self.mode = mode_with_gpu
+        dtype = self.dtype = 'float32'  # optimization isn't dtype-dependent
+        self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
+        self.a = tensor.tensor(dtype=dtype, broadcastable=())
+        self.x = tensor.tensor(dtype=dtype, broadcastable=(False,))
+        self.y = tensor.tensor(dtype=dtype, broadcastable=(False,))
+        # data on the gpu make the op always inplace
+        self.ger = gpu_ger_inplace
+        self.ger_destructive = gpu_ger_inplace
+        self.gemm = tcn.blas.gpu_gemm_inplace
+

 class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
    def setUp(self):

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -496,7 +496,9 @@ class GemmRelated(Op):
        if ((Sx[0] < 1) || (Sx[1] < 1) || (Sx[0] MOD type_size) || (Sx[1] MOD type_size)
            || ((Sx[0] != type_size) && (Sx[1] != type_size)))
        {
-            PyArrayObject * _x_copy = PyArray_GETCONTIGUOUS(%(_x)s);
+            PyArrayObject * _x_copy = (PyArrayObject *) PyArray_Copy(%(_x)s);
+            if (!_x_copy)
+                %(fail)s
            Py_XDECREF(%(_x)s);
            %(_x)s = _x_copy;
            Sx = %(_x)s->strides;
@@ -505,7 +507,9 @@ class GemmRelated(Op):
        if ((Sy[0] < 1) || (Sy[1] < 1) || (Sy[0] MOD type_size) || (Sy[1] MOD type_size)
            || ((Sy[0] != type_size) && (Sy[1] != type_size)))
        {
-            PyArrayObject * _y_copy = PyArray_GETCONTIGUOUS(%(_y)s);
+            PyArrayObject * _y_copy = (PyArrayObject *) PyArray_Copy(%(_y)s);
+            if (!_y_copy)
+                %(fail)s
            Py_XDECREF(%(_y)s);
            %(_y)s = _y_copy;
            Sy = %(_y)s->strides;
@@ -514,7 +518,9 @@ class GemmRelated(Op):
        if ((Sz[0] < 1) || (Sz[1] < 1) || (Sz[0] MOD type_size) || (Sz[1] MOD type_size)
            || ((Sz[0] != type_size) && (Sz[1] != type_size)))
        {
-            PyArrayObject * _z_copy = PyArray_GETCONTIGUOUS(%(_zout)s);
+            PyArrayObject * _z_copy = (PyArrayObject *) PyArray_Copy(%(_zout)s);
+            if (!_z_copy)
+                %(fail)s
            Py_XDECREF(%(_zout)s);
            %(_zout)s = _z_copy;
            Sz = %(_zout)s->strides;
@@ -649,7 +655,7 @@ class GemmRelated(Op):
            self.end_switch_typenum), '')

    def build_gemm_version(self):
-        return (10,)
+        return (12,)

 class Gemm(GemmRelated):
    """In-place version of matrix-matrix multiplication (with accumulation):

--- a/theano/tensor/blas_c.py
+++ b/theano/tensor/blas_c.py
--- a/theano/tensor/blas_scipy.py
+++ b/theano/tensor/blas_scipy.py
@@ -44,7 +44,12 @@ class ScipyGer(Ger):
            # N.B. some versions of scipy (e.g. mine) don't actually work
            # in-place on a, even when I tell it to.
            A = cA[0]
-            if A.flags['C_CONTIGUOUS']:
+            if A.size == 0:
+                # We don't have to do anything, A is empty.
+                # We need this special case because Numpy considers it
+                # C-contiguous, which is confusing.
+                pass
+            elif A.flags['C_CONTIGUOUS']:
                A = local_ger(calpha[0], cy[0], cx[0], a=A.T,
                        overwrite_a=int(self.destructive)).T
            else:

--- a/theano/tensor/tests/test_blas.py
+++ b/theano/tensor/tests/test_blas.py
--- a/theano/tensor/tests/test_blas_c.py
+++ b/theano/tensor/tests/test_blas_c.py
 import sys
 import numpy
+from unittest import TestCase
+
 import theano
 import theano.tensor as tensor

@@ -14,8 +16,7 @@ from theano.tensor.blas import Gemv
 from theano.tests import unittest_tools
 from theano.tests.unittest_tools import TestOptimizationMixin

-from test_blas import TestCase
-from test_blas import BaseGemv
+from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides

 mode_blas_opt = theano.compile.get_default_mode().including(
    'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas')
@@ -41,7 +42,8 @@ class TestCGer(TestCase, TestOptimizationMixin):
                )

    def run_f(self, f):
-        return f(self.Aval, self.xval, self.yval)
+        f(self.Aval, self.xval, self.yval)
+        f(self.Aval[::-1, ::-1], self.xval, self.yval)

    def b(self, bval):
        return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype))
@@ -132,6 +134,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
        assert numpy.allclose(f(self.xval, self.Aval),
                numpy.dot(self.xval, self.Aval))

+        # Test with negative strides on 2 dims
+        assert numpy.allclose(f(self.xval, self.Aval[::-1, ::-1]),
+                numpy.dot(self.xval, self.Aval[::-1, ::-1]))
+
    def test_optimizations_mv(self):
        ''' Test matrix dot vector '''
        f = theano.function([self.A, self.y],
@@ -145,6 +151,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
        # Assert they produce the same output
        assert numpy.allclose(f(self.Aval, self.yval),
                numpy.dot(self.Aval, self.yval))
+        # Test with negative strides on 2 dims
+        assert numpy.allclose(f(self.Aval[::-1, ::-1], self.yval),
+                numpy.dot(self.Aval[::-1, ::-1], self.yval))
+

    def t_gemv1(self, m_shp):
        ''' test vector2 + dot(matrix, vector1) '''
@@ -164,17 +174,28 @@ class TestCGemv(TestCase, TestOptimizationMixin):
        assert topo == [CGemv(inplace=False)], topo

        #test the inplace version
-        f = theano.function([], [],
+        g = theano.function([], [],
                updates={v2:v2+theano.dot(m,v1)},
                mode=self.mode)

        # Assert they produce the same output
-        f()
+        g()
        assert numpy.allclose(v2.get_value(),
                numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
-        topo = [n.op for n in f.maker.env.toposort()]
+        topo = [n.op for n in g.maker.env.toposort()]
        assert topo == [CGemv(inplace=True)]

+        # Do the same tests with a matrix with strides in both dimensions
+        m.set_value(
+                m.get_value(borrow=True)[::-1, ::-1],
+                borrow=True)
+        v2.set_value(v2_orig)
+        assert numpy.allclose(f(),
+            numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
+        g()
+        assert numpy.allclose(v2.get_value(),
+            numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
+
    def test_gemv1(self):
        self.t_gemv1((3,2))
        self.t_gemv1((0,2))
@@ -200,6 +221,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
        ones_6 = numpy.ones(6, dtype=dtype)

        f(A_val, ones_3, ones_5)
+        f(A_val[::-1, ::-1], ones_3, ones_5)
        self.assertRaises(ValueError, f, A_val, ones_4, ones_5)
        self.assertRaises(ValueError, f, A_val, ones_3, ones_6)
        self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
@@ -217,3 +239,6 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
    dtype = 'float64'
    gemv = CGemv(inplace=False)
    gemv_inplace = CGemv(inplace=True)
+
+class TestBlasStridesC(TestBlasStrides):
+    mode = mode_blas_opt
--- a/theano/tensor/tests/test_blas_scipy.py
+++ b/theano/tensor/tests/test_blas_scipy.py
@@ -4,7 +4,7 @@ import theano
 import theano.tensor as tensor
 from theano.tensor.blas_scipy import ScipyGer

-from test_blas import TestCase, gemm_no_inplace
+from test_blas import TestCase, gemm_no_inplace, TestBlasStrides
 from theano.tests.unittest_tools import TestOptimizationMixin

 class TestScipyGer(TestCase, TestOptimizationMixin):
@@ -30,6 +30,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):

    def run_f(self, f):
        f(self.Aval, self.xval, self.yval)
+        f(self.Aval[::-1, ::-1], self.xval[::-1], self.yval[::-1])

    def b(self, bval):
        return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype))
@@ -55,3 +56,7 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
                0.2 * self.A + 0.1 * tensor.outer(self.x, self.y))
        self.assertFunctionContains(f, gemm_no_inplace)
        self.run_f(f) #DebugMode tests correctness
+
+class TestBlasStridesScipy(TestBlasStrides):
+    mode = theano.compile.get_default_mode()
+    mode = mode.including('fast_run').excluding('gpu', 'c_blas')