提交 da835094 authored 作者: notoraptor's avatar notoraptor

New update.

Gemm implementation rewritten so that new memory is allocated when we cannot reuse C. C cannot be reused either when BETA != 0, or when BETA == 0 but C is not contiguous (LDC != M). All tests passed, plus a special test suggested by lamblin.
上级 4185ef0c
/** %(name)s **/
void alt_numpy_scale_matrix_inplace_%(float_type)s(%(float_type)s scalar, PyArrayObject* matrix) {
/* Scalar*Matrix function.
* Computes: matrix = scalar*matrix. */
void alt_numpy_scale_matrix_inplace_%(float_type)s(const %(float_type)s* scalar, PyArrayObject* matrix) {
NpyIter* iterator = NpyIter_New(matrix,
NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
NPY_KEEPORDER, NPY_NO_CASTING, NULL);
......@@ -15,17 +18,15 @@ void alt_numpy_scale_matrix_inplace_%(float_type)s(%(float_type)s scalar, PyArra
npy_intp stride = *stride_ptr;
npy_intp count = *innersize_ptr;
while(count) {
*((%(float_type)s*)data) *= scalar;
*((%(float_type)s*)data) *= *scalar;
data += stride;
--count;
}
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
}
/*Matrix+Matrix function.
* Computes (scalar1 * matrix1) + (scalar2 * matrix2)
* and puts the result into matrix2:
* matrix2 = (scalar1 * matrix1) + (scalar2 * matrix2) */
/* Matrix+Matrix function.
* Computes: matrix2 = (scalar1 * matrix1) + (scalar2 * matrix2) */
void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
const %(float_type)s* scalar1, PyArrayObject* matrix1,
const %(float_type)s* scalar2, PyArrayObject* matrix2
......@@ -47,33 +48,41 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
} while(get_next(iterators));
NpyIter_Deallocate(iterators);
}
/* NumPy Wrapping function. Wraps a data pointer into a NumPy PyArrayObject
 * without copying.
 * By default, data is considered as a Fortran-style array (column by column).
 * If to_transpose, data will be considered as a C-style array (row by row)
 * with dimensions inversed (i.e. the view is the transpose of the matrix). */
PyObject* alt_op_without_copy_%(float_type)s(int to_transpose, %(float_type)s* M, int nrow, int ncol, int LDM) {
    /* M is a nrow*ncol F-contiguous matrix with LDM as the leading dimension
     * (stride, in elements, between consecutive columns).
     * The returned array borrows M's memory (no copy, no ownership): the
     * caller must keep M alive for the lifetime of the returned array.
     * NOTE: the diff residue kept two conflicting return statements (one
     * passing contiguity flags, one passing 0); the post-change version
     * passes 0 and lets NumPy derive contiguity from dims/strides, which
     * made the local `flags` variable dead code — it is removed here. */
    npy_intp dims[2];
    npy_intp strides[2];
    if(to_transpose) {
        /* View M as its transpose: an ncol*nrow row-major array. */
        dims[0] = ncol;
        dims[1] = nrow;
        strides[0] = LDM * %(float_size)d;
        strides[1] = %(float_size)d;
    } else {
        dims[0] = nrow;
        dims[1] = ncol;
        strides[0] = %(float_size)d;
        strides[1] = LDM * %(float_size)d;
    }
    /* flags == 0: read-only view; NumPy computes the contiguity flags
     * itself from dims/strides, so none need to be forced here. */
    return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, M, 0, 0, NULL);
}
/* Special wrapping case used for matrix C in gemm implementation. */
/* Wraps a raw %(float_type)s buffer as a writeable 2-D NumPy view in
 * Fortran (column-major) order, with *LD as the leading dimension
 * (stride, in elements, between consecutive columns).
 * The returned array borrows `matrix` (no copy, no ownership): the
 * caller must keep the buffer alive while the array is in use. */
inline PyObject* alt_wrap_fortran_writeable_matrix_%(float_type)s(
%(float_type)s* matrix, const int* nrow, const int* ncol, const int* LD
) {
/* Column-major strides: consecutive rows are adjacent in memory,
 * consecutive columns are (*LD) elements apart. */
npy_intp dims[2] = {*nrow, *ncol};
npy_intp strides[2] = {%(float_size)d, (*LD) * %(float_size)d};
/* NPY_ARRAY_WRITEABLE is required because gemm writes its result into
 * this view (it is the destination of a PyArray_CopyInto / in-place sum). */
return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, matrix, 0, NPY_ARRAY_WRITEABLE, NULL);
}
/* %(name)s template code */
void %(name)s(
char* TRANSA, char* TRANSB,
const int* M, const int* N, const int* K,
char* TRANSA, char* TRANSB, const int* M, const int* N, const int* K,
const %(float_type)s* ALPHA, %(float_type)s* A, const int* LDA,
%(float_type)s* B, const int* LDB, const %(float_type)s* BETA,
%(float_type)s* C, const int* LDC) {
%(float_type)s* C, const int* LDC
) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
alt_fatal_error("The integer arguments passed to %(name)s must all be at least 0.");
/* If M or N is null, there is nothing to do with C,
......@@ -102,15 +111,19 @@ void %(name)s(
npy_intp* computation_strides;
npy_intp computation_dims[2] = {*N, *M};
npy_intp default_computation_strides[2] = {(*LDC) * %(float_size)d, %(float_size)d};
if(*BETA == 0) {
/*C is never read, just written, so it will be directly used
* to store the result of ALPHA*op(A)*op(B). */
computation_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_WRITEABLE;
if(*BETA == 0 && *LDC == *M) {
/* BETA == 0, so C is never read.
* LDC == M, so C is contiguous in memory
* (that condition is needed for the dot operation, see below).
* Then we can compute ALPHA*op(A)*op(B) directly in C. */
computation_flags = NPY_ARRAY_WRITEABLE;
computation_pointer = C;
computation_strides = default_computation_strides;
} else {
/* C will be read, so the must allocate a new memory buffer
* to compute op(A)*op(B). */
/* Either BETA != 0 (C will be read)
* or LDC != M (C is not read but is not contiguous in memory).
* Then in both cases, we need to allocate a new memory
* to compute ALPHA*op(A)*op(B). */
computation_flags = 0;
computation_pointer = NULL;
computation_strides = NULL;
......@@ -133,14 +146,20 @@ void %(name)s(
PyArray_MatrixProduct2(opB_transposed, opA_transposed, (PyArrayObject*)opB_trans_dot_opA_trans);
if(*BETA == 0) {
if(*ALPHA != 1.0)
alt_numpy_scale_matrix_inplace_%(float_type)s(*ALPHA, (PyArrayObject*)opB_trans_dot_opA_trans);
alt_numpy_scale_matrix_inplace_%(float_type)s(ALPHA, (PyArrayObject*)opB_trans_dot_opA_trans);
if(*LDC != *M) {
/* A buffer has been created to compute ALPHA*op(A)*op(B),
* so we must copy it to the real output, that is C. */
PyObject* matrix_C = alt_wrap_fortran_writeable_matrix_%(float_type)s(C, M, N, LDC);
PyObject* alpha_opA_dot_opB = alt_op_without_copy_%(float_type)s(0, (%(float_type)s*)PyArray_DATA((PyArrayObject*)opB_trans_dot_opA_trans), *M, *N, *M);
if(0 != PyArray_CopyInto((PyArrayObject*)matrix_C, (PyArrayObject*)alpha_opA_dot_opB))
alt_fatal_error("Numpy %(name)s implementation: unable to copy ALPHA*op(A)*op(B) to C when BETA == 0.");
Py_XDECREF(alpha_opA_dot_opB);
Py_XDECREF(matrix_C);
}
} else {
/* C is read, so we must consider it as F-contiguous. */
npy_intp dims_C[2] = {*M, *N};
npy_intp strides_C[2] = {%(float_size)d, (*LDC) * %(float_size)d};
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_F_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL);
/* Hack: if we use alt_op_without_copy with the right parameters,
* it will returns the transposed version of opB_trans_dot_opA_trans. */
/* C is read, so we must consider it as Fortran-style matrix. */
PyObject* matrix_C = alt_wrap_fortran_writeable_matrix_%(float_type)s(C, M, N, LDC);
PyObject* opA_dot_opB = alt_op_without_copy_%(float_type)s(0, (%(float_type)s*)PyArray_DATA((PyArrayObject*)opB_trans_dot_opA_trans), *M, *N, *M);
alt_numpy_matrix_extended_sum_inplace_%(float_type)s(ALPHA, (PyArrayObject*)opA_dot_opB, BETA, (PyArrayObject*)matrix_C);
Py_XDECREF(opA_dot_opB);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论