提交 24f96fa8 authored 作者: notoraptor's avatar notoraptor

New update! I have integrated the required changes and re-run the tests. All tests pass as before, and some tests are now faster. The biggest gain on my computer is for theano/tensor/nnet/tests/test_corr3d.py, which goes from 687 seconds before to 259 seconds now. For the other tests, the gain is between 3 and 20 seconds. There is now no copy and no memory allocation (apart from NumPy wrapping structures) when BETA == 0. I rewrote the op(matrix) function so that it no longer returns newly allocated data. Instead, it just creates a PyArrayObject wrapper around the matrix pointer with the right format: F-contiguous (nrow * ncol) by default, or C-contiguous (ncol * nrow) if the matrix needs to be transposed. I also rewrote the matrix sum function so that it takes scalars by which to multiply each passed matrix before the addition. The function now computes B = alpha*A + beta*B, with alpha and beta as the scalars (both set to 1 if we just want B = A + B). Thus, there is now only one iteration over A and B, in which A and B are each read once and B is modified once.
上级 a4cc7dbd
/** C Implementation of [sd]gemm_ based on NumPy /** C Implementation of [sd]gemm_ based on NumPy
* Used instead of blas when Theano config flag blas.ldflags is empty. * Used instead of blas when Theano config flag blas.ldflags is empty.
* This file contains the common code for [sd]gemm_. * This file contains the common code for [sd]gemm_.
* File alt_gemm_template.c contains template code for [sd]gemm_. * File alt_gemm_template.c contains template code for [sd]gemm_. **/
**/
inline void alt_fatal_error(const char* message) { #define alt_fatal_error(message) { if(message != NULL) fprintf(stderr, message); exit(-1); }
if(message != NULL) fprintf(stderr, message);
exit(-1); #define alt_trans_to_bool(trans) (*trans != 'N' && *trans != 'n')
}
/* Returns op(matrix) for a GEMM operand selected by the BLAS trans flag:
 * the matrix itself for 'N'/'n', otherwise a transposed view (no data copy;
 * PyArray_Transpose returns a new view object).
 * NOTE(review): for the 'N' case the input pointer is returned with no new
 * reference taken — callers must only DECREF the result when it differs
 * from the input (the caller below does exactly that). */
inline PyObject* alt_op(char* trans, PyArrayObject* matrix) {
    if(*trans == 'N' || *trans == 'n') {
        return (PyObject*)matrix;
    }
    return PyArray_Transpose(matrix, NULL);
}
/**Template code for [sd]gemm_ follows in file alt_gemm_template.c /**Template code for [sd]gemm_ follows in file alt_gemm_template.c
* (as Python string to be used with old formatting). * (as Python string to be used with old formatting).
* PARAMETERS: * PARAMETERS:
* float_type: "float" for sgemm_, "double" for dgemm_. * float_type: "float" for sgemm_, "double" for dgemm_.
* float_size: 4 for float32 (sgemm_), 8 for float64 (dgemm_). * float_size: 4 for float32 (sgemm_), 8 for float64 (dgemm_).
* npy_float: "NPY_FLOAT32" for sgemm_, "NPY_FLOAT64" for dgemm_. * npy_float: "NPY_FLOAT32" for sgemm_, "NPY_FLOAT64" for dgemm_.
* name: usually "sgemm_" for sgemm_, "dgemm_" for dgemm_. * name: "sgemm_" for sgemm_, "dgemm_" for dgemm_.
* See blas_headers.py for current use.**/ * See blas_headers.py for current use.**/
...@@ -22,10 +22,13 @@ void alt_numpy_scalar_matrix_product_in_place_%(float_type)s(%(float_type)s scal ...@@ -22,10 +22,13 @@ void alt_numpy_scalar_matrix_product_in_place_%(float_type)s(%(float_type)s scal
} while(get_next(iterator)); } while(get_next(iterator));
NpyIter_Deallocate(iterator); NpyIter_Deallocate(iterator);
} }
/*Matrix+Matrix function. /*Matrix+Matrix function. Compute (coeffA * matrixA) + (coeffB * matrixB)
* Remark: This function actually sums a C-contiguous matrix (alpha*op(A)*op(B)) with a F-contiguous matrix (beta*C) * Remark: This function actually sums a C-contiguous matrix (alpha*op(A)*op(B)) with a F-contiguous matrix (beta*C)
* (see gemm implementation at next function for more details) */ * (see gemm implementation at next function for more details) */
void alt_numpy_matrix_sum_in_place_%(float_type)s(PyArrayObject* A, PyArrayObject* B) { void alt_numpy_matrix_extended_sum_in_place_%(float_type)s(
const %(float_type)s* ALPHA, PyArrayObject* A,
const %(float_type)s* BETA, PyArrayObject* B
) {
PyArrayObject* op[2] = {A, B}; PyArrayObject* op[2] = {A, B};
npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE}; npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE};
npy_uint32 flags = 0; npy_uint32 flags = 0;
...@@ -37,12 +40,32 @@ void alt_numpy_matrix_sum_in_place_%(float_type)s(PyArrayObject* A, PyArrayObjec ...@@ -37,12 +40,32 @@ void alt_numpy_matrix_sum_in_place_%(float_type)s(PyArrayObject* A, PyArrayObjec
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL); NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
char** data_ptr_array = NpyIter_GetDataPtrArray(iterators); char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
do { do {
char* from_A = data_ptr_array[0]; %(float_type)s* from_A = (%(float_type)s*)data_ptr_array[0];
char* from_B = data_ptr_array[1]; %(float_type)s* from_B = (%(float_type)s*)data_ptr_array[1];
*((%(float_type)s*)from_B) += *((%(float_type)s*)from_A); *from_B = (*ALPHA)*(*from_A) + (*BETA)*(*from_B);
} while(get_next(iterators)); } while(get_next(iterators));
NpyIter_Deallocate(iterators); NpyIter_Deallocate(iterators);
} }
/* Wraps the raw Fortran-layout matrix M (nrow x ncol, leading dimension LDM)
 * in a PyArrayObject view without copying any data.
 * - transposable == 0: an nrow x ncol view of M as stored
 *   (element stride %(float_size)d within a column, LDM*%(float_size)d between columns).
 * - transposable != 0: the transposed ncol x nrow view, i.e. op(M) for a
 *   'T'/'t' trans flag.
 * The caller owns the returned reference (or NULL on failure). */
PyObject* alt_op_without_copy_%(float_type)s(int transposable, %(float_type)s* M, int nrow, int ncol, int LDM) {
    // By default, M is considered as a nrow*ncol F-contiguous matrix with LDM as stride indicator for the columns.
    npy_intp dims[2];
    npy_intp strides[2];
    int flags;
    if(transposable) {
        dims[0] = ncol;
        dims[1] = nrow;
        /* BUGFIX: the long stride of the transposed view is the leading
         * dimension LDM of the stored matrix, not nrow (dims[1]): BLAS
         * allows LDM > nrow, in which case using nrow would address the
         * wrong elements. The view is only truly C-contiguous when
         * LDM == nrow, so only claim the flag then. */
        strides[0] = LDM * %(float_size)d;
        strides[1] = %(float_size)d;
        flags = (LDM == nrow) ? NPY_ARRAY_C_CONTIGUOUS : 0;
    } else {
        dims[0] = nrow;
        dims[1] = ncol;
        strides[0] = %(float_size)d;
        strides[1] = LDM * %(float_size)d;
        /* Same reasoning: the view is F-contiguous only when LDM == nrow. */
        flags = (LDM == nrow) ? NPY_ARRAY_F_CONTIGUOUS : 0;
    }
    return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, M, 0, flags, NULL);
}
/* %(name)s template code */ /* %(name)s template code */
void %(name)s( void %(name)s(
char* TRANSA, char* TRANSB, char* TRANSA, char* TRANSB,
...@@ -53,56 +76,53 @@ void %(name)s( ...@@ -53,56 +76,53 @@ void %(name)s(
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0) if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
return; return;
int nrowa, ncola, nrowb, ncolb; int nrowa, ncola, nrowb, ncolb;
if(*TRANSA == 'N' || *TRANSA == 'n') { int is_A_transposable = alt_trans_to_bool(TRANSA);
nrowa = *M; ncola = *K; int is_B_transposable = alt_trans_to_bool(TRANSB);
} else { if(is_A_transposable) {
nrowa = *K; ncola = *M; nrowa = *K; ncola = *M;
}
if(*TRANSB == 'N' || *TRANSB == 'n') {
nrowb = *K; ncolb = *N;
} else { } else {
nrowa = *M; ncola = *K;
}
if(is_B_transposable) {
nrowb = *N; ncolb = *K; nrowb = *N; ncolb = *K;
} else {
nrowb = *K; ncolb = *N;
} }
npy_intp dims_A[2] = {nrowa, ncola};
npy_intp dims_B[2] = {nrowb, ncolb};
npy_intp dims_C[2] = {*M, *N};
npy_intp strides_A[2] = {%(float_size)d, (*LDA) * %(float_size)d};
npy_intp strides_B[2] = {%(float_size)d, (*LDB) * %(float_size)d};
PyObject* matrix_A = PyArray_New(&PyArray_Type, 2, dims_A, %(npy_float)s, strides_A, A, 0, NPY_ARRAY_F_CONTIGUOUS, NULL);
PyObject* matrix_B = PyArray_New(&PyArray_Type, 2, dims_B, %(npy_float)s, strides_B, B, 0, NPY_ARRAY_F_CONTIGUOUS, NULL);
PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A);
PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B);
if(*BETA == 0) { if(*BETA == 0) {
/*C is never red, just written.*/ /*C is never read, just written.
npy_intp strides_C[2] = {(*N) * %(float_size)d, %(float_size)d}; * matrix_C will be created as C-contiguous because
/*matrix_C is created as C-contiguous because the 3rd parameter of PyArray_MatrixProduct2 (below) expects a C-contiguous array.*/ * the 3rd parameter of PyArray_MatrixProduct2 (used below)
* expects a C-contiguous array.
* Also, to avoid some memory copy, transposition conditions
* for A and B will be reversed, so that C will contain
* C-contiguous op_B_transposed * op_A_transposed (N*M matrix).
* As the calling code will consider C as F-contiguous M*N matrix,
* it will get the transposed of op_B_transposed * op_A_transposed,
* that is op_A * op_B (M*N matrix) as expected. */
npy_intp dims_C[2] = {*N, *M};
npy_intp strides_C[2] = {(*M) * %(float_size)d, %(float_size)d};
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL); PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL);
PyArray_MatrixProduct2(op_A, op_B, (PyArrayObject*)matrix_C); PyObject* op_A_transposed = alt_op_without_copy_%(float_type)s(!is_A_transposable, A, nrowa, ncola, *LDA);
PyObject* op_B_transposed = alt_op_without_copy_%(float_type)s(!is_B_transposable, B, nrowb, ncolb, *LDB);
PyArray_MatrixProduct2(op_B_transposed, op_A_transposed, (PyArrayObject*)matrix_C);
if(*ALPHA != 1.0) if(*ALPHA != 1.0)
alt_numpy_scalar_matrix_product_in_place_%(float_type)s(*ALPHA, (PyArrayObject*)matrix_C); alt_numpy_scalar_matrix_product_in_place_%(float_type)s(*ALPHA, (PyArrayObject*)matrix_C);
/*But it seems Python|NumPy expects C to be F-contiguous at output, so the convert it.*/ Py_XDECREF(op_B_transposed);
PyObject* matrix_C_as_f_contiguous = PyArray_FromAny(matrix_C, PyArray_DESCR((PyArrayObject*)matrix_C), 2, 2, NPY_ARRAY_F_CONTIGUOUS, NULL); Py_XDECREF(op_A_transposed);
if(matrix_C_as_f_contiguous != matrix_C) {
memcpy(C, PyArray_DATA((PyArrayObject*)matrix_C_as_f_contiguous), (*M)*(*N)*sizeof(%(float_type)s));
Py_XDECREF(matrix_C_as_f_contiguous);
}
Py_XDECREF(matrix_C); Py_XDECREF(matrix_C);
} else { } else {
/*C is read, so we must consider it as F-contiguous, as we do for A and B.*/ /*C is read, so we must consider it as F-contiguous, as we do for A and B.*/
npy_intp dims_C[2] = {*M, *N};
npy_intp strides_C[2] = {%(float_size)d, (*LDC) * %(float_size)d}; npy_intp strides_C[2] = {%(float_size)d, (*LDC) * %(float_size)d};
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_F_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL); PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_F_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL);
PyObject* op_A = alt_op_without_copy_%(float_type)s(is_A_transposable, A, nrowa, ncola, *LDA);
PyObject* op_B = alt_op_without_copy_%(float_type)s(is_B_transposable, B, nrowb, ncolb, *LDB);
PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B); PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
if(*ALPHA != 1.0) alt_numpy_matrix_extended_sum_in_place_%(float_type)s(ALPHA, op_A_times_op_B, BETA, (PyArrayObject*)matrix_C);
alt_numpy_scalar_matrix_product_in_place_%(float_type)s(*ALPHA, op_A_times_op_B);
if(*BETA != 1.0)
alt_numpy_scalar_matrix_product_in_place_%(float_type)s(*BETA, (PyArrayObject*)matrix_C);
alt_numpy_matrix_sum_in_place_%(float_type)s(op_A_times_op_B, (PyArrayObject*)matrix_C);
/*C is already F-contiguous, thus no conversion needed for output.*/ /*C is already F-contiguous, thus no conversion needed for output.*/
Py_XDECREF(op_A_times_op_B); Py_XDECREF(op_A_times_op_B);
Py_XDECREF(op_B);
Py_XDECREF(op_A);
Py_XDECREF(matrix_C); Py_XDECREF(matrix_C);
} }
if(op_B != matrix_B) Py_XDECREF(op_B);
if(op_A != matrix_A) Py_XDECREF(op_A);
Py_XDECREF(matrix_B);
Py_XDECREF(matrix_A);
} }
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论