Commit 9ca9474b authored by notoraptor

Required corrections and modifications are done.

Recall: the code is tested with: $ theano-cache purge && THEANO_FLAGS=blas.ldflags= nosetests theano/tensor/nnet/tests/test_abstract_conv.py:TestCorrConv2d
NB: 1) dgemm_ is never called in these tests; only sgemm_ is called. 2) All LDA, LDB, LDC values are always in the set {M, N, K} during these tests.
Parent 6b3afd89
/** C implementation of dgemm_ based on NumPy.
 * Used instead of BLAS when the Theano config flag blas.ldflags is empty.
 * PS: For further comments, see the equivalent functions in alt_sgemm.c.
 **/
void alt_double_scalar_matrix_product_in_place(double scalar, double* matrix, int size_to_compute) { void alt_numpy_double_scalar_matrix_product_in_place(double scalar, PyArrayObject* matrix) {
for(int i = 0; i < size_to_compute; ++i) { NpyIter* iterator = NpyIter_New(matrix,
matrix[i] *= scalar; NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
NPY_KEEPORDER, NPY_NO_CASTING, NULL);
if(iterator == NULL)
alt_fatal_error("Unable to iterate over a matrix for a scalar * matrix operation.");
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL);
char** data_ptr = NpyIter_GetDataPtrArray(iterator);
npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator);
npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator);
do {
char* data = *data_ptr;
npy_intp stride = *stride_ptr;
npy_intp count = *innersize_ptr;
while(count) {
double new_value = scalar * (*((double*)data));
memcpy(data, &new_value, sizeof(double));
data += stride;
--count;
} }
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
} }
/* Element-wise `out = A + B` for float64 (NPY_FLOAT64) arrays.
 * NB: assumes A, B and out all have the same dimensions — TODO confirm at call sites.
 * Aborts the process via alt_fatal_error on iterator-creation failure. */
void alt_numpy_double_matrix_sum(PyArrayObject* A, PyArrayObject* B, PyArrayObject* out) {
    PyArrayObject* op[3] = {A, B, out};
    npy_uint32 op_flags[3] = {NPY_ITER_READONLY, NPY_ITER_READONLY, NPY_ITER_WRITEONLY};
    npy_uint32 flags = NPY_ITER_EXTERNAL_LOOP;
    NpyIter* iterators = NpyIter_MultiNew(3, op, flags, NPY_KEEPORDER, NPY_NO_CASTING, op_flags, NULL);
    if(iterators == NULL)
        alt_fatal_error("Unable to iterate over some matrices for matrix + matrix operation.");
    NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
    /* NpyIter_GetIterNext may return NULL with a Python error set. */
    if(get_next == NULL) {
        NpyIter_Deallocate(iterators);
        alt_fatal_error("Unable to get an iteration function for matrix + matrix operation.");
    }
    /* Each operand has its own inner stride: do not assume they are all equal
     * (the previous code used operand 0's stride for all three operands). */
    npy_intp* strides = NpyIter_GetInnerStrideArray(iterators);
    npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterators);
    char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
    do {
        char* from_A = data_ptr_array[0];
        char* from_B = data_ptr_array[1];
        char* from_out = data_ptr_array[2];
        npy_intp size = *innersize_ptr;
        for(npy_intp i = 0; i < size; ++i, from_A += strides[0], from_B += strides[1], from_out += strides[2]) {
            double sum = *((double*)from_A);
            sum += *((double*)from_B);
            /* memcpy: the destination may not be aligned for a direct double store. */
            memcpy(from_out, &sum, sizeof(double));
        }
    } while(get_next(iterators));
    NpyIter_Deallocate(iterators);
}
/* dgemm /* dgemm */
* NB: See sgemm_ (in alt_sgemm.c) for same assumptions.
* */
void dgemm_(char* TRANSA, char* TRANSB, void dgemm_(char* TRANSA, char* TRANSB,
const int* M, const int* N, const int* K, const int* M, const int* N, const int* K,
const double* ALPHA, double* A, const int* LDA, const double* ALPHA, double* A, const int* LDA,
...@@ -22,38 +57,45 @@ void dgemm_(char* TRANSA, char* TRANSB, ...@@ -22,38 +57,45 @@ void dgemm_(char* TRANSA, char* TRANSB,
double* C, const int* LDC) { double* C, const int* LDC) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0) if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
return; return;
if(C == NULL) int nrowa, ncola, nrowb, ncolb;
return; if(*TRANSA == 'N' || *TRANSA == 'n') {
int ka, kb; nrowa = *M;
if(*TRANSA == 'N' || *TRANSA == 'n') ncola = *K;
ka = *K; } else {
else nrowa = *K;
ka = *M; ncola = *M;
if(*TRANSB == 'N' || *TRANSB == 'n') }
kb = *N; if(*TRANSB == 'N' || *TRANSB == 'n') {
else nrowb = *K;
kb = *K; ncolb = *N;
npy_intp dims_A[2] = {*LDA, ka}; } else {
npy_intp dims_B[2] = {*LDB, kb}; nrowb = *N;
PyObject* matrix_A = PyArray_SimpleNewFromData(2, dims_A, NPY_FLOAT64, A); ncolb = *K;
PyObject* matrix_B = PyArray_SimpleNewFromData(2, dims_B, NPY_FLOAT64, B); }
npy_intp dims_A[2] = {nrowa, ncola};
npy_intp dims_B[2] = {nrowb, ncolb};
npy_intp dims_C[2] = {*M, *N};
npy_intp strides_A[2] = {ncola*8, 8};
npy_intp strides_B[2] = {ncolb*8, 8};
npy_intp strides_C[2] = {(*N)*8, 8};
PyObject* matrix_A = PyArray_New(&PyArray_Type, 2, dims_A, NPY_FLOAT64, strides_A, A, 0, 0, NULL);
PyObject* matrix_B = PyArray_New(&PyArray_Type, 2, dims_B, NPY_FLOAT64, strides_B, B, 0, 0, NULL);
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, NPY_FLOAT64, strides_C, C, 0, NPY_ARRAY_WRITEABLE, NULL);
PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A); PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A);
PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B); PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B);
if(*BETA == 0) { if(*BETA == 0) {
npy_intp dims_C[2] = {*LDC, *N}; PyArray_MatrixProduct2(op_A, op_B, (PyArrayObject*)matrix_C);
PyObject* matrix_C = PyArray_SimpleNewFromData(2, dims_C, NPY_FLOAT64, C); alt_numpy_double_scalar_matrix_product_in_place(*ALPHA, (PyArrayObject*)matrix_C);
alt_matrix_matrix_product2(op_A, op_B, matrix_C);
alt_double_scalar_matrix_product_in_place(*ALPHA, C, (*M) * (*N));
Py_XDECREF(matrix_C);
} else { } else {
PyArrayObject* op_A_times_op_B = (PyArrayObject*)alt_matrix_matrix_product(op_A, op_B); PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
alt_double_scalar_matrix_product_in_place(*ALPHA, (double*)PyArray_DATA(op_A_times_op_B), (*M) * (*N)); alt_numpy_double_scalar_matrix_product_in_place(*ALPHA, op_A_times_op_B);
alt_double_scalar_matrix_product_in_place(*BETA, C, (*M) * (*N)); alt_numpy_double_scalar_matrix_product_in_place(*BETA, (PyArrayObject*)matrix_C);
alt_double_matrix_sum_in_place((double*)PyArray_DATA(op_A_times_op_B), C, C, (*M) * (*N)); alt_numpy_double_matrix_sum(op_A_times_op_B, (PyArrayObject*)matrix_C, (PyArrayObject*)matrix_C);
Py_XDECREF(op_A_times_op_B); Py_XDECREF(op_A_times_op_B);
} }
if(op_B != matrix_B) Py_XDECREF(op_B); if(op_B != matrix_B) Py_XDECREF(op_B);
if(op_A != matrix_A) Py_XDECREF(op_A); if(op_A != matrix_A) Py_XDECREF(op_A);
Py_XDECREF(matrix_C);
Py_XDECREF(matrix_B); Py_XDECREF(matrix_B);
Py_XDECREF(matrix_A); Py_XDECREF(matrix_A);
} }
/** C implementation of sgemm_ based on NumPy.
 * Used instead of BLAS when the Theano config flag blas.ldflags is empty.
 * PS: Comments are the same for the equivalent functions in alt_dgemm.c.
 **/
/* Print an optional message and terminate the process.
 * Used when a NumPy iterator or array cannot be created: there is no way to
 * report failure through the void BLAS-style gemm interface. */
void alt_fatal_error(const char* message) {
    /* Report on stderr so the message is not lost in buffered stdout. */
    if(message != NULL)
        fprintf(stderr, "%s\n", message);
    /* EXIT_FAILURE is portable; exit(-1) yields an implementation-defined status. */
    exit(EXIT_FAILURE);
}
inline PyObject* alt_matrix_matrix_product(PyObject* o1, PyObject* o2) { void alt_numpy_scalar_matrix_product_in_place(float scalar, PyArrayObject* matrix) {
return PyArray_MatrixProduct(o1, o2); // Get an iterator on matrix.
} NpyIter* iterator = NpyIter_New(matrix,
inline PyObject* alt_matrix_matrix_product2(PyObject* o1, PyObject* o2, PyObject* out) { NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
return PyArray_MatrixProduct2(o1, o2, (PyArrayObject*)out); NPY_KEEPORDER, NPY_NO_CASTING, NULL);
} if(iterator == NULL)
void alt_scalar_matrix_product_in_place(float scalar, float* matrix, int size_to_compute) { alt_fatal_error("Unable to iterate over a matrix for a scalar * matrix operation.");
for(int i = 0; i < size_to_compute; ++i) { NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL);
matrix[i] *= scalar; char** data_ptr = NpyIter_GetDataPtrArray(iterator);
npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator);
npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator);
do {
char* data = *data_ptr;
npy_intp stride = *stride_ptr;
npy_intp count = *innersize_ptr;
while(count) {
float new_value = scalar * (*((float*)data));
memcpy(data, &new_value, sizeof(float));
data += stride;
--count;
} }
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
} }
/* Element-wise `out = A + B` for float32 (NPY_FLOAT32) arrays.
 * NB: It may be better to check that A, B and out have the same dimensions;
 * for now, we make this assumption.
 * Aborts the process via alt_fatal_error on iterator-creation failure. */
void alt_numpy_matrix_sum(PyArrayObject* A, PyArrayObject* B, PyArrayObject* out) {
    PyArrayObject* op[3] = {A, B, out};
    npy_uint32 op_flags[3] = {NPY_ITER_READONLY, NPY_ITER_READONLY, NPY_ITER_WRITEONLY};
    npy_uint32 flags = NPY_ITER_EXTERNAL_LOOP;
    NpyIter* iterators = NpyIter_MultiNew(3, op, flags, NPY_KEEPORDER, NPY_NO_CASTING, op_flags, NULL);
    if(iterators == NULL)
        alt_fatal_error("Unable to iterate over some matrices for matrix + matrix operation.");
    NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
    /* NpyIter_GetIterNext may return NULL with a Python error set. */
    if(get_next == NULL) {
        NpyIter_Deallocate(iterators);
        alt_fatal_error("Unable to get an iteration function for matrix + matrix operation.");
    }
    /* Each operand has its own inner stride: do not assume they are all equal
     * (the previous code used operand 0's stride for all three operands). */
    npy_intp* strides = NpyIter_GetInnerStrideArray(iterators);
    npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterators);
    char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
    do {
        char* from_A = data_ptr_array[0];
        char* from_B = data_ptr_array[1];
        char* from_out = data_ptr_array[2];
        npy_intp size = *innersize_ptr;
        for(npy_intp i = 0; i < size; ++i, from_A += strides[0], from_B += strides[1], from_out += strides[2]) {
            float sum = *((float*)from_A);
            sum += *((float*)from_B);
            /* memcpy: the destination may not be aligned for a direct float store. */
            memcpy(from_out, &sum, sizeof(float));
        }
    } while(get_next(iterators));
    NpyIter_Deallocate(iterators);
}
/* Return `matrix` itself (borrowed) for 'N'/'n', otherwise a new reference to
 * its transpose. Callers must DECREF the result only when it differs from the
 * input matrix. */
inline PyObject* alt_op(char* trans, PyArrayObject* matrix) {
    if(*trans == 'N' || *trans == 'n')
        return (PyObject*)matrix;
    return PyArray_Transpose(matrix, NULL);
}
/* sgemm
 * Recall: operation performed:
 * C = ALPHA * op(TRANSA, A) * op(TRANSB, B) + BETA * C
 * NB: We assume that none of the 13 pointers passed as arguments is null.
 * NB: We could optimize this function further (for example, when ALPHA == 0).
 * */
void sgemm_(char* TRANSA, char* TRANSB, void sgemm_(char* TRANSA, char* TRANSB,
const int* M, const int* N, const int* K, const int* M, const int* N, const int* K,
...@@ -35,43 +72,59 @@ void sgemm_(char* TRANSA, char* TRANSB, ...@@ -35,43 +72,59 @@ void sgemm_(char* TRANSA, char* TRANSB,
float* C, const int* LDC) { float* C, const int* LDC) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0) if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
return; return;
if(C == NULL)
return;
/* Recall: /* Recall:
* A is a *LDA by *ka matrix. * op(A) is a m by k matrix.
* B is a *LDB by *kb matrix. * op(B) is a k by n matrix.
* C is a *LDC By *N matrix. * C is a m by n matrix.
*/ */
int ka, kb; int nrowa, ncola, nrowb, ncolb;
if(*TRANSA == 'N' || *TRANSA == 'n') if(*TRANSA == 'N' || *TRANSA == 'n') {
ka = *K; nrowa = *M;
else ncola = *K;
ka = *M; } else {
if(*TRANSB == 'N' || *TRANSB == 'n') nrowa = *K;
kb = *N; ncola = *M;
else }
kb = *K; if(*TRANSB == 'N' || *TRANSB == 'n') {
npy_intp dims_A[2] = {*LDA, ka}; nrowb = *K;
npy_intp dims_B[2] = {*LDB, kb}; ncolb = *N;
PyObject* matrix_A = PyArray_SimpleNewFromData(2, dims_A, NPY_FLOAT32, A); } else {
PyObject* matrix_B = PyArray_SimpleNewFromData(2, dims_B, NPY_FLOAT32, B); nrowb = *N;
ncolb = *K;
}
npy_intp dims_A[2] = {nrowa, ncola};
npy_intp dims_B[2] = {nrowb, ncolb};
npy_intp dims_C[2] = {*M, *N};
/*NB: It seems that A,B and C are always row-major matrices, thus
* the stride for the 1st dimension (from a row to the next row) only depends on the number of elements in a row
* (that is, the size of the 2nd dimension, which is ncola for A, ncolb for B, and *N for C),
* and the stride for the 2nd dimension (from a column to the next column) is always the size of one element
* (that is, 4 bytes for a float32 matrix, 8 bytes for a float64 matrix).
* Then, LDA, LDB and LDC seems totally useless in the strides calcuulations.
* For LDA,LDB,LDC to be taken account, we need to have column-major matrices,
* which seems never to happen. */
npy_intp strides_A[2] = {ncola*4, 4};
npy_intp strides_B[2] = {ncolb*4, 4};
npy_intp strides_C[2] = {(*N)*4, 4};
/*NB: in fact, we could replace the strides with NULL as argument in the 3 following lines.*/
PyObject* matrix_A = PyArray_New(&PyArray_Type, 2, dims_A, NPY_FLOAT32, strides_A, A, 0, 0, NULL);
PyObject* matrix_B = PyArray_New(&PyArray_Type, 2, dims_B, NPY_FLOAT32, strides_B, B, 0, 0, NULL);
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, NPY_FLOAT32, strides_C, C, 0, NPY_ARRAY_WRITEABLE, NULL);
PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A); PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A);
PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B); PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B);
if(*BETA == 0) { if(*BETA == 0) {
npy_intp dims_C[2] = {*LDC, *N}; PyArray_MatrixProduct2(op_A, op_B, (PyArrayObject*)matrix_C);
PyObject* matrix_C = PyArray_SimpleNewFromData(2, dims_C, NPY_FLOAT32, C); alt_numpy_scalar_matrix_product_in_place(*ALPHA, (PyArrayObject*)matrix_C);
alt_matrix_matrix_product2(op_A, op_B, matrix_C);
alt_scalar_matrix_product_in_place(*ALPHA, C, (*M) * (*N));
Py_XDECREF(matrix_C);
} else { } else {
PyArrayObject* op_A_times_op_B = (PyArrayObject*)alt_matrix_matrix_product(op_A, op_B); PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
alt_scalar_matrix_product_in_place(*ALPHA, (float*)PyArray_DATA(op_A_times_op_B), (*M) * (*N)); alt_numpy_scalar_matrix_product_in_place(*ALPHA, op_A_times_op_B);
alt_scalar_matrix_product_in_place(*BETA, C, (*M) * (*N)); alt_numpy_scalar_matrix_product_in_place(*BETA, (PyArrayObject*)matrix_C);
alt_matrix_sum_in_place((float*)PyArray_DATA(op_A_times_op_B), C, C, (*M) * (*N)); alt_numpy_matrix_sum(op_A_times_op_B, (PyArrayObject*)matrix_C, (PyArrayObject*)matrix_C);
Py_XDECREF(op_A_times_op_B); Py_XDECREF(op_A_times_op_B);
} }
if(op_B != matrix_B) Py_XDECREF(op_B); if(op_B != matrix_B) Py_XDECREF(op_B);
if(op_A != matrix_A) Py_XDECREF(op_A); if(op_A != matrix_A) Py_XDECREF(op_A);
Py_XDECREF(matrix_C);
Py_XDECREF(matrix_B); Py_XDECREF(matrix_B);
Py_XDECREF(matrix_A); Py_XDECREF(matrix_A);
} }
...@@ -9,6 +9,7 @@ import logging ...@@ -9,6 +9,7 @@ import logging
import textwrap import textwrap
import sys import sys
import os import os
from os.path import dirname, normpath
from theano import config from theano import config
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
...@@ -734,7 +735,6 @@ def blas_header_text(): ...@@ -734,7 +735,6 @@ def blas_header_text():
const = "const" const = "const"
if not config.blas.ldflags: if not config.blas.ldflags:
# Include the Numpy version implementation of sgemm_ and dgemm_ from alt_sgemm.c and alt_dgemm.c # Include the Numpy version implementation of sgemm_ and dgemm_ from alt_sgemm.c and alt_dgemm.c
from os.path import dirname, normpath
current_filedir = dirname(__file__) current_filedir = dirname(__file__)
sgemm_filepath = normpath(current_filedir + "/alt_sgemm.c") sgemm_filepath = normpath(current_filedir + "/alt_sgemm.c")
dgemm_filepath = normpath(current_filedir + "/alt_dgemm.c") dgemm_filepath = normpath(current_filedir + "/alt_dgemm.c")
...@@ -745,10 +745,10 @@ def blas_header_text(): ...@@ -745,10 +745,10 @@ def blas_header_text():
with open(dgemm_filepath) as code: with open(dgemm_filepath) as code:
dgemm_code = code.read() dgemm_code = code.read()
if not sgemm_code or not dgemm_code: if not sgemm_code or not dgemm_code:
_logger.warning("Unable to load Numpy implementation of gemm code from C source files.") raise IOError("Unable to load NumPy implementation of gemm code from C source files.")
else: else:
const = "" const = ""
# _logger.warning("Numpy implementation of gemm code loaded (config.blas.ldflags is empty)") # _logger.info("Numpy implementation of gemm code loaded (config.blas.ldflags is empty)")
gemm_code += sgemm_code gemm_code += sgemm_code
gemm_code += dgemm_code gemm_code += dgemm_code
......
...@@ -63,7 +63,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -63,7 +63,7 @@ class BaseCorrMM(gof.OpenMPOp):
self.filter_dilation = tuple(filter_dilation) self.filter_dilation = tuple(filter_dilation)
if not theano.config.blas.ldflags: if not theano.config.blas.ldflags:
# raise NotImplementedError("C code for corrMM* classes need a blas library.") # Theano will use a NumPy C implementation of [sd]gemm_ instead.
self.blas_type = '' self.blas_type = ''
else: else:
if 'openblas' in theano.config.blas.ldflags: if 'openblas' in theano.config.blas.ldflags:
......
...@@ -72,7 +72,9 @@ compile.optdb.register('local_inplace_sparse_block_outer', ...@@ -72,7 +72,9 @@ compile.optdb.register('local_inplace_sparse_block_outer',
# Conv opts # Conv opts
@local_optimizer([AbstractConv2d]) @local_optimizer([AbstractConv2d])
def local_abstractconv_gemm(node): def local_abstractconv_gemm(node):
if theano.config.cxx == "": # or not theano.config.blas.ldflags: # If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return return
if not isinstance(node.op, AbstractConv2d): if not isinstance(node.op, AbstractConv2d):
return None return None
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论