提交 a4eb981d authored 作者: notoraptor's avatar notoraptor

It seems everything works well now!

Tests passed (see details below) (with blas.ldflags empty and ldflags skipping removed from files): tensor/nnet/tests/test_corr.py tensor/nnet/tests/test_corr3d.py tensor/tests/test_blas.py tensor/tests/test_blas_scipy.py tensor/tests/test_blas_c.py (28 tests skipped) tensor/nnet/tests/test_abstract_conv.py:TestCorrConv2d tensor/nnet/tests/test_abstract_conv.py:TestCorrConv3d tensor/nnet/tests/test_abstract_conv.py:TestAbstractConvNoOptim tensor/nnet/tests/test_abstract_conv.py:TestCpuConv2d (252 tests skipped) tensor/nnet/tests/test_abstract_conv.py:TestCpuConv3d (60 tests skipped) tensor/nnet/tests/test_abstract_conv.py:TestBilinearUpsampling __ $ theano-cache purge && THEANO_FLAGS=optdb.max_use_ratio=7,blas.ldflags= nosetests --verbose theano/tensor/nnet/tests/test_corr.py Tests that basic correlations work for odd and even ... ok Checks dtype upcast for CorrMM methods. ... ok Tests correlation where filter dilation != (1,1) ... ok Tests basic correlation in full mode and case where filter ... ok test_img_kernel_same_shape (theano.tensor.nnet.tests.test_corr.TestCorr2D) ... ok test_infer_shape_forward (theano.tensor.nnet.tests.test_corr.TestCorr2D) ... ok test_infer_shape_gradI (theano.tensor.nnet.tests.test_corr.TestCorr2D) ... ok test_infer_shape_gradW (theano.tensor.nnet.tests.test_corr.TestCorr2D) ... ok Tests scenario where filter_shape[1] != input_shape[1] ... ok test_non_contiguous (theano.tensor.nnet.tests.test_corr.TestCorr2D) ... ok Tests correlation where the {image,filter}_shape is a Constant tensor. ... ok Tests correlation where subsampling != (1,1) ... ok Make sure errors are raised when image and kernel are not 4D tensors ... ok ---------------------------------------------------------------------- Ran 13 tests in 167.377s OK $ theano-cache purge && THEANO_FLAGS=optdb.max_use_ratio=10,blas.ldflags= nosetests --verbose theano/tensor/nnet/tests/test_corr3d.py Tests that basic correlations work for odd and even ... 
ok Checks dtype upcast for Corr3dMM methods. ... ok Tests correlation where filter dilation != (1,1,1) ... ok Tests basic correlation in full mode and case where filter ... ok test_img_kernel_same_shape (theano.tensor.nnet.tests.test_corr3d.TestCorr3D) ... ok test_infer_shape_forward (theano.tensor.nnet.tests.test_corr3d.TestCorr3D) ... ok test_infer_shape_gradI (theano.tensor.nnet.tests.test_corr3d.TestCorr3D) ... ok test_infer_shape_gradW (theano.tensor.nnet.tests.test_corr3d.TestCorr3D) ... ok Tests scenario where filter_shape[1] != input_shape[1] ... ok test_non_contiguous (theano.tensor.nnet.tests.test_corr3d.TestCorr3D) ... ok Tests correlation where the {image,filter}_shape is a Constant tensor. ... ok Tests correlation where subsampling != (1,1,1) ... ok Make sure errors are raised when image and kernel are not 5D tensors ... ok ---------------------------------------------------------------------- Ran 13 tests in 687.905s OK $ theano-cache purge && THEANO_FLAGS=optdb.max_use_ratio=10,blas.ldflags= nosetests theano/tensor/tests/test_blas.py .............................................................................................................. ---------------------------------------------------------------------- Ran 110 tests in 69.618s OK $ theano-cache purge && THEANO_FLAGS=optdb.max_use_ratio=10,blas.ldflags= nosetests theano/tensor/tests/test_blas_scipy.py .............. 
---------------------------------------------------------------------- Ran 14 tests in 16.113s OK $ theano-cache purge && THEANO_FLAGS=optdb.max_use_ratio=10,blas.ldflags= nosetests theano/tensor/tests/test_blas_c.py ...........S.S.SSSSSSSSSSSSSSSSSSSSSSSS...SS ---------------------------------------------------------------------- Ran 44 tests in 14.716s OK (SKIP=28) $ theano-cache purge && THEANO_FLAGS=optdb.max_use_ratio=10,blas.ldflags= nosetests theano/tensor/nnet/tests/test_abstract_conv.py:TestCorrConv2d .................................................................................................................................................................................................................................................................................................................................................................................................................. ---------------------------------------------------------------------- Ran 402 tests in 589.767s OK $ theano-cache purge && THEANO_FLAGS=optdb.max_use_ratio=10,blas.ldflags= nosetests theano/tensor/nnet/tests/test_abstract_conv.py:TestCorrConv3d .................................................................................................. ---------------------------------------------------------------------- Ran 98 tests in 302.220s OK $ theano-cache purge && THEANO_FLAGS=blas.ldflags= nosetests theano/tensor/nnet/tests/test_abstract_conv.py:TestAbstractConvNoOptim .................... 
---------------------------------------------------------------------- Ran 20 tests in 93.374s OK $ theano-cache purge && THEANO_FLAGS=blas.ldflags= nosetests theano/tensor/nnet/tests/test_abstract_conv.py:TestCpuConv2d .......................SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.........................SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.........................SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.........................SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.........................SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.........................SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.. ---------------------------------------------------------------------- Ran 402 tests in 137.067s OK (SKIP=252) # 252 SKIPs pour la même raison: SKIP: No dilation implementation for basic cpu ConvOp. # (test_abstract_conv.py, ligne 494) $ theano-cache purge && THEANO_FLAGS=blas.ldflags= nosetests theano/tensor/nnet/tests/test_abstract_conv.py:TestCpuConv3d .................SSSSSSSSSSSSSSSSSSSSSSSSSSSSSS...................SSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.. ---------------------------------------------------------------------- Ran 98 tests in 44.181s OK (SKIP=60) # 60 SKIPs pour la même raison: SKIP: No dilation implementation for basic cpu Conv3D. # (test_abstract_conv.py, ligne 688) $ theano-cache purge && THEANO_FLAGS=blas.ldflags= nosetests theano/tensor/nnet/tests/test_abstract_conv.py:TestBilinearUpsampling ..... ---------------------------------------------------------------------- Ran 5 tests in 29.046s OK
上级 bc911254
/** C Implementation of [sd]gemm_ based on NumPy
* Used instead of blas when Theano config flag blas.ldflags is empty.
* This file contains the common code for [sd]gemm_.
* File alt_gemm_template.c contains template code for [sd]gemm_.
**/
/* Print a fatal error message to stderr and abort the process.
 * The message is passed through a "%s" format so that any '%' character
 * in the message is printed literally instead of being interpreted as a
 * printf conversion specifier (the original `fprintf(stderr, message)`
 * was a classic format-string bug). */
inline void alt_fatal_error(const char* message) {
    if(message != NULL) fprintf(stderr, "%s\n", message);
    exit(-1);
}
/* Return the operand actually used by gemm for one input matrix:
 * the matrix itself when trans is 'N'/'n' (no transpose), otherwise a
 * transposed view obtained from NumPy.
 * NOTE(review): PyArray_Transpose presumably returns a new reference
 * while the 'N' branch returns a borrowed one — callers appear to
 * account for this (they DECREF only when op != matrix); confirm. */
inline PyObject* alt_op(char* trans, PyArrayObject* matrix) {
    if(*trans == 'N' || *trans == 'n') {
        return (PyObject*)matrix;
    }
    return PyArray_Transpose(matrix, NULL);
}
/**Template code for [sd]gemm_ follows in file alt_gemm_template.c
* (as Python string to be used with old formatting).
* PARAMETERS:
* float_type: "float" for sgemm_, "double" for dgemm_.
* float_size: 4 for float32 (sgemm_), 8 for float64 (dgemm_).
* npy_float: "NPY_FLOAT32" for sgemm_, "NPY_FLOAT64" for dgemm_.
* name: usually "sgemm_" for sgemm_, "dgemm_" for dgemm_.
* See blas_headers.py for current use.**/
/** C Implementation of dgemm_ based on NumPy
* Used instead of blas when Theano config flag blas.ldflags is empty.
* PS: For further comments, see equivalent functions in alt_sgemm.c.
**/
void alt_numpy_double_scalar_matrix_product_in_place(double scalar, PyArrayObject* matrix) {
/** %(name)s **/
void alt_numpy_scalar_matrix_product_in_place_%(float_type)s(%(float_type)s scalar, PyArrayObject* matrix) {
NpyIter* iterator = NpyIter_New(matrix,
NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
NPY_KEEPORDER, NPY_NO_CASTING, NULL);
if(iterator == NULL)
alt_fatal_error("Unable to iterate over a matrix for a scalar * matrix operation.");
alt_fatal_error("Unable to iterate over a matrix "
"for a scalar * matrix operation.");
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL);
char** data_ptr = NpyIter_GetDataPtrArray(iterator);
npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator);
......@@ -17,88 +15,94 @@ void alt_numpy_double_scalar_matrix_product_in_place(double scalar, PyArrayObjec
npy_intp stride = *stride_ptr;
npy_intp count = *innersize_ptr;
while(count) {
double new_value = scalar * (*((double*)data));
memcpy(data, &new_value, sizeof(double));
*((%(float_type)s*)data) *= scalar;
data += stride;
--count;
}
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
}
void alt_numpy_double_matrix_sum(PyArrayObject* A, PyArrayObject* B, PyArrayObject* out) {
PyArrayObject* op[3] = {A, B, out};
npy_uint32 op_flags[3] = {NPY_ITER_READONLY, NPY_ITER_READONLY, NPY_ITER_WRITEONLY};
npy_uint32 flags = NPY_ITER_EXTERNAL_LOOP;
NpyIter* iterators = NpyIter_MultiNew(3, op, flags, NPY_KEEPORDER, NPY_NO_CASTING, op_flags, NULL);
/*Matrix+Matrix function.
* Remark: This function actually sums a C-contiguous matrix (alpha*op(A)*op(B)) with a F-contiguous matrix (beta*C)
* (see gemm implementation at next function for more details) */
void alt_numpy_matrix_sum_in_place_%(float_type)s(PyArrayObject* A, PyArrayObject* B) {
PyArrayObject* op[2] = {A, B};
npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE};
npy_uint32 flags = 0;
NpyIter* iterators = NpyIter_MultiNew(
2, op, flags, NPY_CORDER, NPY_NO_CASTING, op_flags, NULL);
if(iterators == NULL)
alt_fatal_error("Unable to iterate over some matrices for matrix + matrix operation.");
alt_fatal_error("Unable to iterate over some matrices "
"for matrix + matrix operation.");
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
npy_intp innerstride = NpyIter_GetInnerStrideArray(iterators)[0];
npy_intp *innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterators);
char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
do {
char* from_A = data_ptr_array[0];
char* from_B = data_ptr_array[1];
char* from_out = data_ptr_array[2];
npy_intp size = *innersize_ptr;
for(npy_intp i = 0; i < size; ++i, from_A += innerstride, from_B += innerstride, from_out += innerstride) {
double sum = *((double*)from_A);
sum += *((double*)from_B);
memcpy(from_out, &sum, sizeof(double));
}
*((%(float_type)s*)from_B) += *((%(float_type)s*)from_A);
} while(get_next(iterators));
NpyIter_Deallocate(iterators);
}
/* dgemm */
void dgemm_(char* TRANSA, char* TRANSB,
const int* M, const int* N, const int* K,
const double* ALPHA, double* A, const int* LDA,
double* B, const int* LDB, const double* BETA,
double* C, const int* LDC) {
/* %(name)s template code */
void %(name)s(
char* TRANSA, char* TRANSB,
const int* M, const int* N, const int* K,
const %(float_type)s* ALPHA, %(float_type)s* A, const int* LDA,
%(float_type)s* B, const int* LDB, const %(float_type)s* BETA,
%(float_type)s* C, const int* LDC) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
return;
int nrowa, ncola, nrowb, ncolb;
if(*TRANSA == 'N' || *TRANSA == 'n') {
nrowa = *M;
ncola = *K;
nrowa = *M; ncola = *K;
} else {
nrowa = *K;
ncola = *M;
nrowa = *K; ncola = *M;
}
if(*TRANSB == 'N' || *TRANSB == 'n') {
nrowb = *K;
ncolb = *N;
nrowb = *K; ncolb = *N;
} else {
nrowb = *N;
ncolb = *K;
nrowb = *N; ncolb = *K;
}
npy_intp dims_A[2] = {nrowa, ncola};
npy_intp dims_B[2] = {nrowb, ncolb};
npy_intp dims_C[2] = {*M, *N};
npy_intp strides_A[2] = {ncola*8, 8};
npy_intp strides_B[2] = {ncolb*8, 8};
npy_intp strides_C[2] = {(*N)*8, 8};
PyObject* matrix_A = PyArray_New(&PyArray_Type, 2, dims_A, NPY_FLOAT64, strides_A, A, 0, 0, NULL);
PyObject* matrix_B = PyArray_New(&PyArray_Type, 2, dims_B, NPY_FLOAT64, strides_B, B, 0, 0, NULL);
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, NPY_FLOAT64, strides_C, C, 0, NPY_ARRAY_WRITEABLE, NULL);
npy_intp strides_A[2] = {%(float_size)d, (*LDA) * %(float_size)d};
npy_intp strides_B[2] = {%(float_size)d, (*LDB) * %(float_size)d};
PyObject* matrix_A = PyArray_New(&PyArray_Type, 2, dims_A, %(npy_float)s, strides_A, A, 0, NPY_ARRAY_F_CONTIGUOUS, NULL);
PyObject* matrix_B = PyArray_New(&PyArray_Type, 2, dims_B, %(npy_float)s, strides_B, B, 0, NPY_ARRAY_F_CONTIGUOUS, NULL);
PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A);
PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B);
if(*BETA == 0) {
/*C is never red, just written.*/
npy_intp strides_C[2] = {(*N) * %(float_size)d, %(float_size)d};
/*matrix_C is created as C-contiguous because the 3rd parameter of PyArray_MatrixProduct2 (below) expects a C-contiguous array.*/
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL);
PyArray_MatrixProduct2(op_A, op_B, (PyArrayObject*)matrix_C);
if(*ALPHA != 1.0)
alt_numpy_double_scalar_matrix_product_in_place(*ALPHA, (PyArrayObject*)matrix_C);
alt_numpy_scalar_matrix_product_in_place_%(float_type)s(*ALPHA, (PyArrayObject*)matrix_C);
/*But it seems Python|NumPy expects C to be F-contiguous at output, so the convert it.*/
PyObject* matrix_C_as_f_contiguous = PyArray_FromAny(matrix_C, PyArray_DESCR((PyArrayObject*)matrix_C), 2, 2, NPY_ARRAY_F_CONTIGUOUS, NULL);
if(matrix_C_as_f_contiguous != matrix_C) {
memcpy(C, PyArray_DATA((PyArrayObject*)matrix_C_as_f_contiguous), (*M)*(*N)*sizeof(%(float_type)s));
Py_XDECREF(matrix_C_as_f_contiguous);
}
Py_XDECREF(matrix_C);
} else {
/*C is read, so we must consider it as F-contiguous, as we do for A and B.*/
npy_intp strides_C[2] = {%(float_size)d, (*LDC) * %(float_size)d};
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_F_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL);
PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
if(*ALPHA != 1.0)
alt_numpy_double_scalar_matrix_product_in_place(*ALPHA, op_A_times_op_B);
alt_numpy_scalar_matrix_product_in_place_%(float_type)s(*ALPHA, op_A_times_op_B);
if(*BETA != 1.0)
alt_numpy_double_scalar_matrix_product_in_place(*BETA, (PyArrayObject*)matrix_C);
alt_numpy_double_matrix_sum(op_A_times_op_B, (PyArrayObject*)matrix_C, (PyArrayObject*)matrix_C);
alt_numpy_scalar_matrix_product_in_place_%(float_type)s(*BETA, (PyArrayObject*)matrix_C);
alt_numpy_matrix_sum_in_place_%(float_type)s(op_A_times_op_B, (PyArrayObject*)matrix_C);
/*C is already F-contiguous, thus no conversion needed for output.*/
Py_XDECREF(op_A_times_op_B);
Py_XDECREF(matrix_C);
}
if(op_B != matrix_B) Py_XDECREF(op_B);
if(op_A != matrix_A) Py_XDECREF(op_A);
Py_XDECREF(matrix_C);
Py_XDECREF(matrix_B);
Py_XDECREF(matrix_A);
}
/** C Implementation of sgemm_ based on NumPy
* Used instead of blas when Theano config flag blas.ldflags is empty.
* PS: Comments are the same for equivalent functions in alt_dgemm.c.
**/
/* Print a fatal error message and abort the process.
 * Writes to stderr rather than stdout (the original used puts) so the
 * diagnostic is not lost when standard output is redirected, and so it
 * matches the behaviour of the equivalent helper in the common gemm
 * code. "%s" guards against '%' characters in the message being
 * interpreted as printf conversions. */
void alt_fatal_error(const char* message) {
    if(message != NULL) fprintf(stderr, "%s\n", message);
    exit(-1);
}
/* Multiply every element of `matrix` (float32) by `scalar`, in place.
 * A NumPy iterator is used so that any memory layout (strided,
 * non-contiguous) is handled correctly. Aborts via alt_fatal_error if
 * the iterator cannot be created. */
void alt_numpy_scalar_matrix_product_in_place(float scalar, PyArrayObject* matrix) {
    // Get an iterator on matrix.
    NpyIter* iterator = NpyIter_New(matrix,
        NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
        NPY_KEEPORDER, NPY_NO_CASTING, NULL);
    if(iterator == NULL)
        alt_fatal_error("Unable to iterate over a matrix for a scalar * matrix operation.");
    NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL);
    char** data_ptr = NpyIter_GetDataPtrArray(iterator);
    npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator);
    npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator);
    do {
        char* data = *data_ptr;
        npy_intp stride = *stride_ptr;
        npy_intp count = *innersize_ptr;
        while(count) {
            /* Update in place. The original code computed a temporary and
             * memcpy'd it back, but it already dereferenced `data` as a
             * float* to read it, so a direct read-modify-write is
             * equivalent and simpler. */
            *((float*)data) *= scalar;
            data += stride;
            --count;
        }
    } while(get_next(iterator));
    NpyIter_Deallocate(iterator);
}
/* Element-wise sum of two float32 matrices: out = A + B.
 * Iterates A, B and out in lockstep with a NumPy multi-iterator so that
 * arbitrary (but compatible) memory layouts are handled.
 * NOTE(review): the caller sgemm_ passes matrix_C as both B and out, so
 * the write aliases one of the read operands; the per-element
 * read-then-write order below makes that safe, but confirm before
 * reordering. Aborts via alt_fatal_error if the iterator cannot be
 * created. */
void alt_numpy_matrix_sum(PyArrayObject* A, PyArrayObject* B, PyArrayObject* out) {
    /* NB: It may be better to check if A, B and out have the same
     * dimensions. But for now, we made this assumption. */
    PyArrayObject* op[3] = {A, B, out};
    npy_uint32 op_flags[3] = {NPY_ITER_READONLY, NPY_ITER_READONLY, NPY_ITER_WRITEONLY};
    npy_uint32 flags = NPY_ITER_EXTERNAL_LOOP;
    NpyIter* iterators = NpyIter_MultiNew(3, op, flags, NPY_KEEPORDER, NPY_NO_CASTING, op_flags, NULL);
    if(iterators == NULL)
        alt_fatal_error("Unable to iterate over some matrices for matrix + matrix operation.");
    NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
    /* NOTE(review): only the first operand's inner stride is read and then
     * used for all three operands — this assumes A, B and out share the
     * same inner-loop stride; confirm this holds for all call sites. */
    npy_intp innerstride = NpyIter_GetInnerStrideArray(iterators)[0];
    npy_intp *innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterators);
    char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
    do {
        char* from_A = data_ptr_array[0];
        char* from_B = data_ptr_array[1];
        char* from_out = data_ptr_array[2];
        npy_intp size = *innersize_ptr;
        /* Read both inputs fully before writing the output slot, so the
         * loop stays correct when out aliases A or B. */
        for(npy_intp i = 0; i < size; ++i, from_A += innerstride, from_B += innerstride, from_out += innerstride) {
            float sum = *((float*)from_A);
            sum += *((float*)from_B);
            memcpy(from_out, &sum, sizeof(float));
        }
    } while(get_next(iterators));
    NpyIter_Deallocate(iterators);
}
/* Select the gemm operand for one input: the matrix unchanged when
 * trans is 'N'/'n', otherwise its transpose as computed by NumPy.
 * NOTE(review): the transpose branch presumably yields a new reference
 * whereas the plain branch does not; callers DECREF only when the
 * result differs from the input — confirm. */
inline PyObject* alt_op(char* trans, PyArrayObject* matrix) {
    const char t = *trans;
    if(t == 'N' || t == 'n')
        return (PyObject*)matrix;
    return PyArray_Transpose(matrix, NULL);
}
/* sgemm
 * Operation performed (BLAS sgemm semantics):
 *     C = ALPHA * op(TRANSA, A) * op(TRANSB, B) + BETA * C
 * where op(t, X) is X or X^T depending on t.
 * This is a pure NumPy-C-API fallback used when config.blas.ldflags is
 * empty, i.e. when no real BLAS library is linked.
 * NB: We assume that none of the 13 pointers passed as arguments is null.
 * NB: This function could be optimized further (for example, when
 *     ALPHA == 0 the product need not be computed at all).
 * NOTE(review): the new reference returned by PyArray_MatrixProduct2 in
 * the BETA == 0 branch is discarded without a DECREF — looks like a
 * per-call leak; confirm against the NumPy C API docs.
 * */
void sgemm_(char* TRANSA, char* TRANSB,
            const int* M, const int* N, const int* K,
            const float* ALPHA, float* A, const int* LDA,
            float* B, const int* LDB, const float* BETA,
            float* C, const int* LDC) {
    /* Negative sizes are invalid; silently return as the original does
     * (a real BLAS would call XERBLA here). */
    if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
        return;
    /* Recall:
     * op(A) is a m by k matrix.
     * op(B) is a k by n matrix.
     * C is a m by n matrix.
     */
    int nrowa, ncola, nrowb, ncolb;
    /* Physical dimensions of the stored A and B depend on whether they
     * will be transposed by alt_op below. */
    if(*TRANSA == 'N' || *TRANSA == 'n') {
        nrowa = *M;
        ncola = *K;
    } else {
        nrowa = *K;
        ncola = *M;
    }
    if(*TRANSB == 'N' || *TRANSB == 'n') {
        nrowb = *K;
        ncolb = *N;
    } else {
        nrowb = *N;
        ncolb = *K;
    }
    npy_intp dims_A[2] = {nrowa, ncola};
    npy_intp dims_B[2] = {nrowb, ncolb};
    npy_intp dims_C[2] = {*M, *N};
    /* NB: It seems that A, B and C are always row-major matrices, thus
     * the stride for the 1st dimension (from a row to the next row) only
     * depends on the number of elements in a row (that is, the size of
     * the 2nd dimension: ncola for A, ncolb for B, and *N for C), and
     * the stride for the 2nd dimension (from a column to the next
     * column) is always the size of one element (4 bytes for a float32
     * matrix, 8 bytes for a float64 matrix). Then LDA, LDB and LDC seem
     * totally useless in the stride calculations. For LDA/LDB/LDC to be
     * taken into account, we would need column-major matrices, which
     * seems never to happen.
     * NOTE(review): standard Fortran BLAS callers pass column-major data
     * with significant LD* values — confirm every caller of this
     * fallback really uses row-major, tightly-packed matrices. */
    npy_intp strides_A[2] = {ncola*4, 4};
    npy_intp strides_B[2] = {ncolb*4, 4};
    npy_intp strides_C[2] = {(*N)*4, 4};
    /* NB: in fact, we could replace the strides with NULL as argument in
     * the 3 following lines (NULL means C-contiguous for PyArray_New). */
    PyObject* matrix_A = PyArray_New(&PyArray_Type, 2, dims_A, NPY_FLOAT32, strides_A, A, 0, 0, NULL);
    PyObject* matrix_B = PyArray_New(&PyArray_Type, 2, dims_B, NPY_FLOAT32, strides_B, B, 0, 0, NULL);
    PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, NPY_FLOAT32, strides_C, C, 0, NPY_ARRAY_WRITEABLE, NULL);
    /* op_A/op_B are either the wrappers themselves or transposed views. */
    PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A);
    PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B);
    if(*BETA == 0) {
        /* C is never read, only written: compute the product directly
         * into the C buffer, then scale in place if needed. */
        PyArray_MatrixProduct2(op_A, op_B, (PyArrayObject*)matrix_C);
        if(*ALPHA != 1.0)
            alt_numpy_scalar_matrix_product_in_place(*ALPHA, (PyArrayObject*)matrix_C);
    } else {
        /* C is read: compute ALPHA*op(A)*op(B) into a temporary, scale C
         * by BETA in place, then accumulate the temporary into C. */
        PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
        if(*ALPHA != 1.0)
            alt_numpy_scalar_matrix_product_in_place(*ALPHA, op_A_times_op_B);
        if(*BETA != 1.0)
            alt_numpy_scalar_matrix_product_in_place(*BETA, (PyArrayObject*)matrix_C);
        /* matrix_C is passed as both input and output; see the aliasing
         * note on alt_numpy_matrix_sum. */
        alt_numpy_matrix_sum(op_A_times_op_B, (PyArrayObject*)matrix_C, (PyArrayObject*)matrix_C);
        Py_XDECREF(op_A_times_op_B);
    }
    /* Release the transposed views only when alt_op actually created one. */
    if(op_B != matrix_B) Py_XDECREF(op_B);
    if(op_A != matrix_A) Py_XDECREF(op_A);
    Py_XDECREF(matrix_C);
    Py_XDECREF(matrix_B);
    Py_XDECREF(matrix_A);
}
......@@ -734,21 +734,25 @@ def blas_header_text():
gemm_code = ""
const = "const"
if not config.blas.ldflags:
# Include the Numpy version implementation of sgemm_ and dgemm_ from alt_sgemm.c and alt_dgemm.c
# Include the Numpy version implementation of [sd]gemm_.
current_filedir = dirname(__file__)
sgemm_filepath = normpath(current_filedir + "/alt_sgemm.c")
dgemm_filepath = normpath(current_filedir + "/alt_dgemm.c")
gemm_common_filepath = normpath(current_filedir + "/alt_gemm_common.c")
gemm_template_filepath = normpath(current_filedir + "/alt_gemm_template.c")
common_code = ""
sgemm_code = ""
dgemm_code = ""
with open(sgemm_filepath) as code:
sgemm_code = code.read()
with open(dgemm_filepath) as code:
dgemm_code = code.read()
if not sgemm_code or not dgemm_code:
with open(gemm_common_filepath) as code:
common_code = code.read()
with open(gemm_template_filepath) as code:
template_code = code.read()
sgemm_code = template_code % {"float_type":"float", "float_size":4, "npy_float":"NPY_FLOAT32", "name":"sgemm_"}
dgemm_code = template_code % {"float_type":"double", "float_size":8, "npy_float":"NPY_FLOAT64", "name":"dgemm_"}
if not common_code or not sgemm_code:
raise IOError("Unable to load NumPy implementation of gemm code from C source files.")
else:
const = ""
# _logger.info("Numpy implementation of gemm code loaded (config.blas.ldflags is empty)")
gemm_code += common_code
gemm_code += sgemm_code
gemm_code += dgemm_code
......
......@@ -63,7 +63,8 @@ class BaseCorr3dMM(gof.OpenMPOp):
self.filter_dilation = tuple(filter_dilation)
if not theano.config.blas.ldflags:
raise NotImplementedError("C code for corrMM* classes need a blas library.")
# raise NotImplementedError("C code for corrMM* classes need a blas library.")
self.blas_type = ''
else:
if 'openblas' in theano.config.blas.ldflags:
self.blas_type = 'openblas'
......
......@@ -96,7 +96,9 @@ def local_abstractconv_gemm(node):
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
# If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "": # or not theano.config.blas.ldflags:
return
if not isinstance(node.op, AbstractConv3d):
return None
......@@ -143,7 +145,7 @@ def local_abstractconv_gradweight_gemm(node):
@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweight_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
if theano.config.cxx == "": # or not theano.config.blas.ldflags:
return
if not isinstance(node.op, AbstractConv3d_gradWeights):
return None
......@@ -191,7 +193,7 @@ def local_abstractconv_gradinputs_gemm(node):
@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
if theano.config.cxx == "": # or not theano.config.blas.ldflags:
return
if not isinstance(node.op, AbstractConv3d_gradInputs):
return None
......
......@@ -443,8 +443,8 @@ class TestCorrConv2d(BaseTestConv2d):
class TestAbstractConvNoOptim(BaseTestConv2d):
@classmethod
def setup_class(cls):
if theano.config.blas.ldflags == "":
raise SkipTest()
# if theano.config.blas.ldflags == "":
# raise SkipTest()
BaseTestConv2d.setup_class()
cls.inputs_shapes = [(8, 1, 6, 6)]
cls.filters_shapes = [(5, 1, 2, 2)]
......@@ -517,8 +517,8 @@ class TestCpuConv2d(BaseTestConv2d):
gradinput_OK = False
if fwd_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv2d")
# if not theano.config.blas.ldflags:
# raise SkipTest("Need blas to test conv2d")
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
mode=mode, provide_shape=provide_shape,
......@@ -540,8 +540,8 @@ class TestCpuConv2d(BaseTestConv2d):
filter_dilation=fd)
if gradweight_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv2d")
# if not theano.config.blas.ldflags:
# raise SkipTest("Need blas to test conv2d")
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
......@@ -566,8 +566,8 @@ class TestCpuConv2d(BaseTestConv2d):
filter_dilation=fd)
if gradinput_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv2d")
# if not theano.config.blas.ldflags:
# raise SkipTest("Need blas to test conv2d")
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
......@@ -595,8 +595,8 @@ class TestCpuConv2d(BaseTestConv2d):
class BaseTestConv3d(BaseTestConv):
@classmethod
def setup_class(cls):
if theano.config.blas.ldflags == '':
raise SkipTest("BLAS required for reference")
# if theano.config.blas.ldflags == '':
# raise SkipTest("BLAS required for reference")
cls.inputs_shapes = [(2, 1, 5, 5, 5), (1, 2, 7, 5, 6)]
cls.filters_shapes = [(2, 1, 2, 2, 2), (1, 2, 2, 1, 3)]
cls.subsamples = [(1, 1, 1), (2, 2, 2), (1, 2, 3)]
......@@ -644,14 +644,14 @@ class BaseTestConv3d(BaseTestConv):
class TestCorrConv3d(BaseTestConv3d):
@classmethod
def setup_class(cls):
if theano.config.blas.ldflags == "":
raise SkipTest()
# if theano.config.blas.ldflags == "":
# raise SkipTest()
BaseTestConv3d.setup_class()
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
o = self.get_output_shape(i, f, s, b, fd)
if (not theano.config.blas.ldflags or
not theano.config.cxx or
# if (not theano.config.blas.ldflags or
if (not theano.config.cxx or
theano.config.mode == "FAST_COMPILE"):
raise SkipTest("Need blas to test conv3d")
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
......@@ -698,8 +698,8 @@ class TestCpuConv3d(BaseTestConv3d):
gradinput_OK = False
if fwd_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv3d")
# if not theano.config.blas.ldflags:
# raise SkipTest("Need blas to test conv3d")
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
mode=mode, provide_shape=provide_shape,
......@@ -721,8 +721,8 @@ class TestCpuConv3d(BaseTestConv3d):
filter_dilation=fd)
if gradweight_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv3d")
# if not theano.config.blas.ldflags:
# raise SkipTest("Need blas to test conv3d")
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
......@@ -747,8 +747,8 @@ class TestCpuConv3d(BaseTestConv3d):
filter_dilation=fd)
if gradinput_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv3d")
# if not theano.config.blas.ldflags:
# raise SkipTest("Need blas to test conv3d")
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
......@@ -918,7 +918,7 @@ class TestBilinearUpsampling(unittest.TestCase):
if theano.config.mode == "FAST_COMPILE":
compile_mode = compile_mode.excluding("conv_gemm")
compile_mode = compile_mode.excluding('AbstractConvCheck')
elif not theano.config.blas.ldflags or not theano.config.cxx:
elif not theano.config.cxx: # not theano.config.blas.ldflags or
compile_mode = compile_mode.excluding('AbstractConvCheck')
def numerical_kernel_1D(self, ratio):
......
......@@ -27,8 +27,8 @@ class TestCorr2D(utt.InferShapeTester):
self.filters.name = 'default_filters'
if not conv.imported_scipy_signal and theano.config.cxx == "":
raise SkipTest("CorrMM tests need SciPy or a c++ compiler")
if not theano.config.blas.ldflags:
raise SkipTest("CorrMM tests need a BLAS")
# if not theano.config.blas.ldflags:
# raise SkipTest("CorrMM tests need a BLAS")
def validate(self, image_shape, filter_shape,
border_mode='valid', subsample=(1, 1),
......@@ -131,7 +131,8 @@ class TestCorr2D(utt.InferShapeTester):
icol:icol + dil_fil_shape2d[1]:filter_dilation[1]] * filter2d[::-1, ::-1]
).sum()
utt.assert_allclose(theano_output, ref_output)
#utt.assert_allclose(theano_output, ref_output)
utt.assert_allclose(ref_output, theano_output)
# TEST GRADIENT
if verify_grad:
......
......@@ -27,8 +27,8 @@ class TestCorr3D(utt.InferShapeTester):
self.filters.name = 'default_filters'
if not conv.imported_scipy_signal and theano.config.cxx == "":
raise SkipTest("Corr3dMM tests need SciPy or a c++ compiler")
if not theano.config.blas.ldflags:
raise SkipTest("Corr3dMM tests need a BLAS")
# if not theano.config.blas.ldflags:
# raise SkipTest("Corr3dMM tests need a BLAS")
def validate(self, image_shape, filter_shape,
border_mode='valid', subsample=(1, 1, 1),
......
......@@ -31,7 +31,7 @@ def skip_if_blas_ldflags_empty(*functions_detected):
functions_string = ""
if functions_detected:
functions_string = " (at least " + (", ".join(functions_detected)) + ")"
raise SkipTest("This test is useful only when Theano can access to BLAS functions" + functions_string + ".")
raise SkipTest("This test is useful only when Theano can access to BLAS functions" + functions_string + " other than [sd]gemm_.")
class TestCGer(TestCase, TestOptimizationMixin):
......@@ -83,13 +83,13 @@ class TestCGer(TestCase, TestOptimizationMixin):
self.assertTrue(hash(CGer(False)) != hash(CGer(True)))
def test_optimization_pipeline(self):
skip_if_blas_ldflags_empty('dger_')
skip_if_blas_ldflags_empty()
f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
self.assertFunctionContains(f, CGer(destructive=True))
f(self.xval, self.yval) # DebugMode tests correctness
def test_optimization_pipeline_float(self):
skip_if_blas_ldflags_empty('sger_')
skip_if_blas_ldflags_empty()
self.setUp('float32')
f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
self.assertFunctionContains(f, CGer(destructive=True))
......@@ -102,14 +102,14 @@ class TestCGer(TestCase, TestOptimizationMixin):
self.assertFunctionContains0(f, CGer(destructive=False))
def test_A_plus_outer(self):
skip_if_blas_ldflags_empty('sger_', 'dger_')
skip_if_blas_ldflags_empty()
f = self.function([self.A, self.x, self.y],
self.A + tensor.outer(self.x, self.y))
self.assertFunctionContains(f, CGer(destructive=False))
self.run_f(f) # DebugMode tests correctness
def test_A_plus_scaled_outer(self):
skip_if_blas_ldflags_empty('sger_', 'dger_')
skip_if_blas_ldflags_empty()
f = self.function([self.A, self.x, self.y],
self.A + 0.1 * tensor.outer(self.x, self.y))
self.assertFunctionContains(f, CGer(destructive=False))
......@@ -155,7 +155,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert not numpy.isnan(zval).any()
def test_optimizations_vm(self):
skip_if_blas_ldflags_empty('sdot_')
skip_if_blas_ldflags_empty()
''' Test vector dot matrix '''
f = theano.function([self.x, self.A],
theano.dot(self.x, self.A),
......@@ -177,7 +177,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
numpy.dot(self.xval, self.Aval[::-1, ::-1]))
def test_optimizations_mv(self):
skip_if_blas_ldflags_empty('sdot_')
skip_if_blas_ldflags_empty()
''' Test matrix dot vector '''
f = theano.function([self.A, self.y],
theano.dot(self.A, self.y),
......@@ -248,7 +248,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
def test_gemv1(self):
skip_if_blas_ldflags_empty('sdot_')
skip_if_blas_ldflags_empty()
self.t_gemv1((3, 2))
self.t_gemv1((1, 2))
self.t_gemv1((0, 2))
......@@ -283,7 +283,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
def test_multiple_inplace(self):
skip_if_blas_ldflags_empty('sdot_')
skip_if_blas_ldflags_empty()
x = tensor.dmatrix('x')
y = tensor.dvector('y')
z = tensor.dvector('z')
......@@ -307,7 +307,7 @@ class TestCGemvFloat32(TestCase, BaseGemv, TestOptimizationMixin):
gemv_inplace = CGemv(inplace=True)
def setUp(self):
skip_if_blas_ldflags_empty('sdot_')
skip_if_blas_ldflags_empty()
class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
......@@ -317,12 +317,8 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
gemv_inplace = CGemv(inplace=True)
def setUp(self):
skip_if_blas_ldflags_empty('sdot_')
skip_if_blas_ldflags_empty()
class TestBlasStridesC(TestBlasStrides):
mode = mode_blas_opt
def test_ger_strides(self):
skip_if_blas_ldflags_empty('dger_')
super(TestBlasStridesC, self).test_ger_strides()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论