提交 856b98d3 authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #5076 from notoraptor/master

Issue 5008 fixed
/** C Implementation of [sd]gemm_ based on NumPy
* Used instead of blas when Theano config flag blas.ldflags is empty.
* This file contains the common code for [sd]gemm_.
* File alt_gemm_template.c contains template code for [sd]gemm_. **/
/* Print an optional message to stderr and abort the process.
 * The message is printed through a "%s" format: passing a non-literal
 * string directly as the format argument is a format-string bug — a
 * message containing '%' would be misinterpreted (CERT FIO30-C).
 * do { ... } while(0) makes the macro safe inside an unbraced if/else. */
#define alt_fatal_error(message) do { if(message != NULL) fprintf(stderr, "%s", message); exit(-1); } while(0)
/* BLAS convention: any TRANS character other than 'N'/'n' requests a transpose. */
#define alt_trans_to_bool(trans) (*trans != 'N' && *trans != 'n')
/**Template code for [sd]gemm_ follows in file alt_gemm_template.c
* (as Python string to be used with old formatting).
* PARAMETERS:
* float_type: "float" for sgemm_, "double" for dgemm_.
* float_size: 4 for float32 (sgemm_), 8 for float64 (dgemm_).
* npy_float: "NPY_FLOAT32" for sgemm_, "NPY_FLOAT64" for dgemm_.
* name: "sgemm_" for sgemm_, "dgemm_" for dgemm_.
* See blas_headers.py for current use.**/
/** %(name)s **/
/* Scalar*Matrix function.
* Computes: matrix = scalar*matrix. */
void alt_numpy_scale_matrix_inplace_%(float_type)s(const %(float_type)s* scalar, PyArrayObject* matrix) {
/* In-place scaling: matrix *= *scalar, element by element.
 * Uses the NumPy nditer C-API with NPY_ITER_EXTERNAL_LOOP, so each
 * iteration of the do/while yields one contiguous strided run that the
 * inner while loop walks manually.
 * NOTE(review): NPY_ITER_REFS_OK looks unnecessary for a plain float
 * array — confirm whether it can be dropped. */
NpyIter* iterator = NpyIter_New(matrix,
NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
NPY_KEEPORDER, NPY_NO_CASTING, NULL);
if(iterator == NULL)
alt_fatal_error("Unable to iterate over a matrix "
"for a scalar * matrix operation.");
/* NOTE(review): NpyIter_GetIterNext can return NULL on error per the
 * NumPy C-API docs; the result is not checked here — confirm this is
 * acceptable for the flags used above. */
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL);
char** data_ptr = NpyIter_GetDataPtrArray(iterator);
npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator);
npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator);
do {
char* data = *data_ptr; /* start of the current inner run */
npy_intp stride = *stride_ptr; /* byte stride between elements of the run */
npy_intp count = *innersize_ptr; /* number of elements in the run */
while(count) {
*((%(float_type)s*)data) *= *scalar;
data += stride;
--count;
}
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
}
/* Matrix+Matrix function.
* Computes: matrix2 = (scalar1 * matrix1) + (scalar2 * matrix2) */
void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
const %(float_type)s* scalar1, PyArrayObject* matrix1,
const %(float_type)s* scalar2, PyArrayObject* matrix2
) {
/* In-place AXPBY-like update:
 * matrix2 = (*scalar1) * matrix1 + (*scalar2) * matrix2.
 * A multi-iterator walks both arrays in lockstep; no
 * NPY_ITER_EXTERNAL_LOOP is requested, so every iteration of the
 * do/while below visits exactly one element of each operand. */
PyArrayObject* op[2] = {matrix1, matrix2};
npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE};
npy_uint32 flags = 0;
NpyIter* iterators = NpyIter_MultiNew(
2, op, flags, NPY_CORDER, NPY_NO_CASTING, op_flags, NULL);
if(iterators == NULL)
alt_fatal_error("Unable to iterate over some matrices "
"for matrix + matrix operation.");
/* NOTE(review): as elsewhere in this file, the NpyIter_GetIterNext
 * result is not NULL-checked — confirm this is acceptable. */
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
do {
%(float_type)s* from_matrix1 = (%(float_type)s*)data_ptr_array[0];
%(float_type)s* from_matrix2 = (%(float_type)s*)data_ptr_array[1];
*from_matrix2 = (*scalar1)*(*from_matrix1) + (*scalar2)*(*from_matrix2);
} while(get_next(iterators));
NpyIter_Deallocate(iterators);
}
/* NumPy Wrapping function. Wraps a data into a NumPy's PyArrayObject.
* By default, data is considered as Fortran-style array (column by column).
* If to_transpose, data will be considered as C-style array (row by row)
* with dimensions reversed. */
PyObject* alt_op_%(float_type)s(int to_transpose, %(float_type)s* M, int nrow, int ncol, int LDM) {
/* Wrap the raw buffer M into a 2-D NumPy array without copying.
 * Default view: Fortran-ordered nrow x ncol matrix with leading
 * dimension LDM. When to_transpose is requested, the dimensions are
 * swapped and the strides describe a C-ordered (row-major) view. */
npy_intp dims[2];
npy_intp strides[2];
dims[0] = to_transpose ? ncol : nrow;
dims[1] = to_transpose ? nrow : ncol;
strides[0] = to_transpose ? LDM * %(float_size)d : %(float_size)d;
strides[1] = to_transpose ? %(float_size)d : LDM * %(float_size)d;
return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, M, 0, 0, NULL);
}
/* Special wrapping case used for matrix C in gemm implementation. */
/* Wraps matrix C (Fortran-ordered, leading dimension *LD) into a
 * writeable 2-D NumPy array of shape (*nrow, *ncol) without copying.
 * Declared `static inline` rather than plain `inline`: in C99, a plain
 * `inline` function with no external definition can produce an
 * "undefined reference" link error when the compiler decides not to
 * inline a call. Internal linkage is safe because this helper is only
 * used inside this generated file. */
static inline PyObject* alt_wrap_fortran_writeable_matrix_%(float_type)s(
%(float_type)s* matrix, const int* nrow, const int* ncol, const int* LD
) {
npy_intp dims[2] = {*nrow, *ncol};
/* Fortran layout: unit stride down a column, *LD elements between columns. */
npy_intp strides[2] = {%(float_size)d, (*LD) * %(float_size)d};
return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, matrix, 0, NPY_ARRAY_WRITEABLE, NULL);
}
/* %(name)s template code */
/* NumPy-backed implementation of BLAS [sd]gemm_:
 * C = ALPHA * op(A) * op(B) + BETA * C, where op(X) is X or X^T
 * depending on TRANSA/TRANSB. A, B and C follow the BLAS Fortran
 * convention: column-major storage with leading dimensions
 * LDA/LDB/LDC, and all scalar arguments passed by pointer. */
void %(name)s(
char* TRANSA, char* TRANSB, const int* M, const int* N, const int* K,
const %(float_type)s* ALPHA, %(float_type)s* A, const int* LDA,
%(float_type)s* B, const int* LDB, const %(float_type)s* BETA,
%(float_type)s* C, const int* LDC
) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
alt_fatal_error("The integer arguments passed to %(name)s must all be at least 0.");
/* If M or N is zero, there is nothing to do with C,
 * as C should contain M*N == 0 items. */
if(*M == 0 || *N == 0)
return;
/* Stored (pre-op) dimensions of A and B: transposition swaps them. */
int nrowa, ncola, nrowb, ncolb;
int to_transpose_A = alt_trans_to_bool(TRANSA);
int to_transpose_B = alt_trans_to_bool(TRANSB);
if(to_transpose_A) {
nrowa = *K;
ncola = *M;
} else {
nrowa = *M;
ncola = *K;
}
if(to_transpose_B) {
nrowb = *N;
ncolb = *K;
} else {
nrowb = *K;
ncolb = *N;
}
/* Destination of the intermediate product ALPHA*op(A)*op(B):
 * either C itself (fast path) or a freshly allocated buffer. */
int computation_flags;
void* computation_pointer;
npy_intp* computation_strides;
npy_intp computation_dims[2] = {*N, *M};
npy_intp default_computation_strides[2] = {(*LDC) * %(float_size)d, %(float_size)d};
if(*BETA == 0 && *LDC == *M) {
/* BETA == 0, so C is never read.
 * LDC == M, so C is contiguous in memory
 * (that condition is needed for the dot operation, see below).
 * Then we can compute ALPHA*op(A)*op(B) directly in C. */
computation_flags = NPY_ARRAY_WRITEABLE;
computation_pointer = C;
computation_strides = default_computation_strides;
} else {
/* Either BETA != 0 (C will be read)
 * or LDC != M (C is not read but is not contiguous in memory).
 * Then in both cases, we need to allocate a new memory buffer
 * to compute ALPHA*op(A)*op(B). */
computation_flags = 0;
computation_pointer = NULL;
computation_strides = NULL;
}
/* The memory buffer used to compute op(A)*op(B) (either C or
 * new allocated buffer) will be considered as C-contiguous because
 * the 3rd parameter of PyArray_MatrixProduct2 (used below)
 * expects a C-contiguous array.
 * Also, to avoid some memory copy, transposition conditions
 * for A and B will be reversed, so that the buffer will contain
 * C-contiguous opB_transposed * opA_transposed (N*M matrix).
 * After that, the code that uses the buffer (either the code calling
 * this function, or this function if BETA != 0) just has to
 * consider the buffer as a F-contiguous M*N matrix, so that
 * it will get the transposed of op_B_transposed * op_A_transposed,
 * that is op_A * op_B (M*N matrix) as expected. */
PyObject* opA_transposed = alt_op_%(float_type)s(!to_transpose_A, A, nrowa, ncola, *LDA);
PyObject* opB_transposed = alt_op_%(float_type)s(!to_transpose_B, B, nrowb, ncolb, *LDB);
PyObject* opB_trans_dot_opA_trans = PyArray_New(&PyArray_Type, 2, computation_dims, %(npy_float)s, computation_strides, computation_pointer, 0, computation_flags, NULL);
/* NOTE(review): the wrapper/PyArray_New results above are not checked
 * for NULL, and PyArray_MatrixProduct2 returns a new reference (to its
 * `out` argument) that is discarded here — this looks like a reference
 * leak; confirm against the NumPy C-API documentation. */
PyArray_MatrixProduct2(opB_transposed, opA_transposed, (PyArrayObject*)opB_trans_dot_opA_trans);
if(*BETA == 0) {
/* C = ALPHA * op(A) * op(B): scale the product in place if needed. */
if(*ALPHA != 1.0)
alt_numpy_scale_matrix_inplace_%(float_type)s(ALPHA, (PyArrayObject*)opB_trans_dot_opA_trans);
if(*LDC != *M) {
/* A buffer has been created to compute ALPHA*op(A)*op(B),
 * so we must copy it to the real output, that is C. */
PyObject* matrix_C = alt_wrap_fortran_writeable_matrix_%(float_type)s(C, M, N, LDC);
PyObject* alpha_opA_dot_opB = PyArray_Transpose((PyArrayObject*)opB_trans_dot_opA_trans, NULL);
if(0 != PyArray_CopyInto((PyArrayObject*)matrix_C, (PyArrayObject*)alpha_opA_dot_opB))
alt_fatal_error("NumPy %(name)s implementation: unable to copy ALPHA*op(A)*op(B) into C when BETA == 0.");
Py_XDECREF(alpha_opA_dot_opB);
Py_XDECREF(matrix_C);
}
} else {
/* C is read, so we must consider it as Fortran-style matrix. */
PyObject* matrix_C = alt_wrap_fortran_writeable_matrix_%(float_type)s(C, M, N, LDC);
PyObject* opA_dot_opB = PyArray_Transpose((PyArrayObject*)opB_trans_dot_opA_trans, NULL);
alt_numpy_matrix_extended_sum_inplace_%(float_type)s(ALPHA, (PyArrayObject*)opA_dot_opB, BETA, (PyArrayObject*)matrix_C);
Py_XDECREF(opA_dot_opB);
Py_XDECREF(matrix_C);
}
Py_XDECREF(opB_trans_dot_opA_trans);
Py_XDECREF(opB_transposed);
Py_XDECREF(opA_transposed);
}
......@@ -1037,10 +1037,6 @@ class Gemm(GemmRelated):
if node.inputs[0].type.dtype.startswith('complex'):
raise utils.MethodNotDefined('%s.c_code'
% self.__class__.__name__)
if not config.blas.ldflags:
return super(Gemm, self).c_code(node, name,
(_z, _a, _x, _y, _b), (_zout, ),
sub)
full_code = self.build_gemm_call() % dict(locals(), **sub)
return full_code
......@@ -2154,10 +2150,6 @@ class BatchedDot(Op):
_z, = out
fail = sub["fail"]
if not config.blas.ldflags:
return super(BatchedDot, self).c_code(node, name,
inp, out, sub)
# generate contiguity condition
def contiguous(var, ndim):
strides = "PyArray_STRIDES(%s)" % var
......
......@@ -9,6 +9,7 @@ import logging
import textwrap
import sys
import os
from os.path import dirname, normpath
from theano import config
from theano.gof.cmodule import GCC_compiler
......@@ -729,6 +730,31 @@ def cblas_header_text():
def blas_header_text():
"""C header for the fortran blas interface"""
gemm_code = ""
const = "const"
if not config.blas.ldflags:
# Include the Numpy version implementation of [sd]gemm_.
current_filedir = dirname(__file__)
gemm_common_filepath = normpath(current_filedir + "/alt_gemm_common.c")
gemm_template_filepath = normpath(current_filedir + "/alt_gemm_template.c")
common_code = ""
sgemm_code = ""
dgemm_code = ""
with open(gemm_common_filepath) as code:
common_code = code.read()
with open(gemm_template_filepath) as code:
template_code = code.read()
sgemm_code = template_code % {"float_type": "float", "float_size": 4, "npy_float": "NPY_FLOAT32", "name": "sgemm_"}
dgemm_code = template_code % {"float_type": "double", "float_size": 8, "npy_float": "NPY_FLOAT64", "name": "dgemm_"}
if not common_code or not sgemm_code:
raise IOError("Unable to load NumPy implementation of gemm code from C source files.")
else:
const = ""
gemm_code += common_code
gemm_code += sgemm_code
gemm_code += dgemm_code
header = """
extern "C"
{
......@@ -890,7 +916,7 @@ def blas_header_text():
/* Single Precision */
void sgemm_(char*, char*, const int*, const int*, const int*, const float *, const float *, const int*, const float *, const int*, const float *, float *, const int*);
void sgemm_(char*, char*, const int*, const int*, const int*, const float *, %(const)s float *, const int*, %(const)s float *, const int*, const float *, float *, const int*);
void ssymm_(char*, char*, const int*, const int*, const float *, const float *, const int*, const float *, const int*, const float *, float *, const int*);
void ssyrk_(char*, char*, const int*, const int*, const float *, const float *, const int*, const float *, float *, const int*);
void ssyr2k_(char*, char*, const int*, const int*, const float *, const float *, const int*, const float *, const int*, const float *, float *, const int*);
......@@ -899,7 +925,7 @@ def blas_header_text():
/* Double Precision */
void dgemm_(char*, char*, const int*, const int*, const int*, const double *, const double *, const int*, const double *, const int*, const double *, double *, const int*);
void dgemm_(char*, char*, const int*, const int*, const int*, const double *, %(const)s double *, const int*, %(const)s double *, const int*, const double *, double *, const int*);
void dsymm_(char*, char*, const int*, const int*, const double *, const double *, const int*, const double *, const int*, const double *, double *, const int*);
void dsyrk_(char*, char*, const int*, const int*, const double *, const double *, const int*, const double *, double *, const int*);
void dsyr2k_(char*, char*, const int*, const int*, const double *, const double *, const int*, const double *, const int*, const double *, double *, const int*);
......@@ -958,7 +984,7 @@ def blas_header_text():
}
""")
return header
return (header % {'const': const}) + gemm_code
def mkl_threads_text():
......
......@@ -63,7 +63,8 @@ class BaseCorrMM(gof.OpenMPOp):
self.filter_dilation = tuple(filter_dilation)
if not theano.config.blas.ldflags:
raise NotImplementedError("C code for corrMM* classes need a blas library.")
# Theano will use a NumPy C implementation of [sd]gemm_ instead.
self.blas_type = ''
else:
if 'openblas' in theano.config.blas.ldflags:
self.blas_type = 'openblas'
......
......@@ -63,7 +63,8 @@ class BaseCorr3dMM(gof.OpenMPOp):
self.filter_dilation = tuple(filter_dilation)
if not theano.config.blas.ldflags:
raise NotImplementedError("C code for corrMM* classes need a blas library.")
# Theano will use a NumPy C implementation of [sd]gemm_ instead.
self.blas_type = ''
else:
if 'openblas' in theano.config.blas.ldflags:
self.blas_type = 'openblas'
......
......@@ -72,7 +72,9 @@ compile.optdb.register('local_inplace_sparse_block_outer',
# Conv opts
@local_optimizer([AbstractConv2d])
def local_abstractconv_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
# If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return
if not isinstance(node.op, AbstractConv2d):
return None
......@@ -94,7 +96,9 @@ def local_abstractconv_gemm(node):
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
# If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return
if not isinstance(node.op, AbstractConv3d):
return None
......@@ -116,7 +120,9 @@ def local_abstractconv3d_gemm(node):
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweight_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
# If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return
if not isinstance(node.op, AbstractConv2d_gradWeights):
return None
......@@ -141,7 +147,9 @@ def local_abstractconv_gradweight_gemm(node):
@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweight_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
# If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return
if not isinstance(node.op, AbstractConv3d_gradWeights):
return None
......@@ -166,7 +174,9 @@ def local_abstractconv3d_gradweight_gemm(node):
@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
# If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return
if not isinstance(node.op, AbstractConv2d_gradInputs):
return None
......@@ -189,7 +199,9 @@ def local_abstractconv_gradinputs_gemm(node):
@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
# If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return
if not isinstance(node.op, AbstractConv3d_gradInputs):
return None
......@@ -603,6 +615,5 @@ def local_abstractconv_check(node):
node.op.__class__.__name__)
optdb.register('AbstractConvCheck',
opt.in2out(local_abstractconv_check,
name="AbstractConvCheck"),
opt.in2out(local_abstractconv_check, name="AbstractConvCheck"),
48.7, 'fast_compile', 'fast_run')
......@@ -363,8 +363,7 @@ class BaseTestConv(object):
class BaseTestConv2d(BaseTestConv):
@classmethod
def setup_class(cls):
if theano.config.blas.ldflags == '':
raise SkipTest("BLAS required for reference")
# This test can run even when theano.config.blas.ldflags is empty.
cls.inputs_shapes = [(8, 1, 6, 6), (8, 1, 8, 8), (2, 1, 7, 7),
(6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
cls.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
......@@ -414,14 +413,13 @@ class BaseTestConv2d(BaseTestConv):
class TestCorrConv2d(BaseTestConv2d):
@classmethod
def setup_class(cls):
if theano.config.blas.ldflags == "":
raise SkipTest()
# This test can run even when theano.config.blas.ldflags is empty.
BaseTestConv2d.setup_class()
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
o = self.get_output_shape(i, f, s, b, fd)
if (not theano.config.blas.ldflags or
not theano.config.cxx or
# This test can run even when theano.config.blas.ldflags is empty.
if (not theano.config.cxx or
theano.config.mode == "FAST_COMPILE"):
raise SkipTest("Need blas to test conv2d")
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
......@@ -444,8 +442,7 @@ class TestCorrConv2d(BaseTestConv2d):
class TestAbstractConvNoOptim(BaseTestConv2d):
@classmethod
def setup_class(cls):
if theano.config.blas.ldflags == "":
raise SkipTest()
# This test can run even when theano.config.blas.ldflags is empty.
BaseTestConv2d.setup_class()
cls.inputs_shapes = [(8, 1, 6, 6)]
cls.filters_shapes = [(5, 1, 2, 2)]
......@@ -518,8 +515,7 @@ class TestCpuConv2d(BaseTestConv2d):
gradinput_OK = False
if fwd_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv2d")
# This test can run even when theano.config.blas.ldflags is empty.
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
mode=mode, provide_shape=provide_shape,
......@@ -541,8 +537,7 @@ class TestCpuConv2d(BaseTestConv2d):
filter_dilation=fd)
if gradweight_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv2d")
# This test can run even when theano.config.blas.ldflags is empty.
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
......@@ -567,8 +562,7 @@ class TestCpuConv2d(BaseTestConv2d):
filter_dilation=fd)
if gradinput_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv2d")
# This test can run even when theano.config.blas.ldflags is empty.
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
......@@ -596,8 +590,7 @@ class TestCpuConv2d(BaseTestConv2d):
class BaseTestConv3d(BaseTestConv):
@classmethod
def setup_class(cls):
if theano.config.blas.ldflags == '':
raise SkipTest("BLAS required for reference")
# This test can run even when theano.config.blas.ldflags is empty.
cls.inputs_shapes = [(2, 1, 5, 5, 5), (1, 2, 7, 5, 6)]
cls.filters_shapes = [(2, 1, 2, 2, 2), (1, 2, 2, 1, 3)]
cls.subsamples = [(1, 1, 1), (2, 2, 2), (1, 2, 3)]
......@@ -645,14 +638,13 @@ class BaseTestConv3d(BaseTestConv):
class TestCorrConv3d(BaseTestConv3d):
@classmethod
def setup_class(cls):
if theano.config.blas.ldflags == "":
raise SkipTest()
# This test can run even when theano.config.blas.ldflags is empty.
BaseTestConv3d.setup_class()
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
o = self.get_output_shape(i, f, s, b, fd)
if (not theano.config.blas.ldflags or
not theano.config.cxx or
# This test can run even when theano.config.blas.ldflags is empty.
if (not theano.config.cxx or
theano.config.mode == "FAST_COMPILE"):
raise SkipTest("Need blas to test conv3d")
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
......@@ -699,8 +691,7 @@ class TestCpuConv3d(BaseTestConv3d):
gradinput_OK = False
if fwd_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv3d")
# This test can run even when theano.config.blas.ldflags is empty.
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
mode=mode, provide_shape=provide_shape,
......@@ -722,8 +713,7 @@ class TestCpuConv3d(BaseTestConv3d):
filter_dilation=fd)
if gradweight_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv3d")
# This test can run even when theano.config.blas.ldflags is empty.
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
......@@ -748,8 +738,7 @@ class TestCpuConv3d(BaseTestConv3d):
filter_dilation=fd)
if gradinput_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv3d")
# This test can run even when theano.config.blas.ldflags is empty.
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
......@@ -913,13 +902,13 @@ class TestConvTypes(unittest.TestCase):
class TestBilinearUpsampling(unittest.TestCase):
# If BLAS is not available on CPU, then we accept the fallback to the
# slow Python implementation for that test.
# If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
compile_mode = theano.compile.mode.get_default_mode()
if theano.config.mode == "FAST_COMPILE":
compile_mode = compile_mode.excluding("conv_gemm")
compile_mode = compile_mode.excluding('AbstractConvCheck')
elif not theano.config.blas.ldflags or not theano.config.cxx:
elif not theano.config.cxx:
compile_mode = compile_mode.excluding('AbstractConvCheck')
def numerical_kernel_1D(self, ratio):
......
......@@ -27,8 +27,7 @@ class TestCorr2D(utt.InferShapeTester):
self.filters.name = 'default_filters'
if not conv.imported_scipy_signal and theano.config.cxx == "":
raise SkipTest("CorrMM tests need SciPy or a c++ compiler")
if not theano.config.blas.ldflags:
raise SkipTest("CorrMM tests need a BLAS")
# This test can run even when theano.config.blas.ldflags is empty.
def validate(self, image_shape, filter_shape,
border_mode='valid', subsample=(1, 1),
......@@ -131,7 +130,7 @@ class TestCorr2D(utt.InferShapeTester):
icol:icol + dil_fil_shape2d[1]:filter_dilation[1]] * filter2d[::-1, ::-1]
).sum()
utt.assert_allclose(theano_output, ref_output)
utt.assert_allclose(ref_output, theano_output)
# TEST GRADIENT
if verify_grad:
......
......@@ -27,8 +27,7 @@ class TestCorr3D(utt.InferShapeTester):
self.filters.name = 'default_filters'
if not conv.imported_scipy_signal and theano.config.cxx == "":
raise SkipTest("Corr3dMM tests need SciPy or a c++ compiler")
if not theano.config.blas.ldflags:
raise SkipTest("Corr3dMM tests need a BLAS")
# This test can run even when theano.config.blas.ldflags is empty.
def validate(self, image_shape, filter_shape,
border_mode='valid', subsample=(1, 1, 1),
......
......@@ -95,10 +95,11 @@ class t_gemm(TestCase):
cmp_linker(copy(z), a, x, y, b, 'c|py')
cmp_linker(copy(z), a, x, y, b, 'py')
if (config.blas.ldflags and not dtype.startswith("complex")
if (not dtype.startswith("complex")
and theano.config.cxx):
# If blas.ldflags is equal to '', the C code will not
# be generated
# If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
cmp_linker(copy(z), a, x, y, b, 'c')
def test0a(self):
......@@ -2160,6 +2161,24 @@ class TestBlasStrides(TestCase):
self.cmp_ger((1, 0), 1, 0)
self.cmp_ger((0, 0), 0, 0)
def test_gemm_non_contiguous(self):
"""test_gemm_non_contiguous: Test if GEMM works well with non-contiguous matrices."""
aval = numpy.ones((6, 2))
bval = numpy.ones((2, 7))
cval = numpy.arange(7) + numpy.arange(0, .6, .1)[:, numpy.newaxis]
a = theano.shared(aval[:3], borrow=True)
b = theano.shared(bval[:, :5], borrow=True)
c = theano.shared(cval[:3, :5], borrow=True)
s = theano.tensor.scalar()
upd_c = s * c + theano.tensor.dot(a, b)
f = theano.function([s], [], updates={c: upd_c})
f(0)
ref_output = numpy.ones((3, 5)) * 2
unittest_tools.assert_allclose(c.get_value(), ref_output)
class test_infer_shape(unittest_tools.InferShapeTester):
def test_dot22(self):
......
......@@ -26,13 +26,18 @@ mode_blas_opt = theano.compile.get_default_mode().including(
'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas')
def skip_if_blas_ldflags_empty(*functions_detected):
if theano.config.blas.ldflags == "":
functions_string = ""
if functions_detected:
functions_string = " (at least " + (", ".join(functions_detected)) + ")"
raise SkipTest("This test is useful only when Theano can access to BLAS functions" + functions_string + " other than [sd]gemm_.")
class TestCGer(TestCase, TestOptimizationMixin):
def setUp(self, dtype='float64'):
if theano.config.blas.ldflags == "":
raise SkipTest("This test is useful only when Theano"
" is directly linked to blas.")
# This test can run even when theano.config.blas.ldflags is empty.
self.dtype = dtype
self.mode = theano.compile.get_default_mode().including('fast_run')
self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
......@@ -76,11 +81,13 @@ class TestCGer(TestCase, TestOptimizationMixin):
self.assertTrue(hash(CGer(False)) != hash(CGer(True)))
def test_optimization_pipeline(self):
skip_if_blas_ldflags_empty()
f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
self.assertFunctionContains(f, CGer(destructive=True))
f(self.xval, self.yval) # DebugMode tests correctness
def test_optimization_pipeline_float(self):
skip_if_blas_ldflags_empty()
self.setUp('float32')
f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
self.assertFunctionContains(f, CGer(destructive=True))
......@@ -93,12 +100,14 @@ class TestCGer(TestCase, TestOptimizationMixin):
self.assertFunctionContains0(f, CGer(destructive=False))
def test_A_plus_outer(self):
skip_if_blas_ldflags_empty()
f = self.function([self.A, self.x, self.y],
self.A + tensor.outer(self.x, self.y))
self.assertFunctionContains(f, CGer(destructive=False))
self.run_f(f) # DebugMode tests correctness
def test_A_plus_scaled_outer(self):
skip_if_blas_ldflags_empty()
f = self.function([self.A, self.x, self.y],
self.A + 0.1 * tensor.outer(self.x, self.y))
self.assertFunctionContains(f, CGer(destructive=False))
......@@ -113,9 +122,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
"""
def setUp(self, dtype='float64'):
if theano.config.blas.ldflags == "":
raise SkipTest("This test is useful only when Theano"
" is directly linked to blas.")
# This test can run even when theano.config.blas.ldflags is empty.
self.dtype = dtype
self.mode = theano.compile.get_default_mode().including('fast_run')
# matrix
......@@ -144,6 +151,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert not numpy.isnan(zval).any()
def test_optimizations_vm(self):
skip_if_blas_ldflags_empty()
''' Test vector dot matrix '''
f = theano.function([self.x, self.A],
theano.dot(self.x, self.A),
......@@ -165,6 +173,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
numpy.dot(self.xval, self.Aval[::-1, ::-1]))
def test_optimizations_mv(self):
skip_if_blas_ldflags_empty()
''' Test matrix dot vector '''
f = theano.function([self.A, self.y],
theano.dot(self.A, self.y),
......@@ -235,6 +244,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
def test_gemv1(self):
skip_if_blas_ldflags_empty()
self.t_gemv1((3, 2))
self.t_gemv1((1, 2))
self.t_gemv1((0, 2))
......@@ -269,6 +279,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
def test_multiple_inplace(self):
skip_if_blas_ldflags_empty()
x = tensor.dmatrix('x')
y = tensor.dvector('y')
z = tensor.dvector('z')
......@@ -292,9 +303,7 @@ class TestCGemvFloat32(TestCase, BaseGemv, TestOptimizationMixin):
gemv_inplace = CGemv(inplace=True)
def setUp(self):
if theano.config.blas.ldflags == "":
raise SkipTest("This test is useful only when Theano"
" is directly linked to blas.")
skip_if_blas_ldflags_empty()
class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
......@@ -304,9 +313,7 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
gemv_inplace = CGemv(inplace=True)
def setUp(self):
if theano.config.blas.ldflags == "":
raise SkipTest("This test is useful only when Theano"
" is directly linked to blas.")
skip_if_blas_ldflags_empty()
class TestBlasStridesC(TestBlasStrides):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论