New update.

The commented-out code related to empty ldflags has been removed and replaced with explanatory comments. Further changes have been made to alt_gemm_template.c following the latest review recommendations.

New update.
bb25e997 · notoraptor · 70896325 · bb25e997 · bb25e997 · bb25e997
--- a/theano/tensor/alt_gemm_template.c
+++ b/theano/tensor/alt_gemm_template.c
 /** %(name)s **/
-void alt_numpy_scalar_matrix_product_in_place_%(float_type)s(%(float_type)s scalar, PyArrayObject* matrix) {
+void alt_numpy_scale_matrix_inplace_%(float_type)s(%(float_type)s scalar, PyArrayObject* matrix) {
    NpyIter* iterator = NpyIter_New(matrix, 
        NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, 
        NPY_KEEPORDER, NPY_NO_CASTING, NULL);
@@ -25,7 +25,7 @@ void alt_numpy_scalar_matrix_product_in_place_%(float_type)s(%(float_type)s scal
 /*Matrix+Matrix function. Compute (coeffA * matrixA) + (coeffB * matrixB)
 * Remark: This function actually sums a C-contiguous matrix (alpha*op(A)*op(B)) with a F-contiguous matrix (beta*C)
 * (see gemm implementation at next function for more details) */
-void alt_numpy_matrix_extended_sum_in_place_%(float_type)s(
+void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
        const %(float_type)s* ALPHA, PyArrayObject* A,
        const %(float_type)s* BETA, PyArrayObject* B
 ) {
@@ -46,15 +46,15 @@ void alt_numpy_matrix_extended_sum_in_place_%(float_type)s(
    } while(get_next(iterators));
    NpyIter_Deallocate(iterators);
 }
-PyObject* alt_op_without_copy_%(float_type)s(int transposable, %(float_type)s* M, int nrow, int ncol, int LDM) {
+PyObject* alt_op_without_copy_%(float_type)s(int to_transpose, %(float_type)s* M, int nrow, int ncol, int LDM) {
    // By default, M is considered as a nrow*ncol F-contiguous matrix with LDM as stride indicator for the columns.
    npy_intp dims[2];
    npy_intp strides[2];
    int flags;
-    if(transposable) {
+    if(to_transpose) {
        dims[0] = ncol;
        dims[1] = nrow;
-        strides[0] = dims[1] * %(float_size)d;
+        strides[0] = LDM * %(float_size)d;
        strides[1] = %(float_size)d;
        flags = NPY_ARRAY_C_CONTIGUOUS;
    } else {
@@ -73,12 +73,14 @@ void %(name)s(
    const %(float_type)s* ALPHA, %(float_type)s* A, const int* LDA, 
    %(float_type)s* B, const int* LDB, const %(float_type)s* BETA, 
    %(float_type)s* C, const int* LDC) {
+    if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
+        alt_fatal_error("The integer arguments passed to %(name)s must all be at least 0.");
    /* NB: it seems that matrix+matrix and scalar*matrix functions
     * defined above do not allocate iterator for a matrix with 0
     * content, that is a matrix whose nrow*ncol == 0. As these
     * functions actually work with M*N matrices (op(A)*op(B) and/or C),
     * I think that we could just return if M or N is null. */
-    if(*M < 1 || *N < 1 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
+    if(*M == 0 || *N == 0)
        return;
    int nrowa, ncola, nrowb, ncolb;
    int is_A_transposable = alt_trans_to_bool(TRANSA);
@@ -111,7 +113,7 @@ void %(name)s(
        PyObject* op_B_transposed = alt_op_without_copy_%(float_type)s(!is_B_transposable, B, nrowb, ncolb, *LDB);
        PyArray_MatrixProduct2(op_B_transposed, op_A_transposed, (PyArrayObject*)matrix_C);
        if(*ALPHA != 1.0)
-            alt_numpy_scalar_matrix_product_in_place_%(float_type)s(*ALPHA, (PyArrayObject*)matrix_C);
+            alt_numpy_scale_matrix_inplace_%(float_type)s(*ALPHA, (PyArrayObject*)matrix_C);
        Py_XDECREF(op_B_transposed);
        Py_XDECREF(op_A_transposed);
        Py_XDECREF(matrix_C);
@@ -123,7 +125,7 @@ void %(name)s(
        PyObject* op_A = alt_op_without_copy_%(float_type)s(is_A_transposable, A, nrowa, ncola, *LDA);
        PyObject* op_B = alt_op_without_copy_%(float_type)s(is_B_transposable, B, nrowb, ncolb, *LDB);
        PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
-        alt_numpy_matrix_extended_sum_in_place_%(float_type)s(ALPHA, op_A_times_op_B, BETA, (PyArrayObject*)matrix_C);
+        alt_numpy_matrix_extended_sum_inplace_%(float_type)s(ALPHA, op_A_times_op_B, BETA, (PyArrayObject*)matrix_C);
        /*C is already F-contiguous, thus no conversion needed for output.*/
        Py_XDECREF(op_A_times_op_B);
        Py_XDECREF(op_B);

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -1037,7 +1037,6 @@ class Gemm(GemmRelated):
        if node.inputs[0].type.dtype.startswith('complex'):
            raise utils.MethodNotDefined('%s.c_code'
                                         % self.__class__.__name__)
-        # if not config.blas.ldflags: # return super(Gemm, self).c_code(node, name, (_z, _a, _x, _y, _b), (_zout, ), sub)
        full_code = self.build_gemm_call() % dict(locals(), **sub)
        return full_code

@@ -2151,8 +2150,6 @@ class BatchedDot(Op):
        _z, = out
        fail = sub["fail"]

-        # if not config.blas.ldflags: # return super(BatchedDot, self).c_code(node, name, inp, out, sub)
-
        # generate contiguity condition
        def contiguous(var, ndim):
            strides = "PyArray_STRIDES(%s)" % var

--- a/theano/tensor/blas_headers.py
+++ b/theano/tensor/blas_headers.py
@@ -751,7 +751,6 @@ def blas_header_text():
            raise IOError("Unable to load NumPy implementation of gemm code from C source files.")
        else:
            const = ""
-            # _logger.info("Numpy implementation of gemm code loaded (config.blas.ldflags is empty)")
        gemm_code += common_code
        gemm_code += sgemm_code
        gemm_code += dgemm_code

--- a/theano/tensor/nnet/corr3d.py
+++ b/theano/tensor/nnet/corr3d.py
@@ -63,7 +63,7 @@ class BaseCorr3dMM(gof.OpenMPOp):
        self.filter_dilation = tuple(filter_dilation)

        if not theano.config.blas.ldflags:
-            # raise NotImplementedError("C code for corrMM* classes need a blas library.")
+            # Theano will use a NumPy C implementation of [sd]gemm_ instead.
            self.blas_type = ''
        else:
            if 'openblas' in theano.config.blas.ldflags:

--- a/theano/tensor/nnet/opt.py
+++ b/theano/tensor/nnet/opt.py
@@ -98,7 +98,7 @@ def local_abstractconv_gemm(node):
 def local_abstractconv3d_gemm(node):
    # If theano.config.blas.ldflags is empty, Theano will use
    # a NumPy C implementation of [sd]gemm_.
-    if theano.config.cxx == "":  # or not theano.config.blas.ldflags:
+    if theano.config.cxx == "":
        return
    if not isinstance(node.op, AbstractConv3d):
        return None
@@ -120,7 +120,9 @@ def local_abstractconv3d_gemm(node):

 @local_optimizer([AbstractConv2d_gradWeights])
 def local_abstractconv_gradweight_gemm(node):
-    if theano.config.cxx == "":  # or not theano.config.blas.ldflags:
+    # If theano.config.blas.ldflags is empty, Theano will use
+    # a NumPy C implementation of [sd]gemm_.
+    if theano.config.cxx == "":
        return
    if not isinstance(node.op, AbstractConv2d_gradWeights):
        return None
@@ -145,7 +147,9 @@ def local_abstractconv_gradweight_gemm(node):

 @local_optimizer([AbstractConv3d_gradWeights])
 def local_abstractconv3d_gradweight_gemm(node):
-    if theano.config.cxx == "":  # or not theano.config.blas.ldflags:
+    # If theano.config.blas.ldflags is empty, Theano will use
+    # a NumPy C implementation of [sd]gemm_.
+    if theano.config.cxx == "":
        return
    if not isinstance(node.op, AbstractConv3d_gradWeights):
        return None
@@ -170,7 +174,9 @@ def local_abstractconv3d_gradweight_gemm(node):

 @local_optimizer([AbstractConv2d_gradInputs])
 def local_abstractconv_gradinputs_gemm(node):
-    if theano.config.cxx == "":  # or not theano.config.blas.ldflags:
+    # If theano.config.blas.ldflags is empty, Theano will use
+    # a NumPy C implementation of [sd]gemm_.
+    if theano.config.cxx == "":
        return
    if not isinstance(node.op, AbstractConv2d_gradInputs):
        return None
@@ -193,7 +199,9 @@ def local_abstractconv_gradinputs_gemm(node):

 @local_optimizer([AbstractConv3d_gradInputs])
 def local_abstractconv3d_gradinputs_gemm(node):
-    if theano.config.cxx == "":  # or not theano.config.blas.ldflags:
+    # If theano.config.blas.ldflags is empty, Theano will use
+    # a NumPy C implementation of [sd]gemm_.
+    if theano.config.cxx == "":
        return
    if not isinstance(node.op, AbstractConv3d_gradInputs):
        return None

--- a/theano/tensor/nnet/tests/test_abstract_conv.py
+++ b/theano/tensor/nnet/tests/test_abstract_conv.py
@@ -363,8 +363,7 @@ class BaseTestConv(object):
 class BaseTestConv2d(BaseTestConv):
    @classmethod
    def setup_class(cls):
-        # if theano.config.blas.ldflags == '':
-            # raise SkipTest("BLAS required for reference")
+        # These tests can run even when theano.config.blas.ldflags is empty.
        cls.inputs_shapes = [(8, 1, 6, 6), (8, 1, 8, 8), (2, 1, 7, 7),
                             (6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
        cls.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
@@ -414,12 +413,12 @@ class BaseTestConv2d(BaseTestConv):
 class TestCorrConv2d(BaseTestConv2d):
    @classmethod
    def setup_class(cls):
-        # if theano.config.blas.ldflags == "": raise SkipTest()
+        # These tests can run even when theano.config.blas.ldflags is empty.
        BaseTestConv2d.setup_class()

    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        o = self.get_output_shape(i, f, s, b, fd)
-        # if (not theano.config.blas.ldflags or
+        # This test can run even when theano.config.blas.ldflags is empty.
        if (not theano.config.cxx or
                theano.config.mode == "FAST_COMPILE"):
            raise SkipTest("Need blas to test conv2d")
@@ -443,8 +442,7 @@ class TestCorrConv2d(BaseTestConv2d):
 class TestAbstractConvNoOptim(BaseTestConv2d):
    @classmethod
    def setup_class(cls):
-        # if theano.config.blas.ldflags == "":
-            # raise SkipTest()
+        # These tests can run even when theano.config.blas.ldflags is empty.
        BaseTestConv2d.setup_class()
        cls.inputs_shapes = [(8, 1, 6, 6)]
        cls.filters_shapes = [(5, 1, 2, 2)]
@@ -517,8 +515,7 @@ class TestCpuConv2d(BaseTestConv2d):
            gradinput_OK = False

        if fwd_OK:
-            # if not theano.config.blas.ldflags:
-                # raise SkipTest("Need blas to test conv2d")
+            # This test can run even when theano.config.blas.ldflags is empty.
            self.run_fwd(inputs_shape=i, filters_shape=f,
                         subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
                         mode=mode, provide_shape=provide_shape,
@@ -540,8 +537,7 @@ class TestCpuConv2d(BaseTestConv2d):
                          filter_dilation=fd)

        if gradweight_OK:
-            # if not theano.config.blas.ldflags:
-                # raise SkipTest("Need blas to test conv2d")
+            # This test can run even when theano.config.blas.ldflags is empty.
            self.run_gradweight(inputs_shape=i, filters_shape=f,
                                output_shape=o, subsample=s,
                                verify_grad=False, mode=mode,
@@ -566,8 +562,7 @@ class TestCpuConv2d(BaseTestConv2d):
                          filter_dilation=fd)

        if gradinput_OK:
-            # if not theano.config.blas.ldflags:
-                # raise SkipTest("Need blas to test conv2d")
+            # This test can run even when theano.config.blas.ldflags is empty.
            self.run_gradinput(inputs_shape=i, filters_shape=f,
                               output_shape=o, subsample=s,
                               verify_grad=False, mode=mode,
@@ -595,8 +590,7 @@ class TestCpuConv2d(BaseTestConv2d):
 class BaseTestConv3d(BaseTestConv):
    @classmethod
    def setup_class(cls):
-        # if theano.config.blas.ldflags == '':
-            # raise SkipTest("BLAS required for reference")
+        # These tests can run even when theano.config.blas.ldflags is empty.
        cls.inputs_shapes = [(2, 1, 5, 5, 5), (1, 2, 7, 5, 6)]
        cls.filters_shapes = [(2, 1, 2, 2, 2), (1, 2, 2, 1, 3)]
        cls.subsamples = [(1, 1, 1), (2, 2, 2), (1, 2, 3)]
@@ -644,13 +638,12 @@ class BaseTestConv3d(BaseTestConv):
 class TestCorrConv3d(BaseTestConv3d):
    @classmethod
    def setup_class(cls):
-        # if theano.config.blas.ldflags == "":
-            # raise SkipTest()
+        # These tests can run even when theano.config.blas.ldflags is empty.
        BaseTestConv3d.setup_class()

    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
        o = self.get_output_shape(i, f, s, b, fd)
-        # if (not theano.config.blas.ldflags or
+        # This test can run even when theano.config.blas.ldflags is empty.
        if (not theano.config.cxx or
                theano.config.mode == "FAST_COMPILE"):
            raise SkipTest("Need blas to test conv3d")
@@ -698,8 +691,7 @@ class TestCpuConv3d(BaseTestConv3d):
            gradinput_OK = False

        if fwd_OK:
-            # if not theano.config.blas.ldflags:
-                # raise SkipTest("Need blas to test conv3d")
+            # This test can run even when theano.config.blas.ldflags is empty.
            self.run_fwd(inputs_shape=i, filters_shape=f,
                         subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
                         mode=mode, provide_shape=provide_shape,
@@ -721,8 +713,7 @@ class TestCpuConv3d(BaseTestConv3d):
                          filter_dilation=fd)

        if gradweight_OK:
-            # if not theano.config.blas.ldflags:
-                # raise SkipTest("Need blas to test conv3d")
+            # This test can run even when theano.config.blas.ldflags is empty.
            self.run_gradweight(inputs_shape=i, filters_shape=f,
                                output_shape=o, subsample=s,
                                verify_grad=False, mode=mode,
@@ -747,8 +738,7 @@ class TestCpuConv3d(BaseTestConv3d):
                          filter_dilation=fd)

        if gradinput_OK:
-            # if not theano.config.blas.ldflags:
-                # raise SkipTest("Need blas to test conv3d")
+            # This test can run even when theano.config.blas.ldflags is empty.
            self.run_gradinput(inputs_shape=i, filters_shape=f,
                               output_shape=o, subsample=s,
                               verify_grad=False, mode=mode,
@@ -912,13 +902,13 @@ class TestConvTypes(unittest.TestCase):


 class TestBilinearUpsampling(unittest.TestCase):
-    # If BLAS is not available on CPU, then we accept the fallback to the
-    # slow Python implementation for that test.
+    # If theano.config.blas.ldflags is empty, Theano will use
+    # a NumPy C implementation of [sd]gemm_.
    compile_mode = theano.compile.mode.get_default_mode()
    if theano.config.mode == "FAST_COMPILE":
        compile_mode = compile_mode.excluding("conv_gemm")
        compile_mode = compile_mode.excluding('AbstractConvCheck')
-    elif not theano.config.cxx:  # not theano.config.blas.ldflags or
+    elif not theano.config.cxx:
        compile_mode = compile_mode.excluding('AbstractConvCheck')

    def numerical_kernel_1D(self, ratio):

--- a/theano/tensor/nnet/tests/test_corr.py
+++ b/theano/tensor/nnet/tests/test_corr.py
@@ -27,8 +27,7 @@ class TestCorr2D(utt.InferShapeTester):
        self.filters.name = 'default_filters'
        if not conv.imported_scipy_signal and theano.config.cxx == "":
            raise SkipTest("CorrMM tests need SciPy or a c++ compiler")
-        # if not theano.config.blas.ldflags:
-            # raise SkipTest("CorrMM tests need a BLAS")
+        # These tests can run even when theano.config.blas.ldflags is empty.

    def validate(self, image_shape, filter_shape,
                 border_mode='valid', subsample=(1, 1),

--- a/theano/tensor/nnet/tests/test_corr3d.py
+++ b/theano/tensor/nnet/tests/test_corr3d.py
@@ -27,8 +27,7 @@ class TestCorr3D(utt.InferShapeTester):
        self.filters.name = 'default_filters'
        if not conv.imported_scipy_signal and theano.config.cxx == "":
            raise SkipTest("Corr3dMM tests need SciPy or a c++ compiler")
-        # if not theano.config.blas.ldflags:
-            # raise SkipTest("Corr3dMM tests need a BLAS")
+        # These tests can run even when theano.config.blas.ldflags is empty.

    def validate(self, image_shape, filter_shape,
                 border_mode='valid', subsample=(1, 1, 1),

--- a/theano/tensor/tests/test_blas.py
+++ b/theano/tensor/tests/test_blas.py
@@ -95,11 +95,11 @@ class t_gemm(TestCase):

            cmp_linker(copy(z), a, x, y, b, 'c|py')
            cmp_linker(copy(z), a, x, y, b, 'py')
-            # if (config.blas.ldflags and not dtype.startswith("complex")
+
            if (not dtype.startswith("complex")
                and theano.config.cxx):
-                # If blas.ldflags is equal to '', the C code will not
-                # be generated
+                # If theano.config.blas.ldflags is empty, Theano will use
+                # a NumPy C implementation of [sd]gemm_.
                cmp_linker(copy(z), a, x, y, b, 'c')

    def test0a(self):

--- a/theano/tensor/tests/test_blas_c.py
+++ b/theano/tensor/tests/test_blas_c.py
@@ -37,9 +37,7 @@ def skip_if_blas_ldflags_empty(*functions_detected):
 class TestCGer(TestCase, TestOptimizationMixin):

    def setUp(self, dtype='float64'):
-        # if theano.config.blas.ldflags == "":
-            # raise SkipTest("This test is useful only when Theano"
-                        # " is directly linked to blas.")
+        # These tests can run even when theano.config.blas.ldflags is empty.
        self.dtype = dtype
        self.mode = theano.compile.get_default_mode().including('fast_run')
        self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
@@ -124,9 +122,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):

    """
    def setUp(self, dtype='float64'):
-        # if theano.config.blas.ldflags == "":
-            # raise SkipTest("This test is useful only when Theano"
-                        # " is directly linked to blas.")
+        # These tests can run even when theano.config.blas.ldflags is empty.
        self.dtype = dtype
        self.mode = theano.compile.get_default_mode().including('fast_run')
        # matrix