提交 4185ef0c authored 作者: notoraptor's avatar notoraptor

New update.

Code-comment removed from test_corr. Code factorized in alt_gemm_template.c. All tests passed.
上级 bb25e997
......@@ -22,14 +22,15 @@ void alt_numpy_scale_matrix_inplace_%(float_type)s(%(float_type)s scalar, PyArra
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
}
/*Matrix+Matrix function. Compute (coeffA * matrixA) + (coeffB * matrixB)
* Remark: This function actually sums a C-contiguous matrix (alpha*op(A)*op(B)) with a F-contiguous matrix (beta*C)
* (see gemm implementation at next function for more details) */
/*Matrix+Matrix function.
* Computes (scalar1 * matrix1) + (scalar2 * matrix2)
* and puts the result into matrix2:
* matrix2 = (scalar1 * matrix1) + (scalar2 * matrix2) */
void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
const %(float_type)s* ALPHA, PyArrayObject* A,
const %(float_type)s* BETA, PyArrayObject* B
const %(float_type)s* scalar1, PyArrayObject* matrix1,
const %(float_type)s* scalar2, PyArrayObject* matrix2
) {
PyArrayObject* op[2] = {A, B};
PyArrayObject* op[2] = {matrix1, matrix2};
npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE};
npy_uint32 flags = 0;
NpyIter* iterators = NpyIter_MultiNew(
......@@ -40,9 +41,9 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
do {
%(float_type)s* from_A = (%(float_type)s*)data_ptr_array[0];
%(float_type)s* from_B = (%(float_type)s*)data_ptr_array[1];
*from_B = (*ALPHA)*(*from_A) + (*BETA)*(*from_B);
%(float_type)s* from_matrix1 = (%(float_type)s*)data_ptr_array[0];
%(float_type)s* from_matrix2 = (%(float_type)s*)data_ptr_array[1];
*from_matrix2 = (*scalar1)*(*from_matrix1) + (*scalar2)*(*from_matrix2);
} while(get_next(iterators));
NpyIter_Deallocate(iterators);
}
......@@ -75,61 +76,77 @@ void %(name)s(
%(float_type)s* C, const int* LDC) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
alt_fatal_error("The integer arguments passed to %(name)s must all be at least 0.");
/* NB: it seems that matrix+matrix and scalar*matrix functions
* defined above do not allocate iterator for a matrix with 0
* content, that is a matrix whose nrow*ncol == 0. As these
* functions actually work with M*N matrices (op(A)*op(B) and/or C),
* I think that we could just return if M or N is null. */
/* If M or N is null, there is nothing to do with C,
* as C should contain M*N == 0 items. */
if(*M == 0 || *N == 0)
return;
int nrowa, ncola, nrowb, ncolb;
int is_A_transposable = alt_trans_to_bool(TRANSA);
int is_B_transposable = alt_trans_to_bool(TRANSB);
if(is_A_transposable) {
nrowa = *K; ncola = *M;
int to_transpose_A = alt_trans_to_bool(TRANSA);
int to_transpose_B = alt_trans_to_bool(TRANSB);
if(to_transpose_A) {
nrowa = *K;
ncola = *M;
} else {
nrowa = *M; ncola = *K;
nrowa = *M;
ncola = *K;
}
if(is_B_transposable) {
nrowb = *N; ncolb = *K;
if(to_transpose_B) {
nrowb = *N;
ncolb = *K;
} else {
nrowb = *K; ncolb = *N;
nrowb = *K;
ncolb = *N;
}
int computation_flags;
void* computation_pointer;
npy_intp* computation_strides;
npy_intp computation_dims[2] = {*N, *M};
npy_intp default_computation_strides[2] = {(*LDC) * %(float_size)d, %(float_size)d};
if(*BETA == 0) {
/*C is never read, just written, so it will be directly used
* to store the result of ALPHA*op(A)*op(B). */
computation_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_WRITEABLE;
computation_pointer = C;
computation_strides = default_computation_strides;
} else {
        /* C will be read, so we must allocate a new memory buffer
         * to compute op(A)*op(B). */
computation_flags = 0;
computation_pointer = NULL;
computation_strides = NULL;
}
/* The memory buffer used to compute op(A)*op(B) (either C or
* new allocated buffer) will be considered as C-contiguous because
* the 3rd parameter of PyArray_MatrixProduct2 (used below)
* expects a C-contiguous array.
* Also, to avoid some memory copy, transposition conditions
* for A and B will be reversed, so that the buffer will contain
* C-contiguous opB_transposed * opA_transposed (N*M matrix).
* After that, the code that uses the buffer (either the code calling
* this function, or this function if BETA != 0) just has to
* consider the buffer as a F-contiguous M*N matrix, so that
* it will get the transposed of op_B_transposed * op_A_transposed,
* that is op_A * op_B (M*N matrix) as expected. */
PyObject* opA_transposed = alt_op_without_copy_%(float_type)s(!to_transpose_A, A, nrowa, ncola, *LDA);
PyObject* opB_transposed = alt_op_without_copy_%(float_type)s(!to_transpose_B, B, nrowb, ncolb, *LDB);
PyObject* opB_trans_dot_opA_trans = PyArray_New(&PyArray_Type, 2, computation_dims, %(npy_float)s, computation_strides, computation_pointer, 0, computation_flags, NULL);
PyArray_MatrixProduct2(opB_transposed, opA_transposed, (PyArrayObject*)opB_trans_dot_opA_trans);
if(*BETA == 0) {
/*C is never read, just written.
* matrix_C will be created as C-contiguous because
* the 3rd parameter of PyArray_MatrixProduct2 (used below)
* expects a C-contiguous array.
* Also, to avoid some memory copy, transposition conditions
* for A and B will be reversed, so that C will contain
* C-contiguous op_B_transposed * op_A_transposed (N*M matrix).
* As the calling code will consider C as F-contiguous M*N matrix,
* it will get the transposed of op_B_transposed * op_A_transposed,
* that is op_A * op_B (M*N matrix) as expected. */
npy_intp dims_C[2] = {*N, *M};
npy_intp strides_C[2] = {(*M) * %(float_size)d, %(float_size)d};
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL);
PyObject* op_A_transposed = alt_op_without_copy_%(float_type)s(!is_A_transposable, A, nrowa, ncola, *LDA);
PyObject* op_B_transposed = alt_op_without_copy_%(float_type)s(!is_B_transposable, B, nrowb, ncolb, *LDB);
PyArray_MatrixProduct2(op_B_transposed, op_A_transposed, (PyArrayObject*)matrix_C);
if(*ALPHA != 1.0)
alt_numpy_scale_matrix_inplace_%(float_type)s(*ALPHA, (PyArrayObject*)matrix_C);
Py_XDECREF(op_B_transposed);
Py_XDECREF(op_A_transposed);
Py_XDECREF(matrix_C);
alt_numpy_scale_matrix_inplace_%(float_type)s(*ALPHA, (PyArrayObject*)opB_trans_dot_opA_trans);
} else {
/*C is read, so we must consider it as F-contiguous, as we do for A and B.*/
/* C is read, so we must consider it as F-contiguous. */
npy_intp dims_C[2] = {*M, *N};
npy_intp strides_C[2] = {%(float_size)d, (*LDC) * %(float_size)d};
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_F_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL);
PyObject* op_A = alt_op_without_copy_%(float_type)s(is_A_transposable, A, nrowa, ncola, *LDA);
PyObject* op_B = alt_op_without_copy_%(float_type)s(is_B_transposable, B, nrowb, ncolb, *LDB);
PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
alt_numpy_matrix_extended_sum_inplace_%(float_type)s(ALPHA, op_A_times_op_B, BETA, (PyArrayObject*)matrix_C);
/*C is already F-contiguous, thus no conversion needed for output.*/
Py_XDECREF(op_A_times_op_B);
Py_XDECREF(op_B);
Py_XDECREF(op_A);
        /* Hack: calling alt_op_without_copy with the right parameters
         * will return the transposed version of opB_trans_dot_opA_trans. */
PyObject* opA_dot_opB = alt_op_without_copy_%(float_type)s(0, (%(float_type)s*)PyArray_DATA((PyArrayObject*)opB_trans_dot_opA_trans), *M, *N, *M);
alt_numpy_matrix_extended_sum_inplace_%(float_type)s(ALPHA, (PyArrayObject*)opA_dot_opB, BETA, (PyArrayObject*)matrix_C);
Py_XDECREF(opA_dot_opB);
Py_XDECREF(matrix_C);
}
Py_XDECREF(opB_trans_dot_opA_trans);
Py_XDECREF(opB_transposed);
Py_XDECREF(opA_transposed);
}
......@@ -130,7 +130,6 @@ class TestCorr2D(utt.InferShapeTester):
icol:icol + dil_fil_shape2d[1]:filter_dilation[1]] * filter2d[::-1, ::-1]
).sum()
# utt.assert_allclose(theano_output, ref_output)
utt.assert_allclose(ref_output, theano_output)
# TEST GRADIENT
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论