提交 4185ef0c authored 作者: notoraptor's avatar notoraptor

New update.

Code-comment removed from test_corr. Code factorized in alt_gemm_template.c. All tests passed.
上级 bb25e997
......@@ -22,14 +22,15 @@ void alt_numpy_scale_matrix_inplace_%(float_type)s(%(float_type)s scalar, PyArra
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
}
/*Matrix+Matrix function. Compute (coeffA * matrixA) + (coeffB * matrixB)
* Remark: This function actually sums a C-contiguous matrix (alpha*op(A)*op(B)) with a F-contiguous matrix (beta*C)
* (see gemm implementation at next function for more details) */
/*Matrix+Matrix function.
* Computes (scalar1 * matrix1) + (scalar2 * matrix2)
* and puts the result into matrix2:
* matrix2 = (scalar1 * matrix1) + (scalar2 * matrix2) */
void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
const %(float_type)s* ALPHA, PyArrayObject* A,
const %(float_type)s* BETA, PyArrayObject* B
const %(float_type)s* scalar1, PyArrayObject* matrix1,
const %(float_type)s* scalar2, PyArrayObject* matrix2
) {
PyArrayObject* op[2] = {A, B};
PyArrayObject* op[2] = {matrix1, matrix2};
npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE};
npy_uint32 flags = 0;
NpyIter* iterators = NpyIter_MultiNew(
......@@ -40,9 +41,9 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
do {
%(float_type)s* from_A = (%(float_type)s*)data_ptr_array[0];
%(float_type)s* from_B = (%(float_type)s*)data_ptr_array[1];
*from_B = (*ALPHA)*(*from_A) + (*BETA)*(*from_B);
%(float_type)s* from_matrix1 = (%(float_type)s*)data_ptr_array[0];
%(float_type)s* from_matrix2 = (%(float_type)s*)data_ptr_array[1];
*from_matrix2 = (*scalar1)*(*from_matrix1) + (*scalar2)*(*from_matrix2);
} while(get_next(iterators));
NpyIter_Deallocate(iterators);
}
......@@ -75,61 +76,77 @@ void %(name)s(
%(float_type)s* C, const int* LDC) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
alt_fatal_error("The integer arguments passed to %(name)s must all be at least 0.");
/* NB: it seems that matrix+matrix and scalar*matrix functions
* defined above do not allocate iterator for a matrix with 0
* content, that is a matrix whose nrow*ncol == 0. As these
* functions actually work with M*N matrices (op(A)*op(B) and/or C),
* I think that we could just return if M or N is null. */
/* If M or N is null, there is nothing to do with C,
* as C should contain M*N == 0 items. */
if(*M == 0 || *N == 0)
return;
int nrowa, ncola, nrowb, ncolb;
int is_A_transposable = alt_trans_to_bool(TRANSA);
int is_B_transposable = alt_trans_to_bool(TRANSB);
if(is_A_transposable) {
nrowa = *K; ncola = *M;
int to_transpose_A = alt_trans_to_bool(TRANSA);
int to_transpose_B = alt_trans_to_bool(TRANSB);
if(to_transpose_A) {
nrowa = *K;
ncola = *M;
} else {
nrowa = *M; ncola = *K;
nrowa = *M;
ncola = *K;
}
if(is_B_transposable) {
nrowb = *N; ncolb = *K;
if(to_transpose_B) {
nrowb = *N;
ncolb = *K;
} else {
nrowb = *K; ncolb = *N;
nrowb = *K;
ncolb = *N;
}
int computation_flags;
void* computation_pointer;
npy_intp* computation_strides;
npy_intp computation_dims[2] = {*N, *M};
npy_intp default_computation_strides[2] = {(*LDC) * %(float_size)d, %(float_size)d};
if(*BETA == 0) {
/*C is never read, just written, so it will be directly used
* to store the result of ALPHA*op(A)*op(B). */
computation_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_WRITEABLE;
computation_pointer = C;
computation_strides = default_computation_strides;
} else {
        /* C will be read, so we must allocate a new memory buffer
         * to compute op(A)*op(B). */
computation_flags = 0;
computation_pointer = NULL;
computation_strides = NULL;
}
/* The memory buffer used to compute op(A)*op(B) (either C or
* new allocated buffer) will be considered as C-contiguous because
* the 3rd parameter of PyArray_MatrixProduct2 (used below)
* expects a C-contiguous array.
* Also, to avoid some memory copy, transposition conditions
* for A and B will be reversed, so that the buffer will contain
* C-contiguous opB_transposed * opA_transposed (N*M matrix).
* After that, the code that uses the buffer (either the code calling
* this function, or this function if BETA != 0) just has to
* consider the buffer as a F-contiguous M*N matrix, so that
* it will get the transposed of op_B_transposed * op_A_transposed,
* that is op_A * op_B (M*N matrix) as expected. */
PyObject* opA_transposed = alt_op_without_copy_%(float_type)s(!to_transpose_A, A, nrowa, ncola, *LDA);
PyObject* opB_transposed = alt_op_without_copy_%(float_type)s(!to_transpose_B, B, nrowb, ncolb, *LDB);
PyObject* opB_trans_dot_opA_trans = PyArray_New(&PyArray_Type, 2, computation_dims, %(npy_float)s, computation_strides, computation_pointer, 0, computation_flags, NULL);
PyArray_MatrixProduct2(opB_transposed, opA_transposed, (PyArrayObject*)opB_trans_dot_opA_trans);
if(*BETA == 0) {
/*C is never read, just written.
* matrix_C will be created as C-contiguous because
* the 3rd parameter of PyArray_MatrixProduct2 (used below)
* expects a C-contiguous array.
* Also, to avoid some memory copy, transposition conditions
* for A and B will be reversed, so that C will contain
* C-contiguous op_B_transposed * op_A_transposed (N*M matrix).
* As the calling code will consider C as F-contiguous M*N matrix,
* it will get the transposed of op_B_transposed * op_A_transposed,
* that is op_A * op_B (M*N matrix) as expected. */
npy_intp dims_C[2] = {*N, *M};
npy_intp strides_C[2] = {(*M) * %(float_size)d, %(float_size)d};
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL);
PyObject* op_A_transposed = alt_op_without_copy_%(float_type)s(!is_A_transposable, A, nrowa, ncola, *LDA);
PyObject* op_B_transposed = alt_op_without_copy_%(float_type)s(!is_B_transposable, B, nrowb, ncolb, *LDB);
PyArray_MatrixProduct2(op_B_transposed, op_A_transposed, (PyArrayObject*)matrix_C);
if(*ALPHA != 1.0)
alt_numpy_scale_matrix_inplace_%(float_type)s(*ALPHA, (PyArrayObject*)matrix_C);
Py_XDECREF(op_B_transposed);
Py_XDECREF(op_A_transposed);
Py_XDECREF(matrix_C);
alt_numpy_scale_matrix_inplace_%(float_type)s(*ALPHA, (PyArrayObject*)opB_trans_dot_opA_trans);
} else {
/*C is read, so we must consider it as F-contiguous, as we do for A and B.*/
/* C is read, so we must consider it as F-contiguous. */
npy_intp dims_C[2] = {*M, *N};
npy_intp strides_C[2] = {%(float_size)d, (*LDC) * %(float_size)d};
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_F_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL);
PyObject* op_A = alt_op_without_copy_%(float_type)s(is_A_transposable, A, nrowa, ncola, *LDA);
PyObject* op_B = alt_op_without_copy_%(float_type)s(is_B_transposable, B, nrowb, ncolb, *LDB);
PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
alt_numpy_matrix_extended_sum_inplace_%(float_type)s(ALPHA, op_A_times_op_B, BETA, (PyArrayObject*)matrix_C);
/*C is already F-contiguous, thus no conversion needed for output.*/
Py_XDECREF(op_A_times_op_B);
Py_XDECREF(op_B);
Py_XDECREF(op_A);
        /* Hack: calling alt_op_without_copy with the right parameters
         * will return the transposed version of opB_trans_dot_opA_trans. */
PyObject* opA_dot_opB = alt_op_without_copy_%(float_type)s(0, (%(float_type)s*)PyArray_DATA((PyArrayObject*)opB_trans_dot_opA_trans), *M, *N, *M);
alt_numpy_matrix_extended_sum_inplace_%(float_type)s(ALPHA, (PyArrayObject*)opA_dot_opB, BETA, (PyArrayObject*)matrix_C);
Py_XDECREF(opA_dot_opB);
Py_XDECREF(matrix_C);
}
Py_XDECREF(opB_trans_dot_opA_trans);
Py_XDECREF(opB_transposed);
Py_XDECREF(opA_transposed);
}
......@@ -130,7 +130,6 @@ class TestCorr2D(utt.InferShapeTester):
icol:icol + dil_fil_shape2d[1]:filter_dilation[1]] * filter2d[::-1, ::-1]
).sum()
# utt.assert_allclose(theano_output, ref_output)
utt.assert_allclose(ref_output, theano_output)
# TEST GRADIENT
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论