提交 da835094 authored 作者: notoraptor's avatar notoraptor

New update.

Gemm implementation rewritten so that new memory is allocated when we cannot reuse C. C cannot be reused either when BETA != 0, or when BETA == 0 but C is not contiguous (LDC != M). All tests passed, plus a special test suggested by lamblin.
上级 4185ef0c
/** %(name)s **/
void alt_numpy_scale_matrix_inplace_%(float_type)s(%(float_type)s scalar, PyArrayObject* matrix) {
/* Scalar*Matrix function.
* Computes: matrix = scalar*matrix. */
void alt_numpy_scale_matrix_inplace_%(float_type)s(const %(float_type)s* scalar, PyArrayObject* matrix) {
NpyIter* iterator = NpyIter_New(matrix,
NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
NPY_KEEPORDER, NPY_NO_CASTING, NULL);
......@@ -15,17 +18,15 @@ void alt_numpy_scale_matrix_inplace_%(float_type)s(%(float_type)s scalar, PyArra
npy_intp stride = *stride_ptr;
npy_intp count = *innersize_ptr;
while(count) {
*((%(float_type)s*)data) *= scalar;
*((%(float_type)s*)data) *= *scalar;
data += stride;
--count;
}
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
}
/*Matrix+Matrix function.
* Computes (scalar1 * matrix1) + (scalar2 * matrix2)
* and puts the result into matrix2:
* matrix2 = (scalar1 * matrix1) + (scalar2 * matrix2) */
/* Matrix+Matrix function.
* Computes: matrix2 = (scalar1 * matrix1) + (scalar2 * matrix2) */
void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
const %(float_type)s* scalar1, PyArrayObject* matrix1,
const %(float_type)s* scalar2, PyArrayObject* matrix2
......@@ -47,33 +48,41 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
} while(get_next(iterators));
NpyIter_Deallocate(iterators);
}
/* NumPy Wrapping function. Wraps a data pointer into a NumPy PyArrayObject
 * without copying.
 * By default, data is considered as a Fortran-style array (column by column).
 * If to_transpose, data will be considered as a C-style array (row by row)
 * with dimensions inversed (i.e. the view is the transpose of the matrix). */
PyObject* alt_op_without_copy_%(float_type)s(int to_transpose, %(float_type)s* M, int nrow, int ncol, int LDM) {
    /* M is a nrow*ncol F-contiguous matrix with LDM as the leading dimension
     * (stride, in elements, between consecutive columns).
     * The returned array borrows M's memory (no copy, no ownership): the
     * caller must keep M alive for the lifetime of the returned array.
     * NOTE: the diff residue kept two conflicting return statements (one
     * passing contiguity flags, one passing 0); the post-change version
     * passes 0 and lets NumPy derive contiguity from dims/strides, which
     * made the local `flags` variable dead code — it is removed here. */
    npy_intp dims[2];
    npy_intp strides[2];
    if(to_transpose) {
        /* View M as its transpose: an ncol*nrow row-major array. */
        dims[0] = ncol;
        dims[1] = nrow;
        strides[0] = LDM * %(float_size)d;
        strides[1] = %(float_size)d;
    } else {
        dims[0] = nrow;
        dims[1] = ncol;
        strides[0] = %(float_size)d;
        strides[1] = LDM * %(float_size)d;
    }
    /* flags == 0: read-only view; NumPy computes the contiguity flags
     * itself from dims/strides, so none need to be forced here. */
    return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, M, 0, 0, NULL);
}
/* Special wrapping case used for matrix C in gemm implementation. */
/* Wraps a raw %(float_type)s buffer as a writeable 2-D NumPy view in
 * Fortran (column-major) order, with *LD as the leading dimension
 * (stride, in elements, between consecutive columns).
 * The returned array borrows `matrix` (no copy, no ownership): the
 * caller must keep the buffer alive while the array is in use. */
inline PyObject* alt_wrap_fortran_writeable_matrix_%(float_type)s(
%(float_type)s* matrix, const int* nrow, const int* ncol, const int* LD
) {
/* Column-major strides: consecutive rows are adjacent in memory,
 * consecutive columns are (*LD) elements apart. */
npy_intp dims[2] = {*nrow, *ncol};
npy_intp strides[2] = {%(float_size)d, (*LD) * %(float_size)d};
/* NPY_ARRAY_WRITEABLE is required because gemm writes its result into
 * this view (it is the destination of a PyArray_CopyInto / in-place sum). */
return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, matrix, 0, NPY_ARRAY_WRITEABLE, NULL);
}
/* %(name)s template code */
void %(name)s(
char* TRANSA, char* TRANSB,
const int* M, const int* N, const int* K,
char* TRANSA, char* TRANSB, const int* M, const int* N, const int* K,
const %(float_type)s* ALPHA, %(float_type)s* A, const int* LDA,
%(float_type)s* B, const int* LDB, const %(float_type)s* BETA,
%(float_type)s* C, const int* LDC) {
%(float_type)s* C, const int* LDC
) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
alt_fatal_error("The integer arguments passed to %(name)s must all be at least 0.");
/* If M or N is null, there is nothing to do with C,
......@@ -102,15 +111,19 @@ void %(name)s(
npy_intp* computation_strides;
npy_intp computation_dims[2] = {*N, *M};
npy_intp default_computation_strides[2] = {(*LDC) * %(float_size)d, %(float_size)d};
if(*BETA == 0) {
/*C is never read, just written, so it will be directly used
* to store the result of ALPHA*op(A)*op(B). */
computation_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_WRITEABLE;
if(*BETA == 0 && *LDC == *M) {
/* BETA == 0, so C is never read.
* LDC == M, so C is contiguous in memory
* (that condition is needed for the dot operation, see below).
* Then we can compute ALPHA*op(A)*op(B) directly in C. */
computation_flags = NPY_ARRAY_WRITEABLE;
computation_pointer = C;
computation_strides = default_computation_strides;
} else {
/* C will be read, so the must allocate a new memory buffer
* to compute op(A)*op(B). */
/* Either BETA != 0 (C will be read)
* or LDC != M (C is not read but is not contiguous in memory).
* Then in both cases, we need to allocate a new memory
* to compute ALPHA*op(A)*op(B). */
computation_flags = 0;
computation_pointer = NULL;
computation_strides = NULL;
......@@ -133,14 +146,20 @@ void %(name)s(
PyArray_MatrixProduct2(opB_transposed, opA_transposed, (PyArrayObject*)opB_trans_dot_opA_trans);
if(*BETA == 0) {
if(*ALPHA != 1.0)
alt_numpy_scale_matrix_inplace_%(float_type)s(*ALPHA, (PyArrayObject*)opB_trans_dot_opA_trans);
alt_numpy_scale_matrix_inplace_%(float_type)s(ALPHA, (PyArrayObject*)opB_trans_dot_opA_trans);
if(*LDC != *M) {
/* A buffer has been created to compute ALPHA*op(A)*op(B),
* so we must copy it to the real output, that is C. */
PyObject* matrix_C = alt_wrap_fortran_writeable_matrix_%(float_type)s(C, M, N, LDC);
PyObject* alpha_opA_dot_opB = alt_op_without_copy_%(float_type)s(0, (%(float_type)s*)PyArray_DATA((PyArrayObject*)opB_trans_dot_opA_trans), *M, *N, *M);
if(0 != PyArray_CopyInto((PyArrayObject*)matrix_C, (PyArrayObject*)alpha_opA_dot_opB))
alt_fatal_error("Numpy %(name)s implementation: unable to copy ALPHA*op(A)*op(B) to C when BETA == 0.");
Py_XDECREF(alpha_opA_dot_opB);
Py_XDECREF(matrix_C);
}
} else {
/* C is read, so we must consider it as F-contiguous. */
npy_intp dims_C[2] = {*M, *N};
npy_intp strides_C[2] = {%(float_size)d, (*LDC) * %(float_size)d};
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_F_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL);
/* Hack: if we use alt_op_without_copy with the right parameters,
* it will returns the transposed version of opB_trans_dot_opA_trans. */
/* C is read, so we must consider it as Fortran-style matrix. */
PyObject* matrix_C = alt_wrap_fortran_writeable_matrix_%(float_type)s(C, M, N, LDC);
PyObject* opA_dot_opB = alt_op_without_copy_%(float_type)s(0, (%(float_type)s*)PyArray_DATA((PyArrayObject*)opB_trans_dot_opA_trans), *M, *N, *M);
alt_numpy_matrix_extended_sum_inplace_%(float_type)s(ALPHA, (PyArrayObject*)opA_dot_opB, BETA, (PyArrayObject*)matrix_C);
Py_XDECREF(opA_dot_opB);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论