提交 24f96fa8 authored 作者: notoraptor's avatar notoraptor

New update! I have integrated the required changes and re-run the tests. All tests pass as before, and some tests are now faster. The biggest gain on my computer is for theano/tensor/nnet/tests/test_corr3d.py, which goes from 687 seconds before to 259 seconds now. For the other tests, the gain is between 3 and 20 seconds. There is now no copy and no memory allocation (apart from NumPy wrapping structures) when BETA == 0. I rewrote the op(matrix) function so that it no longer returns newly allocated data. Instead, it just creates a PyArrayObject wrapper around the matrix pointer with the right format: F-contiguous (nrow * ncol) by default, or C-contiguous (ncol * nrow) if the matrix needs to be transposed. I also rewrote the matrix sum function so that it takes scalars by which to multiply each passed matrix before the addition. The function now computes B = alpha*A + beta*B, with alpha and beta as the scalars (both set to 1 if we just want B = A + B). Thus, there is now only one iteration over A and B, in which A and B are each read once and B is modified once.
上级 a4cc7dbd
/** C Implementation of [sd]gemm_ based on NumPy /** C Implementation of [sd]gemm_ based on NumPy
* Used instead of blas when Theano config flag blas.ldflags is empty. * Used instead of blas when Theano config flag blas.ldflags is empty.
* This file contains the common code for [sd]gemm_. * This file contains the common code for [sd]gemm_.
* File alt_gemm_template.c contains template code for [sd]gemm_. * File alt_gemm_template.c contains template code for [sd]gemm_. **/
**/
inline void alt_fatal_error(const char* message) { #define alt_fatal_error(message) { if(message != NULL) fprintf(stderr, message); exit(-1); }
if(message != NULL) fprintf(stderr, message);
exit(-1); #define alt_trans_to_bool(trans) (*trans != 'N' && *trans != 'n')
}
/* Returns op(matrix) for a GEMM operand selected by the BLAS trans flag:
 * the matrix itself for 'N'/'n', otherwise a transposed view (no data copy;
 * PyArray_Transpose returns a new view object).
 * NOTE(review): for the 'N' case the input pointer is returned with no new
 * reference taken — callers must only DECREF the result when it differs
 * from the input (the caller below does exactly that). */
inline PyObject* alt_op(char* trans, PyArrayObject* matrix) {
    if(*trans == 'N' || *trans == 'n') {
        return (PyObject*)matrix;
    }
    return PyArray_Transpose(matrix, NULL);
}
/**Template code for [sd]gemm_ follows in file alt_gemm_template.c /**Template code for [sd]gemm_ follows in file alt_gemm_template.c
* (as Python string to be used with old formatting). * (as Python string to be used with old formatting).
* PARAMETERS: * PARAMETERS:
* float_type: "float" for sgemm_, "double" for dgemm_. * float_type: "float" for sgemm_, "double" for dgemm_.
* float_size: 4 for float32 (sgemm_), 8 for float64 (dgemm_). * float_size: 4 for float32 (sgemm_), 8 for float64 (dgemm_).
* npy_float: "NPY_FLOAT32" for sgemm_, "NPY_FLOAT64" for dgemm_. * npy_float: "NPY_FLOAT32" for sgemm_, "NPY_FLOAT64" for dgemm_.
* name: usually "sgemm_" for sgemm_, "dgemm_" for dgemm_. * name: "sgemm_" for sgemm_, "dgemm_" for dgemm_.
* See blas_headers.py for current use.**/ * See blas_headers.py for current use.**/
...@@ -22,10 +22,13 @@ void alt_numpy_scalar_matrix_product_in_place_%(float_type)s(%(float_type)s scal ...@@ -22,10 +22,13 @@ void alt_numpy_scalar_matrix_product_in_place_%(float_type)s(%(float_type)s scal
} while(get_next(iterator)); } while(get_next(iterator));
NpyIter_Deallocate(iterator); NpyIter_Deallocate(iterator);
} }
/*Matrix+Matrix function. /*Matrix+Matrix function. Compute (coeffA * matrixA) + (coeffB * matrixB)
* Remark: This function actually sums a C-contiguous matrix (alpha*op(A)*op(B)) with a F-contiguous matrix (beta*C) * Remark: This function actually sums a C-contiguous matrix (alpha*op(A)*op(B)) with a F-contiguous matrix (beta*C)
* (see gemm implementation at next function for more details) */ * (see gemm implementation at next function for more details) */
void alt_numpy_matrix_sum_in_place_%(float_type)s(PyArrayObject* A, PyArrayObject* B) { void alt_numpy_matrix_extended_sum_in_place_%(float_type)s(
const %(float_type)s* ALPHA, PyArrayObject* A,
const %(float_type)s* BETA, PyArrayObject* B
) {
PyArrayObject* op[2] = {A, B}; PyArrayObject* op[2] = {A, B};
npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE}; npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE};
npy_uint32 flags = 0; npy_uint32 flags = 0;
...@@ -37,12 +40,32 @@ void alt_numpy_matrix_sum_in_place_%(float_type)s(PyArrayObject* A, PyArrayObjec ...@@ -37,12 +40,32 @@ void alt_numpy_matrix_sum_in_place_%(float_type)s(PyArrayObject* A, PyArrayObjec
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL); NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
char** data_ptr_array = NpyIter_GetDataPtrArray(iterators); char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
do { do {
char* from_A = data_ptr_array[0]; %(float_type)s* from_A = (%(float_type)s*)data_ptr_array[0];
char* from_B = data_ptr_array[1]; %(float_type)s* from_B = (%(float_type)s*)data_ptr_array[1];
*((%(float_type)s*)from_B) += *((%(float_type)s*)from_A); *from_B = (*ALPHA)*(*from_A) + (*BETA)*(*from_B);
} while(get_next(iterators)); } while(get_next(iterators));
NpyIter_Deallocate(iterators); NpyIter_Deallocate(iterators);
} }
/* Wraps the raw Fortran-layout matrix M (nrow x ncol, leading dimension LDM)
 * in a PyArrayObject view without copying any data.
 * - transposable == 0: an nrow x ncol view of M as stored
 *   (element stride %(float_size)d within a column, LDM*%(float_size)d between columns).
 * - transposable != 0: the transposed ncol x nrow view, i.e. op(M) for a
 *   'T'/'t' trans flag.
 * The caller owns the returned reference (or NULL on failure). */
PyObject* alt_op_without_copy_%(float_type)s(int transposable, %(float_type)s* M, int nrow, int ncol, int LDM) {
    // By default, M is considered as a nrow*ncol F-contiguous matrix with LDM as stride indicator for the columns.
    npy_intp dims[2];
    npy_intp strides[2];
    int flags;
    if(transposable) {
        dims[0] = ncol;
        dims[1] = nrow;
        /* BUGFIX: the long stride of the transposed view is the leading
         * dimension LDM of the stored matrix, not nrow (dims[1]): BLAS
         * allows LDM > nrow, in which case using nrow would address the
         * wrong elements. The view is only truly C-contiguous when
         * LDM == nrow, so only claim the flag then. */
        strides[0] = LDM * %(float_size)d;
        strides[1] = %(float_size)d;
        flags = (LDM == nrow) ? NPY_ARRAY_C_CONTIGUOUS : 0;
    } else {
        dims[0] = nrow;
        dims[1] = ncol;
        strides[0] = %(float_size)d;
        strides[1] = LDM * %(float_size)d;
        /* Same reasoning: the view is F-contiguous only when LDM == nrow. */
        flags = (LDM == nrow) ? NPY_ARRAY_F_CONTIGUOUS : 0;
    }
    return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, M, 0, flags, NULL);
}
/* %(name)s template code */ /* %(name)s template code */
void %(name)s( void %(name)s(
char* TRANSA, char* TRANSB, char* TRANSA, char* TRANSB,
...@@ -53,56 +76,53 @@ void %(name)s( ...@@ -53,56 +76,53 @@ void %(name)s(
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0) if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
return; return;
int nrowa, ncola, nrowb, ncolb; int nrowa, ncola, nrowb, ncolb;
if(*TRANSA == 'N' || *TRANSA == 'n') { int is_A_transposable = alt_trans_to_bool(TRANSA);
nrowa = *M; ncola = *K; int is_B_transposable = alt_trans_to_bool(TRANSB);
} else { if(is_A_transposable) {
nrowa = *K; ncola = *M; nrowa = *K; ncola = *M;
}
if(*TRANSB == 'N' || *TRANSB == 'n') {
nrowb = *K; ncolb = *N;
} else { } else {
nrowa = *M; ncola = *K;
}
if(is_B_transposable) {
nrowb = *N; ncolb = *K; nrowb = *N; ncolb = *K;
} else {
nrowb = *K; ncolb = *N;
} }
npy_intp dims_A[2] = {nrowa, ncola};
npy_intp dims_B[2] = {nrowb, ncolb};
npy_intp dims_C[2] = {*M, *N};
npy_intp strides_A[2] = {%(float_size)d, (*LDA) * %(float_size)d};
npy_intp strides_B[2] = {%(float_size)d, (*LDB) * %(float_size)d};
PyObject* matrix_A = PyArray_New(&PyArray_Type, 2, dims_A, %(npy_float)s, strides_A, A, 0, NPY_ARRAY_F_CONTIGUOUS, NULL);
PyObject* matrix_B = PyArray_New(&PyArray_Type, 2, dims_B, %(npy_float)s, strides_B, B, 0, NPY_ARRAY_F_CONTIGUOUS, NULL);
PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A);
PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B);
if(*BETA == 0) { if(*BETA == 0) {
/*C is never red, just written.*/ /*C is never read, just written.
npy_intp strides_C[2] = {(*N) * %(float_size)d, %(float_size)d}; * matrix_C will be created as C-contiguous because
/*matrix_C is created as C-contiguous because the 3rd parameter of PyArray_MatrixProduct2 (below) expects a C-contiguous array.*/ * the 3rd parameter of PyArray_MatrixProduct2 (used below)
* expects a C-contiguous array.
* Also, to avoid some memory copy, transposition conditions
* for A and B will be reversed, so that C will contain
* C-contiguous op_B_transposed * op_A_transposed (N*M matrix).
* As the calling code will consider C as F-contiguous M*N matrix,
* it will get the transposed of op_B_transposed * op_A_transposed,
* that is op_A * op_B (M*N matrix) as expected. */
npy_intp dims_C[2] = {*N, *M};
npy_intp strides_C[2] = {(*M) * %(float_size)d, %(float_size)d};
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL); PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL);
PyArray_MatrixProduct2(op_A, op_B, (PyArrayObject*)matrix_C); PyObject* op_A_transposed = alt_op_without_copy_%(float_type)s(!is_A_transposable, A, nrowa, ncola, *LDA);
PyObject* op_B_transposed = alt_op_without_copy_%(float_type)s(!is_B_transposable, B, nrowb, ncolb, *LDB);
PyArray_MatrixProduct2(op_B_transposed, op_A_transposed, (PyArrayObject*)matrix_C);
if(*ALPHA != 1.0) if(*ALPHA != 1.0)
alt_numpy_scalar_matrix_product_in_place_%(float_type)s(*ALPHA, (PyArrayObject*)matrix_C); alt_numpy_scalar_matrix_product_in_place_%(float_type)s(*ALPHA, (PyArrayObject*)matrix_C);
/*But it seems Python|NumPy expects C to be F-contiguous at output, so the convert it.*/ Py_XDECREF(op_B_transposed);
PyObject* matrix_C_as_f_contiguous = PyArray_FromAny(matrix_C, PyArray_DESCR((PyArrayObject*)matrix_C), 2, 2, NPY_ARRAY_F_CONTIGUOUS, NULL); Py_XDECREF(op_A_transposed);
if(matrix_C_as_f_contiguous != matrix_C) {
memcpy(C, PyArray_DATA((PyArrayObject*)matrix_C_as_f_contiguous), (*M)*(*N)*sizeof(%(float_type)s));
Py_XDECREF(matrix_C_as_f_contiguous);
}
Py_XDECREF(matrix_C); Py_XDECREF(matrix_C);
} else { } else {
/*C is read, so we must consider it as F-contiguous, as we do for A and B.*/ /*C is read, so we must consider it as F-contiguous, as we do for A and B.*/
npy_intp dims_C[2] = {*M, *N};
npy_intp strides_C[2] = {%(float_size)d, (*LDC) * %(float_size)d}; npy_intp strides_C[2] = {%(float_size)d, (*LDC) * %(float_size)d};
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_F_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL); PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, %(npy_float)s, strides_C, C, 0, NPY_ARRAY_F_CONTIGUOUS | NPY_ARRAY_WRITEABLE, NULL);
PyObject* op_A = alt_op_without_copy_%(float_type)s(is_A_transposable, A, nrowa, ncola, *LDA);
PyObject* op_B = alt_op_without_copy_%(float_type)s(is_B_transposable, B, nrowb, ncolb, *LDB);
PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B); PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
if(*ALPHA != 1.0) alt_numpy_matrix_extended_sum_in_place_%(float_type)s(ALPHA, op_A_times_op_B, BETA, (PyArrayObject*)matrix_C);
alt_numpy_scalar_matrix_product_in_place_%(float_type)s(*ALPHA, op_A_times_op_B);
if(*BETA != 1.0)
alt_numpy_scalar_matrix_product_in_place_%(float_type)s(*BETA, (PyArrayObject*)matrix_C);
alt_numpy_matrix_sum_in_place_%(float_type)s(op_A_times_op_B, (PyArrayObject*)matrix_C);
/*C is already F-contiguous, thus no conversion needed for output.*/ /*C is already F-contiguous, thus no conversion needed for output.*/
Py_XDECREF(op_A_times_op_B); Py_XDECREF(op_A_times_op_B);
Py_XDECREF(op_B);
Py_XDECREF(op_A);
Py_XDECREF(matrix_C); Py_XDECREF(matrix_C);
} }
if(op_B != matrix_B) Py_XDECREF(op_B);
if(op_A != matrix_A) Py_XDECREF(op_A);
Py_XDECREF(matrix_B);
Py_XDECREF(matrix_A);
} }
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论