Commit 9ca9474b authored by notoraptor

Required corrections and modifications are done.

Recall: the code is tested with: $ theano-cache purge && THEANO_FLAGS=blas.ldflags= nosetests theano/tensor/nnet/tests/test_abstract_conv.py:TestCorrConv2d
NB: 1) dgemm_ is never called in these tests; only sgemm_ is called. 2) All LDA, LDB, LDC values are always in the set {M, N, K} during these tests.
Parent 6b3afd89
/** C implementation of dgemm_ based on NumPy.
 * Used instead of BLAS when the Theano config flag blas.ldflags is empty.
 * PS: For further comments, see the equivalent functions in alt_sgemm.c.
 **/
void alt_double_scalar_matrix_product_in_place(double scalar, double* matrix, int size_to_compute) { void alt_numpy_double_scalar_matrix_product_in_place(double scalar, PyArrayObject* matrix) {
for(int i = 0; i < size_to_compute; ++i) { NpyIter* iterator = NpyIter_New(matrix,
matrix[i] *= scalar; NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
NPY_KEEPORDER, NPY_NO_CASTING, NULL);
if(iterator == NULL)
alt_fatal_error("Unable to iterate over a matrix for a scalar * matrix operation.");
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL);
char** data_ptr = NpyIter_GetDataPtrArray(iterator);
npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator);
npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator);
do {
char* data = *data_ptr;
npy_intp stride = *stride_ptr;
npy_intp count = *innersize_ptr;
while(count) {
double new_value = scalar * (*((double*)data));
memcpy(data, &new_value, sizeof(double));
data += stride;
--count;
} }
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
} }
/* Element-wise `out = A + B` for float64 (NPY_FLOAT64) arrays.
 * NB: assumes A, B and out all have the same dimensions — TODO confirm at call sites.
 * Aborts the process via alt_fatal_error on iterator-creation failure. */
void alt_numpy_double_matrix_sum(PyArrayObject* A, PyArrayObject* B, PyArrayObject* out) {
    PyArrayObject* op[3] = {A, B, out};
    npy_uint32 op_flags[3] = {NPY_ITER_READONLY, NPY_ITER_READONLY, NPY_ITER_WRITEONLY};
    npy_uint32 flags = NPY_ITER_EXTERNAL_LOOP;
    NpyIter* iterators = NpyIter_MultiNew(3, op, flags, NPY_KEEPORDER, NPY_NO_CASTING, op_flags, NULL);
    if(iterators == NULL)
        alt_fatal_error("Unable to iterate over some matrices for matrix + matrix operation.");
    NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
    /* NpyIter_GetIterNext may return NULL with a Python error set. */
    if(get_next == NULL) {
        NpyIter_Deallocate(iterators);
        alt_fatal_error("Unable to get an iteration function for matrix + matrix operation.");
    }
    /* Each operand has its own inner stride: do not assume they are all equal
     * (the previous code used operand 0's stride for all three operands). */
    npy_intp* strides = NpyIter_GetInnerStrideArray(iterators);
    npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterators);
    char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
    do {
        char* from_A = data_ptr_array[0];
        char* from_B = data_ptr_array[1];
        char* from_out = data_ptr_array[2];
        npy_intp size = *innersize_ptr;
        for(npy_intp i = 0; i < size; ++i, from_A += strides[0], from_B += strides[1], from_out += strides[2]) {
            double sum = *((double*)from_A);
            sum += *((double*)from_B);
            /* memcpy: the destination may not be aligned for a direct double store. */
            memcpy(from_out, &sum, sizeof(double));
        }
    } while(get_next(iterators));
    NpyIter_Deallocate(iterators);
}
/* dgemm /* dgemm */
* NB: See sgemm_ (in alt_sgemm.c) for same assumptions.
* */
void dgemm_(char* TRANSA, char* TRANSB, void dgemm_(char* TRANSA, char* TRANSB,
const int* M, const int* N, const int* K, const int* M, const int* N, const int* K,
const double* ALPHA, double* A, const int* LDA, const double* ALPHA, double* A, const int* LDA,
...@@ -22,38 +57,45 @@ void dgemm_(char* TRANSA, char* TRANSB, ...@@ -22,38 +57,45 @@ void dgemm_(char* TRANSA, char* TRANSB,
double* C, const int* LDC) { double* C, const int* LDC) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0) if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
return; return;
if(C == NULL) int nrowa, ncola, nrowb, ncolb;
return; if(*TRANSA == 'N' || *TRANSA == 'n') {
int ka, kb; nrowa = *M;
if(*TRANSA == 'N' || *TRANSA == 'n') ncola = *K;
ka = *K; } else {
else nrowa = *K;
ka = *M; ncola = *M;
if(*TRANSB == 'N' || *TRANSB == 'n') }
kb = *N; if(*TRANSB == 'N' || *TRANSB == 'n') {
else nrowb = *K;
kb = *K; ncolb = *N;
npy_intp dims_A[2] = {*LDA, ka}; } else {
npy_intp dims_B[2] = {*LDB, kb}; nrowb = *N;
PyObject* matrix_A = PyArray_SimpleNewFromData(2, dims_A, NPY_FLOAT64, A); ncolb = *K;
PyObject* matrix_B = PyArray_SimpleNewFromData(2, dims_B, NPY_FLOAT64, B); }
npy_intp dims_A[2] = {nrowa, ncola};
npy_intp dims_B[2] = {nrowb, ncolb};
npy_intp dims_C[2] = {*M, *N};
npy_intp strides_A[2] = {ncola*8, 8};
npy_intp strides_B[2] = {ncolb*8, 8};
npy_intp strides_C[2] = {(*N)*8, 8};
PyObject* matrix_A = PyArray_New(&PyArray_Type, 2, dims_A, NPY_FLOAT64, strides_A, A, 0, 0, NULL);
PyObject* matrix_B = PyArray_New(&PyArray_Type, 2, dims_B, NPY_FLOAT64, strides_B, B, 0, 0, NULL);
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, NPY_FLOAT64, strides_C, C, 0, NPY_ARRAY_WRITEABLE, NULL);
PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A); PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A);
PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B); PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B);
if(*BETA == 0) { if(*BETA == 0) {
npy_intp dims_C[2] = {*LDC, *N}; PyArray_MatrixProduct2(op_A, op_B, (PyArrayObject*)matrix_C);
PyObject* matrix_C = PyArray_SimpleNewFromData(2, dims_C, NPY_FLOAT64, C); alt_numpy_double_scalar_matrix_product_in_place(*ALPHA, (PyArrayObject*)matrix_C);
alt_matrix_matrix_product2(op_A, op_B, matrix_C);
alt_double_scalar_matrix_product_in_place(*ALPHA, C, (*M) * (*N));
Py_XDECREF(matrix_C);
} else { } else {
PyArrayObject* op_A_times_op_B = (PyArrayObject*)alt_matrix_matrix_product(op_A, op_B); PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
alt_double_scalar_matrix_product_in_place(*ALPHA, (double*)PyArray_DATA(op_A_times_op_B), (*M) * (*N)); alt_numpy_double_scalar_matrix_product_in_place(*ALPHA, op_A_times_op_B);
alt_double_scalar_matrix_product_in_place(*BETA, C, (*M) * (*N)); alt_numpy_double_scalar_matrix_product_in_place(*BETA, (PyArrayObject*)matrix_C);
alt_double_matrix_sum_in_place((double*)PyArray_DATA(op_A_times_op_B), C, C, (*M) * (*N)); alt_numpy_double_matrix_sum(op_A_times_op_B, (PyArrayObject*)matrix_C, (PyArrayObject*)matrix_C);
Py_XDECREF(op_A_times_op_B); Py_XDECREF(op_A_times_op_B);
} }
if(op_B != matrix_B) Py_XDECREF(op_B); if(op_B != matrix_B) Py_XDECREF(op_B);
if(op_A != matrix_A) Py_XDECREF(op_A); if(op_A != matrix_A) Py_XDECREF(op_A);
Py_XDECREF(matrix_C);
Py_XDECREF(matrix_B); Py_XDECREF(matrix_B);
Py_XDECREF(matrix_A); Py_XDECREF(matrix_A);
} }
/** C implementation of sgemm_ based on NumPy.
 * Used instead of BLAS when the Theano config flag blas.ldflags is empty.
 * PS: Comments are the same for the equivalent functions in alt_dgemm.c.
 **/
/* Print an optional message and terminate the process.
 * Used when a NumPy iterator or array cannot be created: there is no way to
 * report failure through the void BLAS-style gemm interface. */
void alt_fatal_error(const char* message) {
    /* Report on stderr so the message is not lost in buffered stdout. */
    if(message != NULL)
        fprintf(stderr, "%s\n", message);
    /* EXIT_FAILURE is portable; exit(-1) yields an implementation-defined status. */
    exit(EXIT_FAILURE);
}
inline PyObject* alt_matrix_matrix_product(PyObject* o1, PyObject* o2) { void alt_numpy_scalar_matrix_product_in_place(float scalar, PyArrayObject* matrix) {
return PyArray_MatrixProduct(o1, o2); // Get an iterator on matrix.
} NpyIter* iterator = NpyIter_New(matrix,
inline PyObject* alt_matrix_matrix_product2(PyObject* o1, PyObject* o2, PyObject* out) { NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
return PyArray_MatrixProduct2(o1, o2, (PyArrayObject*)out); NPY_KEEPORDER, NPY_NO_CASTING, NULL);
} if(iterator == NULL)
void alt_scalar_matrix_product_in_place(float scalar, float* matrix, int size_to_compute) { alt_fatal_error("Unable to iterate over a matrix for a scalar * matrix operation.");
for(int i = 0; i < size_to_compute; ++i) { NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL);
matrix[i] *= scalar; char** data_ptr = NpyIter_GetDataPtrArray(iterator);
npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator);
npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator);
do {
char* data = *data_ptr;
npy_intp stride = *stride_ptr;
npy_intp count = *innersize_ptr;
while(count) {
float new_value = scalar * (*((float*)data));
memcpy(data, &new_value, sizeof(float));
data += stride;
--count;
} }
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
} }
/* Element-wise `out = A + B` for float32 (NPY_FLOAT32) arrays.
 * NB: It may be better to check that A, B and out have the same dimensions;
 * for now, we make this assumption.
 * Aborts the process via alt_fatal_error on iterator-creation failure. */
void alt_numpy_matrix_sum(PyArrayObject* A, PyArrayObject* B, PyArrayObject* out) {
    PyArrayObject* op[3] = {A, B, out};
    npy_uint32 op_flags[3] = {NPY_ITER_READONLY, NPY_ITER_READONLY, NPY_ITER_WRITEONLY};
    npy_uint32 flags = NPY_ITER_EXTERNAL_LOOP;
    NpyIter* iterators = NpyIter_MultiNew(3, op, flags, NPY_KEEPORDER, NPY_NO_CASTING, op_flags, NULL);
    if(iterators == NULL)
        alt_fatal_error("Unable to iterate over some matrices for matrix + matrix operation.");
    NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
    /* NpyIter_GetIterNext may return NULL with a Python error set. */
    if(get_next == NULL) {
        NpyIter_Deallocate(iterators);
        alt_fatal_error("Unable to get an iteration function for matrix + matrix operation.");
    }
    /* Each operand has its own inner stride: do not assume they are all equal
     * (the previous code used operand 0's stride for all three operands). */
    npy_intp* strides = NpyIter_GetInnerStrideArray(iterators);
    npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterators);
    char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
    do {
        char* from_A = data_ptr_array[0];
        char* from_B = data_ptr_array[1];
        char* from_out = data_ptr_array[2];
        npy_intp size = *innersize_ptr;
        for(npy_intp i = 0; i < size; ++i, from_A += strides[0], from_B += strides[1], from_out += strides[2]) {
            float sum = *((float*)from_A);
            sum += *((float*)from_B);
            /* memcpy: the destination may not be aligned for a direct float store. */
            memcpy(from_out, &sum, sizeof(float));
        }
    } while(get_next(iterators));
    NpyIter_Deallocate(iterators);
}
/* Return `matrix` itself (borrowed) for 'N'/'n', otherwise a new reference to
 * its transpose. Callers must DECREF the result only when it differs from the
 * input matrix. */
inline PyObject* alt_op(char* trans, PyArrayObject* matrix) {
    if(*trans == 'N' || *trans == 'n')
        return (PyObject*)matrix;
    return PyArray_Transpose(matrix, NULL);
}
/* sgemm
 * Recall: operation performed:
 * C = ALPHA * op(TRANSA, A) * op(TRANSB, B) + BETA * C
 * NB: We assume that none of the 13 pointers passed as arguments is null.
 * NB: We could optimize this function further (for example, when ALPHA == 0).
 * */
void sgemm_(char* TRANSA, char* TRANSB, void sgemm_(char* TRANSA, char* TRANSB,
const int* M, const int* N, const int* K, const int* M, const int* N, const int* K,
...@@ -35,43 +72,59 @@ void sgemm_(char* TRANSA, char* TRANSB, ...@@ -35,43 +72,59 @@ void sgemm_(char* TRANSA, char* TRANSB,
float* C, const int* LDC) { float* C, const int* LDC) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0) if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
return; return;
if(C == NULL)
return;
/* Recall: /* Recall:
* A is a *LDA by *ka matrix. * op(A) is a m by k matrix.
* B is a *LDB by *kb matrix. * op(B) is a k by n matrix.
* C is a *LDC By *N matrix. * C is a m by n matrix.
*/ */
int ka, kb; int nrowa, ncola, nrowb, ncolb;
if(*TRANSA == 'N' || *TRANSA == 'n') if(*TRANSA == 'N' || *TRANSA == 'n') {
ka = *K; nrowa = *M;
else ncola = *K;
ka = *M; } else {
if(*TRANSB == 'N' || *TRANSB == 'n') nrowa = *K;
kb = *N; ncola = *M;
else }
kb = *K; if(*TRANSB == 'N' || *TRANSB == 'n') {
npy_intp dims_A[2] = {*LDA, ka}; nrowb = *K;
npy_intp dims_B[2] = {*LDB, kb}; ncolb = *N;
PyObject* matrix_A = PyArray_SimpleNewFromData(2, dims_A, NPY_FLOAT32, A); } else {
PyObject* matrix_B = PyArray_SimpleNewFromData(2, dims_B, NPY_FLOAT32, B); nrowb = *N;
ncolb = *K;
}
npy_intp dims_A[2] = {nrowa, ncola};
npy_intp dims_B[2] = {nrowb, ncolb};
npy_intp dims_C[2] = {*M, *N};
/*NB: It seems that A,B and C are always row-major matrices, thus
* the stride for the 1st dimension (from a row to the next row) only depends on the number of elements in a row
* (that is, the size of the 2nd dimension, which is ncola for A, ncolb for B, and *N for C),
* and the stride for the 2nd dimension (from a column to the next column) is always the size of one element
* (that is, 4 bytes for a float32 matrix, 8 bytes for a float64 matrix).
* Then, LDA, LDB and LDC seems totally useless in the strides calcuulations.
* For LDA,LDB,LDC to be taken account, we need to have column-major matrices,
* which seems never to happen. */
npy_intp strides_A[2] = {ncola*4, 4};
npy_intp strides_B[2] = {ncolb*4, 4};
npy_intp strides_C[2] = {(*N)*4, 4};
/*NB: in fact, we could replace the strides with NULL as argument in the 3 following lines.*/
PyObject* matrix_A = PyArray_New(&PyArray_Type, 2, dims_A, NPY_FLOAT32, strides_A, A, 0, 0, NULL);
PyObject* matrix_B = PyArray_New(&PyArray_Type, 2, dims_B, NPY_FLOAT32, strides_B, B, 0, 0, NULL);
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, NPY_FLOAT32, strides_C, C, 0, NPY_ARRAY_WRITEABLE, NULL);
PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A); PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A);
PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B); PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B);
if(*BETA == 0) { if(*BETA == 0) {
npy_intp dims_C[2] = {*LDC, *N}; PyArray_MatrixProduct2(op_A, op_B, (PyArrayObject*)matrix_C);
PyObject* matrix_C = PyArray_SimpleNewFromData(2, dims_C, NPY_FLOAT32, C); alt_numpy_scalar_matrix_product_in_place(*ALPHA, (PyArrayObject*)matrix_C);
alt_matrix_matrix_product2(op_A, op_B, matrix_C);
alt_scalar_matrix_product_in_place(*ALPHA, C, (*M) * (*N));
Py_XDECREF(matrix_C);
} else { } else {
PyArrayObject* op_A_times_op_B = (PyArrayObject*)alt_matrix_matrix_product(op_A, op_B); PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
alt_scalar_matrix_product_in_place(*ALPHA, (float*)PyArray_DATA(op_A_times_op_B), (*M) * (*N)); alt_numpy_scalar_matrix_product_in_place(*ALPHA, op_A_times_op_B);
alt_scalar_matrix_product_in_place(*BETA, C, (*M) * (*N)); alt_numpy_scalar_matrix_product_in_place(*BETA, (PyArrayObject*)matrix_C);
alt_matrix_sum_in_place((float*)PyArray_DATA(op_A_times_op_B), C, C, (*M) * (*N)); alt_numpy_matrix_sum(op_A_times_op_B, (PyArrayObject*)matrix_C, (PyArrayObject*)matrix_C);
Py_XDECREF(op_A_times_op_B); Py_XDECREF(op_A_times_op_B);
} }
if(op_B != matrix_B) Py_XDECREF(op_B); if(op_B != matrix_B) Py_XDECREF(op_B);
if(op_A != matrix_A) Py_XDECREF(op_A); if(op_A != matrix_A) Py_XDECREF(op_A);
Py_XDECREF(matrix_C);
Py_XDECREF(matrix_B); Py_XDECREF(matrix_B);
Py_XDECREF(matrix_A); Py_XDECREF(matrix_A);
} }
...@@ -9,6 +9,7 @@ import logging ...@@ -9,6 +9,7 @@ import logging
import textwrap import textwrap
import sys import sys
import os import os
from os.path import dirname, normpath
from theano import config from theano import config
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
...@@ -734,7 +735,6 @@ def blas_header_text(): ...@@ -734,7 +735,6 @@ def blas_header_text():
const = "const" const = "const"
if not config.blas.ldflags: if not config.blas.ldflags:
# Include the Numpy version implementation of sgemm_ and dgemm_ from alt_sgemm.c and alt_dgemm.c # Include the Numpy version implementation of sgemm_ and dgemm_ from alt_sgemm.c and alt_dgemm.c
from os.path import dirname, normpath
current_filedir = dirname(__file__) current_filedir = dirname(__file__)
sgemm_filepath = normpath(current_filedir + "/alt_sgemm.c") sgemm_filepath = normpath(current_filedir + "/alt_sgemm.c")
dgemm_filepath = normpath(current_filedir + "/alt_dgemm.c") dgemm_filepath = normpath(current_filedir + "/alt_dgemm.c")
...@@ -745,10 +745,10 @@ def blas_header_text(): ...@@ -745,10 +745,10 @@ def blas_header_text():
with open(dgemm_filepath) as code: with open(dgemm_filepath) as code:
dgemm_code = code.read() dgemm_code = code.read()
if not sgemm_code or not dgemm_code: if not sgemm_code or not dgemm_code:
_logger.warning("Unable to load Numpy implementation of gemm code from C source files.") raise IOError("Unable to load NumPy implementation of gemm code from C source files.")
else: else:
const = "" const = ""
# _logger.warning("Numpy implementation of gemm code loaded (config.blas.ldflags is empty)") # _logger.info("Numpy implementation of gemm code loaded (config.blas.ldflags is empty)")
gemm_code += sgemm_code gemm_code += sgemm_code
gemm_code += dgemm_code gemm_code += dgemm_code
......
...@@ -63,7 +63,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -63,7 +63,7 @@ class BaseCorrMM(gof.OpenMPOp):
self.filter_dilation = tuple(filter_dilation) self.filter_dilation = tuple(filter_dilation)
if not theano.config.blas.ldflags: if not theano.config.blas.ldflags:
# raise NotImplementedError("C code for corrMM* classes need a blas library.") # Theano will use a NumPy C implementation of [sd]gemm_ instead.
self.blas_type = '' self.blas_type = ''
else: else:
if 'openblas' in theano.config.blas.ldflags: if 'openblas' in theano.config.blas.ldflags:
......
...@@ -72,7 +72,9 @@ compile.optdb.register('local_inplace_sparse_block_outer', ...@@ -72,7 +72,9 @@ compile.optdb.register('local_inplace_sparse_block_outer',
# Conv opts # Conv opts
@local_optimizer([AbstractConv2d]) @local_optimizer([AbstractConv2d])
def local_abstractconv_gemm(node): def local_abstractconv_gemm(node):
if theano.config.cxx == "": # or not theano.config.blas.ldflags: # If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return return
if not isinstance(node.op, AbstractConv2d): if not isinstance(node.op, AbstractConv2d):
return None return None
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论