提交 9ca9474b authored 作者: notoraptor's avatar notoraptor

Required corrections and modifications are done.

Recall: code is tested with: $ theano-cache purge && THEANO_FLAGS=blas.ldflags= nosetests theano/tensor/nnet/tests/test_abstract_conv.py:TestCorrConv2d NB: 1) dgemm_ is never called in these tests. Only sgemm_ is called. 2) All LDA,LDB,LDC are always in the set {M;N;K} during these tests.
上级 6b3afd89
/** C Implementation of dgemm_ based on NumPy.
 * Used instead of BLAS when the Theano config flag blas.ldflags is empty.
 * PS: For further comments, see the equivalent functions in alt_sgemm.c.
 **/
/* Multiply each of the first `size_to_compute` elements of `matrix` by `scalar`, in place.
 * Assumes `matrix` points to a contiguous buffer of at least `size_to_compute` doubles. */
void alt_double_scalar_matrix_product_in_place(double scalar, double* matrix, int size_to_compute) {
    for(int i = 0; i < size_to_compute; ++i) {
        matrix[i] *= scalar;
    }
}
/* In-place multiplication of every element of `matrix` (float64) by `scalar`.
 * A NumPy array iterator is used, so any memory layout of `matrix` is supported. */
void alt_numpy_double_scalar_matrix_product_in_place(double scalar, PyArrayObject* matrix) {
    NpyIter* it = NpyIter_New(matrix,
                              NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
                              NPY_KEEPORDER, NPY_NO_CASTING, NULL);
    if(it == NULL)
        alt_fatal_error("Unable to iterate over a matrix for a scalar * matrix operation.");
    NpyIter_IterNextFunc* iternext = NpyIter_GetIterNext(it, NULL);
    char** base_ptr = NpyIter_GetDataPtrArray(it);
    npy_intp* step_ptr = NpyIter_GetInnerStrideArray(it);
    npy_intp* size_ptr = NpyIter_GetInnerLoopSizePtr(it);
    do {
        char* cursor = *base_ptr;
        npy_intp step = *step_ptr;
        for(npy_intp remaining = *size_ptr; remaining > 0; --remaining) {
            /* Write back through memcpy, as in the other in-place helpers of this file. */
            double scaled = scalar * (*((double*)cursor));
            memcpy(cursor, &scaled, sizeof(double));
            cursor += step;
        }
    } while(iternext(it));
    NpyIter_Deallocate(it);
}
/* Element-wise sum of two double buffers: out[i] = A[i] + B[i] for the first
 * `size_to_compute` elements. `out` may alias `A` and/or `B` (each element is
 * read before it is written), which is how dgemm_ uses it to accumulate into C. */
void alt_double_matrix_sum_in_place(double* A, double* B, double* out, int size_to_compute) {
    for(int i = 0; i < size_to_compute; ++i) {
        out[i] = A[i] + B[i];
    }
}
/* Element-wise sum of float64 matrices: out = A + B, via a NumPy multi-iterator.
 * NB: A, B and out are assumed to have matching shapes — not checked here. */
void alt_numpy_double_matrix_sum(PyArrayObject* A, PyArrayObject* B, PyArrayObject* out) {
    PyArrayObject* operands[3] = {A, B, out};
    npy_uint32 per_op_flags[3] = {NPY_ITER_READONLY, NPY_ITER_READONLY, NPY_ITER_WRITEONLY};
    NpyIter* it = NpyIter_MultiNew(3, operands, NPY_ITER_EXTERNAL_LOOP,
                                   NPY_KEEPORDER, NPY_NO_CASTING, per_op_flags, NULL);
    if(it == NULL)
        alt_fatal_error("Unable to iterate over some matrices for matrix + matrix operation.");
    NpyIter_IterNextFunc* iternext = NpyIter_GetIterNext(it, NULL);
    npy_intp step = NpyIter_GetInnerStrideArray(it)[0];
    npy_intp* size_ptr = NpyIter_GetInnerLoopSizePtr(it);
    char** cursors = NpyIter_GetDataPtrArray(it);
    do {
        char* a_ptr = cursors[0];
        char* b_ptr = cursors[1];
        char* out_ptr = cursors[2];
        npy_intp remaining = *size_ptr;
        while(remaining--) {
            double total = *((double*)a_ptr) + *((double*)b_ptr);
            memcpy(out_ptr, &total, sizeof(double));
            a_ptr += step;
            b_ptr += step;
            out_ptr += step;
        }
    } while(iternext(it));
    NpyIter_Deallocate(it);
}
/* dgemm
* NB: See sgemm_ (in alt_sgemm.c) for same assumptions.
* */
/* dgemm */
void dgemm_(char* TRANSA, char* TRANSB,
const int* M, const int* N, const int* K,
const double* ALPHA, double* A, const int* LDA,
double* B, const int* LDB, const double* BETA,
double* C, const int* LDC) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
return;
if(C == NULL)
return;
int ka, kb;
if(*TRANSA == 'N' || *TRANSA == 'n')
ka = *K;
else
ka = *M;
if(*TRANSB == 'N' || *TRANSB == 'n')
kb = *N;
else
kb = *K;
npy_intp dims_A[2] = {*LDA, ka};
npy_intp dims_B[2] = {*LDB, kb};
PyObject* matrix_A = PyArray_SimpleNewFromData(2, dims_A, NPY_FLOAT64, A);
PyObject* matrix_B = PyArray_SimpleNewFromData(2, dims_B, NPY_FLOAT64, B);
PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A);
PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B);
if(*BETA == 0) {
npy_intp dims_C[2] = {*LDC, *N};
PyObject* matrix_C = PyArray_SimpleNewFromData(2, dims_C, NPY_FLOAT64, C);
alt_matrix_matrix_product2(op_A, op_B, matrix_C);
alt_double_scalar_matrix_product_in_place(*ALPHA, C, (*M) * (*N));
Py_XDECREF(matrix_C);
} else {
PyArrayObject* op_A_times_op_B = (PyArrayObject*)alt_matrix_matrix_product(op_A, op_B);
alt_double_scalar_matrix_product_in_place(*ALPHA, (double*)PyArray_DATA(op_A_times_op_B), (*M) * (*N));
alt_double_scalar_matrix_product_in_place(*BETA, C, (*M) * (*N));
alt_double_matrix_sum_in_place((double*)PyArray_DATA(op_A_times_op_B), C, C, (*M) * (*N));
Py_XDECREF(op_A_times_op_B);
}
if(op_B != matrix_B) Py_XDECREF(op_B);
if(op_A != matrix_A) Py_XDECREF(op_A);
Py_XDECREF(matrix_B);
Py_XDECREF(matrix_A);
const int* M, const int* N, const int* K,
const double* ALPHA, double* A, const int* LDA,
double* B, const int* LDB, const double* BETA,
double* C, const int* LDC) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
return;
int nrowa, ncola, nrowb, ncolb;
if(*TRANSA == 'N' || *TRANSA == 'n') {
nrowa = *M;
ncola = *K;
} else {
nrowa = *K;
ncola = *M;
}
if(*TRANSB == 'N' || *TRANSB == 'n') {
nrowb = *K;
ncolb = *N;
} else {
nrowb = *N;
ncolb = *K;
}
npy_intp dims_A[2] = {nrowa, ncola};
npy_intp dims_B[2] = {nrowb, ncolb};
npy_intp dims_C[2] = {*M, *N};
npy_intp strides_A[2] = {ncola*8, 8};
npy_intp strides_B[2] = {ncolb*8, 8};
npy_intp strides_C[2] = {(*N)*8, 8};
PyObject* matrix_A = PyArray_New(&PyArray_Type, 2, dims_A, NPY_FLOAT64, strides_A, A, 0, 0, NULL);
PyObject* matrix_B = PyArray_New(&PyArray_Type, 2, dims_B, NPY_FLOAT64, strides_B, B, 0, 0, NULL);
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, NPY_FLOAT64, strides_C, C, 0, NPY_ARRAY_WRITEABLE, NULL);
PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A);
PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B);
if(*BETA == 0) {
PyArray_MatrixProduct2(op_A, op_B, (PyArrayObject*)matrix_C);
alt_numpy_double_scalar_matrix_product_in_place(*ALPHA, (PyArrayObject*)matrix_C);
} else {
PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
alt_numpy_double_scalar_matrix_product_in_place(*ALPHA, op_A_times_op_B);
alt_numpy_double_scalar_matrix_product_in_place(*BETA, (PyArrayObject*)matrix_C);
alt_numpy_double_matrix_sum(op_A_times_op_B, (PyArrayObject*)matrix_C, (PyArrayObject*)matrix_C);
Py_XDECREF(op_A_times_op_B);
}
if(op_B != matrix_B) Py_XDECREF(op_B);
if(op_A != matrix_A) Py_XDECREF(op_A);
Py_XDECREF(matrix_C);
Py_XDECREF(matrix_B);
Py_XDECREF(matrix_A);
}
/** C Implementation of sgemm_ based on NumPy.
 * Used instead of BLAS when the Theano config flag blas.ldflags is empty.
 * PS: Comments are the same for the equivalent functions in alt_dgemm.c.
 **/
/* Return a transposed view of `o` (PyArray_Transpose with a NULL permutation
 * reverses the axes). Returns a new reference. */
inline PyObject* alt_transpose(PyArrayObject* o) {
    return PyArray_Transpose(o, NULL);
}
/* Print `message` (when non-NULL) and terminate the process: these BLAS
 * replacements have no error-return channel, so failures are fatal. */
void alt_fatal_error(const char* message) {
    if(message != NULL) puts(message);
    exit(-1);
}
inline PyObject* alt_matrix_matrix_product(PyObject* o1, PyObject* o2) {
return PyArray_MatrixProduct(o1, o2);
void alt_numpy_scalar_matrix_product_in_place(float scalar, PyArrayObject* matrix) {
// Get an iterator on matrix.
NpyIter* iterator = NpyIter_New(matrix,
NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
NPY_KEEPORDER, NPY_NO_CASTING, NULL);
if(iterator == NULL)
alt_fatal_error("Unable to iterate over a matrix for a scalar * matrix operation.");
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL);
char** data_ptr = NpyIter_GetDataPtrArray(iterator);
npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator);
npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator);
do {
char* data = *data_ptr;
npy_intp stride = *stride_ptr;
npy_intp count = *innersize_ptr;
while(count) {
float new_value = scalar * (*((float*)data));
memcpy(data, &new_value, sizeof(float));
data += stride;
--count;
}
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
}
/* Wrapper around PyArray_MatrixProduct2: stores the matrix product of o1 and o2
 * into `out` and returns the result object. */
inline PyObject* alt_matrix_matrix_product2(PyObject* o1, PyObject* o2, PyObject* out) {
    PyArrayObject* destination = (PyArrayObject*)out;
    return PyArray_MatrixProduct2(o1, o2, destination);
}
/* Scale the first `size_to_compute` floats of `matrix` by `scalar`, in place.
 * Assumes `matrix` points to a contiguous buffer of at least `size_to_compute` floats. */
void alt_scalar_matrix_product_in_place(float scalar, float* matrix, int size_to_compute) {
    float* cursor = matrix;
    float* const end = matrix + size_to_compute;
    while(cursor != end) {
        *cursor++ *= scalar;
    }
}
/* Element-wise sum of two float buffers: out[i] = A[i] + B[i] for the first
 * `size_to_compute` elements. `out` may alias `A` and/or `B` (each element is
 * read before it is written), which is how sgemm_ uses it to accumulate into C. */
void alt_matrix_sum_in_place(float* A, float* B, float* out, int size_to_compute) {
    for(int i = 0; i < size_to_compute; ++i) {
        out[i] = A[i] + B[i];
    }
}
/* Element-wise sum of float32 matrices: out = A + B, via a NumPy multi-iterator.
 * NB: It may be better to check that A, B and out have the same dimensions.
 * But for now, we made this assumption. */
void alt_numpy_matrix_sum(PyArrayObject* A, PyArrayObject* B, PyArrayObject* out) {
    PyArrayObject* operands[3] = {A, B, out};
    npy_uint32 per_op_flags[3] = {NPY_ITER_READONLY, NPY_ITER_READONLY, NPY_ITER_WRITEONLY};
    NpyIter* it = NpyIter_MultiNew(3, operands, NPY_ITER_EXTERNAL_LOOP,
                                   NPY_KEEPORDER, NPY_NO_CASTING, per_op_flags, NULL);
    if(it == NULL)
        alt_fatal_error("Unable to iterate over some matrices for matrix + matrix operation.");
    NpyIter_IterNextFunc* iternext = NpyIter_GetIterNext(it, NULL);
    npy_intp step = NpyIter_GetInnerStrideArray(it)[0];
    npy_intp* size_ptr = NpyIter_GetInnerLoopSizePtr(it);
    char** cursors = NpyIter_GetDataPtrArray(it);
    do {
        char* a_ptr = cursors[0];
        char* b_ptr = cursors[1];
        char* out_ptr = cursors[2];
        npy_intp remaining = *size_ptr;
        while(remaining--) {
            float total = *((float*)a_ptr) + *((float*)b_ptr);
            memcpy(out_ptr, &total, sizeof(float));
            a_ptr += step;
            b_ptr += step;
            out_ptr += step;
        }
    } while(iternext(it));
    NpyIter_Deallocate(it);
}
/* Apply the BLAS TRANS flag: return `matrix` itself (borrowed reference) when
 * `trans` is 'N'/'n', otherwise a transposed view (new reference). Callers must
 * DECREF the result only when it differs from `matrix`.
 * Fix: the diff-merged source contained two duplicated return statements, the
 * second unreachable; a single return is kept. */
inline PyObject* alt_op(char* trans, PyArrayObject* matrix) {
    return (*trans == 'N' || *trans == 'n') ? (PyObject*)matrix : PyArray_Transpose(matrix, NULL);
}
/* sgemm
* We assume that none of these 13 pointers passed as arguments is null.
* NB: We can more optimize this function (for example, when alpha == 0).
* Recall: operation performed:
* C = ALPHA * op(TRANSA, A) * op(TRANSB, B) + BETA * C
* NB: We assume that none of the 13 pointers passed as arguments is null.
* NB: We can more optimize this function (for example, when ALPHA == 0).
* */
void sgemm_(char* TRANSA, char* TRANSB,
const int* M, const int* N, const int* K,
const float* ALPHA, float* A, const int* LDA,
float* B, const int* LDB, const float* BETA,
float* C, const int* LDC) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
return;
if(C == NULL)
return;
/* Recall:
* A is a *LDA by *ka matrix.
* B is a *LDB by *kb matrix.
* C is a *LDC By *N matrix.
*/
int ka, kb;
if(*TRANSA == 'N' || *TRANSA == 'n')
ka = *K;
else
ka = *M;
if(*TRANSB == 'N' || *TRANSB == 'n')
kb = *N;
else
kb = *K;
npy_intp dims_A[2] = {*LDA, ka};
npy_intp dims_B[2] = {*LDB, kb};
PyObject* matrix_A = PyArray_SimpleNewFromData(2, dims_A, NPY_FLOAT32, A);
PyObject* matrix_B = PyArray_SimpleNewFromData(2, dims_B, NPY_FLOAT32, B);
PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A);
PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B);
if(*BETA == 0) {
npy_intp dims_C[2] = {*LDC, *N};
PyObject* matrix_C = PyArray_SimpleNewFromData(2, dims_C, NPY_FLOAT32, C);
alt_matrix_matrix_product2(op_A, op_B, matrix_C);
alt_scalar_matrix_product_in_place(*ALPHA, C, (*M) * (*N));
Py_XDECREF(matrix_C);
} else {
PyArrayObject* op_A_times_op_B = (PyArrayObject*)alt_matrix_matrix_product(op_A, op_B);
alt_scalar_matrix_product_in_place(*ALPHA, (float*)PyArray_DATA(op_A_times_op_B), (*M) * (*N));
alt_scalar_matrix_product_in_place(*BETA, C, (*M) * (*N));
alt_matrix_sum_in_place((float*)PyArray_DATA(op_A_times_op_B), C, C, (*M) * (*N));
Py_XDECREF(op_A_times_op_B);
}
if(op_B != matrix_B) Py_XDECREF(op_B);
if(op_A != matrix_A) Py_XDECREF(op_A);
Py_XDECREF(matrix_B);
Py_XDECREF(matrix_A);
const int* M, const int* N, const int* K,
const float* ALPHA, float* A, const int* LDA,
float* B, const int* LDB, const float* BETA,
float* C, const int* LDC) {
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
return;
/* Recall:
* op(A) is a m by k matrix.
* op(B) is a k by n matrix.
* C is a m by n matrix.
*/
int nrowa, ncola, nrowb, ncolb;
if(*TRANSA == 'N' || *TRANSA == 'n') {
nrowa = *M;
ncola = *K;
} else {
nrowa = *K;
ncola = *M;
}
if(*TRANSB == 'N' || *TRANSB == 'n') {
nrowb = *K;
ncolb = *N;
} else {
nrowb = *N;
ncolb = *K;
}
npy_intp dims_A[2] = {nrowa, ncola};
npy_intp dims_B[2] = {nrowb, ncolb};
npy_intp dims_C[2] = {*M, *N};
/*NB: It seems that A,B and C are always row-major matrices, thus
* the stride for the 1st dimension (from a row to the next row) only depends on the number of elements in a row
* (that is, the size of the 2nd dimension, which is ncola for A, ncolb for B, and *N for C),
* and the stride for the 2nd dimension (from a column to the next column) is always the size of one element
* (that is, 4 bytes for a float32 matrix, 8 bytes for a float64 matrix).
* Then, LDA, LDB and LDC seems totally useless in the strides calcuulations.
* For LDA,LDB,LDC to be taken account, we need to have column-major matrices,
* which seems never to happen. */
npy_intp strides_A[2] = {ncola*4, 4};
npy_intp strides_B[2] = {ncolb*4, 4};
npy_intp strides_C[2] = {(*N)*4, 4};
/*NB: in fact, we could replace the strides with NULL as argument in the 3 following lines.*/
PyObject* matrix_A = PyArray_New(&PyArray_Type, 2, dims_A, NPY_FLOAT32, strides_A, A, 0, 0, NULL);
PyObject* matrix_B = PyArray_New(&PyArray_Type, 2, dims_B, NPY_FLOAT32, strides_B, B, 0, 0, NULL);
PyObject* matrix_C = PyArray_New(&PyArray_Type, 2, dims_C, NPY_FLOAT32, strides_C, C, 0, NPY_ARRAY_WRITEABLE, NULL);
PyObject* op_A = alt_op(TRANSA, (PyArrayObject*)matrix_A);
PyObject* op_B = alt_op(TRANSB, (PyArrayObject*)matrix_B);
if(*BETA == 0) {
PyArray_MatrixProduct2(op_A, op_B, (PyArrayObject*)matrix_C);
alt_numpy_scalar_matrix_product_in_place(*ALPHA, (PyArrayObject*)matrix_C);
} else {
PyArrayObject* op_A_times_op_B = (PyArrayObject*)PyArray_MatrixProduct(op_A, op_B);
alt_numpy_scalar_matrix_product_in_place(*ALPHA, op_A_times_op_B);
alt_numpy_scalar_matrix_product_in_place(*BETA, (PyArrayObject*)matrix_C);
alt_numpy_matrix_sum(op_A_times_op_B, (PyArrayObject*)matrix_C, (PyArrayObject*)matrix_C);
Py_XDECREF(op_A_times_op_B);
}
if(op_B != matrix_B) Py_XDECREF(op_B);
if(op_A != matrix_A) Py_XDECREF(op_A);
Py_XDECREF(matrix_C);
Py_XDECREF(matrix_B);
Py_XDECREF(matrix_A);
}
......@@ -9,6 +9,7 @@ import logging
import textwrap
import sys
import os
from os.path import dirname, normpath
from theano import config
from theano.gof.cmodule import GCC_compiler
......@@ -734,7 +735,6 @@ def blas_header_text():
const = "const"
if not config.blas.ldflags:
# Include the Numpy version implementation of sgemm_ and dgemm_ from alt_sgemm.c and alt_dgemm.c
from os.path import dirname, normpath
current_filedir = dirname(__file__)
sgemm_filepath = normpath(current_filedir + "/alt_sgemm.c")
dgemm_filepath = normpath(current_filedir + "/alt_dgemm.c")
......@@ -745,10 +745,10 @@ def blas_header_text():
with open(dgemm_filepath) as code:
dgemm_code = code.read()
if not sgemm_code or not dgemm_code:
_logger.warning("Unable to load Numpy implementation of gemm code from C source files.")
raise IOError("Unable to load NumPy implementation of gemm code from C source files.")
else:
const = ""
# _logger.warning("Numpy implementation of gemm code loaded (config.blas.ldflags is empty)")
# _logger.info("Numpy implementation of gemm code loaded (config.blas.ldflags is empty)")
gemm_code += sgemm_code
gemm_code += dgemm_code
......
......@@ -63,7 +63,7 @@ class BaseCorrMM(gof.OpenMPOp):
self.filter_dilation = tuple(filter_dilation)
if not theano.config.blas.ldflags:
# raise NotImplementedError("C code for corrMM* classes need a blas library.")
# Theano will use a NumPy C implementation of [sd]gemm_ instead.
self.blas_type = ''
else:
if 'openblas' in theano.config.blas.ldflags:
......
......@@ -72,7 +72,9 @@ compile.optdb.register('local_inplace_sparse_block_outer',
# Conv opts
@local_optimizer([AbstractConv2d])
def local_abstractconv_gemm(node):
if theano.config.cxx == "": # or not theano.config.blas.ldflags:
# If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return
if not isinstance(node.op, AbstractConv2d):
return None
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论