提交 6fd65836 authored 作者: notoraptor's avatar notoraptor

Fix alt blas code, make sure output is set to 0 when beta is 0.

上级 8ee15d0c
...@@ -1033,7 +1033,7 @@ def openblas_threads_text(): ...@@ -1033,7 +1033,7 @@ def openblas_threads_text():
def blas_header_version(): def blas_header_version():
# Version for the base header # Version for the base header
version = (7,) version = (8,)
if detect_macos_sdot_bug(): if detect_macos_sdot_bug():
if detect_macos_sdot_bug.fix_works: if detect_macos_sdot_bug.fix_works:
# Version with fix # Version with fix
......
/** Alternative template NumPy-based implementation of BLAS functions used in Theano. **/ /** Alternative template NumPy-based implementation of BLAS functions used in Theano. **/
/* Compute tensor[i] = scalar for every position i in tensor.
 *
 * tensor: array overwritten in place; each element is stored as a
 *         float_type value.
 * scalar: pointer to the value written into every element.
 *
 * Calls alt_fatal_error() when a NumPy iterator cannot be created. */
void alt_numpy_memset_inplace_%(float_type)s(PyArrayObject* tensor, const %(float_type)s* scalar) {
    /* PyArray_FILLWBYTE writes one byte value into every byte of the buffer,
     * so it only produces the requested element value when that value is
     * made of identical bytes. Zero is the only such value we can rely on
     * (assuming float_type is an IEEE floating-point type - TODO confirm).
     * Testing "scalar survives a round trip through char" is not enough:
     * 1.0 passes that test, yet filling with byte 0x01 does not store 1.0. */
    if (PyArray_IS_C_CONTIGUOUS(tensor) && *scalar == 0) {
        // This will use memset.
        PyArray_FILLWBYTE(tensor, 0);
        return;
    }
    NpyIter* iterator = NpyIter_New(tensor,
        NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
        NPY_KEEPORDER, NPY_NO_CASTING, NULL);
    if(iterator == NULL)
        alt_fatal_error("Unable to iterate over a tensor for a memory assignation.");
    NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL);
    char** data_ptr = NpyIter_GetDataPtrArray(iterator);
    npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator);
    npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator);
    do {
        // With NPY_ITER_EXTERNAL_LOOP the inner dimension is walked here:
        // re-read pointer, stride and count on every outer step.
        char* data = *data_ptr;
        npy_intp stride = *stride_ptr;
        npy_intp count = *innersize_ptr;
        while(count) {
            *((%(float_type)s*)data) = *scalar;
            data += stride;  // stride is in bytes, hence the char* cursor.
            --count;
        }
    } while(get_next(iterator));
    NpyIter_Deallocate(iterator);
}
/* Scalar * Matrix function. /* Scalar * Matrix function.
* Computes: matrix = scalar * matrix. */ * Computes: matrix = scalar * matrix. */
void alt_numpy_scale_matrix_inplace_%(float_type)s(const %(float_type)s* scalar, PyArrayObject* matrix) { void alt_numpy_scale_matrix_inplace_%(float_type)s(const %(float_type)s* scalar, PyArrayObject* matrix) {
if (*scalar == 1)
return;
if (*scalar == 0) {
alt_numpy_memset_inplace_%(float_type)s(matrix, scalar);
return;
}
NpyIter* iterator = NpyIter_New(matrix, NpyIter* iterator = NpyIter_New(matrix,
NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
NPY_KEEPORDER, NPY_NO_CASTING, NULL); NPY_KEEPORDER, NPY_NO_CASTING, NULL);
...@@ -32,6 +67,14 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s( ...@@ -32,6 +67,14 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
const %(float_type)s* scalar1, PyArrayObject* matrix1, const %(float_type)s* scalar1, PyArrayObject* matrix1,
const %(float_type)s* scalar2, PyArrayObject* matrix2 const %(float_type)s* scalar2, PyArrayObject* matrix2
) { ) {
if (*scalar1 == 0 && *scalar2 == 0) {
alt_numpy_memset_inplace_%(float_type)s(matrix2, scalar2);
return;
}
if (*scalar1 == 0) {
alt_numpy_scale_matrix_inplace_%(float_type)s(scalar2, matrix2);
return;
}
PyArrayObject* op[2] = {matrix1, matrix2}; PyArrayObject* op[2] = {matrix1, matrix2};
npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE}; npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE};
npy_uint32 flags = 0; npy_uint32 flags = 0;
...@@ -42,11 +85,19 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s( ...@@ -42,11 +85,19 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
"for matrix + matrix operation."); "for matrix + matrix operation.");
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL); NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
char** data_ptr_array = NpyIter_GetDataPtrArray(iterators); char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
do { if (*scalar2 == 0) {
%(float_type)s* from_matrix1 = (%(float_type)s*)data_ptr_array[0]; do {
%(float_type)s* from_matrix2 = (%(float_type)s*)data_ptr_array[1]; %(float_type)s* from_matrix1 = (%(float_type)s*)data_ptr_array[0];
*from_matrix2 = (*scalar1)*(*from_matrix1) + (*scalar2)*(*from_matrix2); %(float_type)s* from_matrix2 = (%(float_type)s*)data_ptr_array[1];
} while(get_next(iterators)); *from_matrix2 = (*scalar1)*(*from_matrix1);
} while(get_next(iterators));
} else {
do {
%(float_type)s* from_matrix1 = (%(float_type)s*)data_ptr_array[0];
%(float_type)s* from_matrix2 = (%(float_type)s*)data_ptr_array[1];
*from_matrix2 = (*scalar1)*(*from_matrix1) + (*scalar2)*(*from_matrix2);
} while(get_next(iterators));
}
NpyIter_Deallocate(iterators); NpyIter_Deallocate(iterators);
} }
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论