提交 5baab3f1 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6467 from notoraptor/fix-alt-blas

Improve alternative BLAS code and add tests for alternative gemm function.
#section support_code_apply #section support_code_apply
int gpu_dimshuffle(PyGpuArrayObject* input, PyGpuArrayObject** out, PARAMS_TYPE* params) { int APPLY_SPECIFIC(gpu_dimshuffle)(PyGpuArrayObject* input, PyGpuArrayObject** out, PARAMS_TYPE* params) {
PyGpuArrayObject *tmp = NULL; PyGpuArrayObject *tmp = NULL;
npy_intp nd_in = PyArray_SIZE(params->input_broadcastable); npy_intp nd_in = PyArray_SIZE(params->input_broadcastable);
npy_intp nd_out = PyArray_SIZE(params->_new_order); npy_intp nd_out = PyArray_SIZE(params->_new_order);
......
...@@ -409,7 +409,7 @@ class GpuDimShuffle(DimShuffle): ...@@ -409,7 +409,7 @@ class GpuDimShuffle(DimShuffle):
""" """
_f16_ok = True _f16_ok = True
c_func_name = 'gpu_dimshuffle' c_func_name = 'APPLY_SPECIFIC(gpu_dimshuffle)'
def make_node(self, input): def make_node(self, input):
ctx_name = infer_context_name(input) ctx_name = infer_context_name(input)
......
...@@ -1033,7 +1033,7 @@ def openblas_threads_text(): ...@@ -1033,7 +1033,7 @@ def openblas_threads_text():
def blas_header_version(): def blas_header_version():
# Version for the base header # Version for the base header
version = (7,) version = (9,)
if detect_macos_sdot_bug(): if detect_macos_sdot_bug():
if detect_macos_sdot_bug.fix_works: if detect_macos_sdot_bug.fix_works:
# Version with fix # Version with fix
......
/** Alternative template NumPy-based implementation of BLAS functions used in Theano. **/ /** Alternative template NumPy-based implementation of BLAS functions used in Theano. **/
/* Compute matrix[i][j] = scalar for every position (i, j) in matrix. */
void alt_numpy_memset_inplace_%(float_type)s(PyArrayObject* matrix, const %(float_type)s* scalar) {
if (PyArray_IS_C_CONTIGUOUS(matrix) && *scalar == (char)(*scalar)) {
// This will use memset.
PyArray_FILLWBYTE(matrix, (char)(*scalar));
return;
}
NpyIter* iterator = NpyIter_New(matrix,
NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
NPY_KEEPORDER, NPY_NO_CASTING, NULL);
if(iterator == NULL)
alt_fatal_error("Unable to iterate over a matrix for a memory assignation.");
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL);
char** data_ptr = NpyIter_GetDataPtrArray(iterator);
npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator);
npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator);
do {
char* data = *data_ptr;
npy_intp stride = *stride_ptr;
npy_intp count = *innersize_ptr;
while(count) {
*((%(float_type)s*)data) = *scalar;
data += stride;
--count;
}
} while(get_next(iterator));
NpyIter_Deallocate(iterator);
}
/* Scalar * Matrix function. /* Scalar * Matrix function.
* Computes: matrix = scalar * matrix. */ * Computes: matrix = scalar * matrix. */
void alt_numpy_scale_matrix_inplace_%(float_type)s(const %(float_type)s* scalar, PyArrayObject* matrix) { void alt_numpy_scale_matrix_inplace_%(float_type)s(const %(float_type)s* scalar, PyArrayObject* matrix) {
if (*scalar == 1)
return;
if (*scalar == 0) {
alt_numpy_memset_inplace_%(float_type)s(matrix, scalar);
return;
}
NpyIter* iterator = NpyIter_New(matrix, NpyIter* iterator = NpyIter_New(matrix,
NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
NPY_KEEPORDER, NPY_NO_CASTING, NULL); NPY_KEEPORDER, NPY_NO_CASTING, NULL);
...@@ -32,6 +67,14 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s( ...@@ -32,6 +67,14 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
const %(float_type)s* scalar1, PyArrayObject* matrix1, const %(float_type)s* scalar1, PyArrayObject* matrix1,
const %(float_type)s* scalar2, PyArrayObject* matrix2 const %(float_type)s* scalar2, PyArrayObject* matrix2
) { ) {
if (*scalar1 == 0 && *scalar2 == 0) {
alt_numpy_memset_inplace_%(float_type)s(matrix2, scalar2);
return;
}
if (*scalar1 == 0) {
alt_numpy_scale_matrix_inplace_%(float_type)s(scalar2, matrix2);
return;
}
PyArrayObject* op[2] = {matrix1, matrix2}; PyArrayObject* op[2] = {matrix1, matrix2};
npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE}; npy_uint32 op_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE};
npy_uint32 flags = 0; npy_uint32 flags = 0;
...@@ -42,11 +85,19 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s( ...@@ -42,11 +85,19 @@ void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
"for matrix + matrix operation."); "for matrix + matrix operation.");
NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL); NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterators, NULL);
char** data_ptr_array = NpyIter_GetDataPtrArray(iterators); char** data_ptr_array = NpyIter_GetDataPtrArray(iterators);
if (*scalar2 == 0) {
do {
%(float_type)s* from_matrix1 = (%(float_type)s*)data_ptr_array[0];
%(float_type)s* from_matrix2 = (%(float_type)s*)data_ptr_array[1];
*from_matrix2 = (*scalar1)*(*from_matrix1);
} while(get_next(iterators));
} else {
do { do {
%(float_type)s* from_matrix1 = (%(float_type)s*)data_ptr_array[0]; %(float_type)s* from_matrix1 = (%(float_type)s*)data_ptr_array[0];
%(float_type)s* from_matrix2 = (%(float_type)s*)data_ptr_array[1]; %(float_type)s* from_matrix2 = (%(float_type)s*)data_ptr_array[1];
*from_matrix2 = (*scalar1)*(*from_matrix1) + (*scalar2)*(*from_matrix2); *from_matrix2 = (*scalar1)*(*from_matrix1) + (*scalar2)*(*from_matrix2);
} while(get_next(iterators)); } while(get_next(iterators));
}
NpyIter_Deallocate(iterators); NpyIter_Deallocate(iterators);
} }
......
#section support_code_apply #section support_code_apply
int cpu_dimshuffle(PyArrayObject* input, PyArrayObject** res, PARAMS_TYPE* params) { int APPLY_SPECIFIC(cpu_dimshuffle)(PyArrayObject* input, PyArrayObject** res, PARAMS_TYPE* params) {
npy_bool* input_broadcastable; npy_bool* input_broadcastable;
npy_int64* new_order; npy_int64* new_order;
npy_intp nd_in; npy_intp nd_in;
......
...@@ -131,7 +131,7 @@ class DimShuffle(COp): ...@@ -131,7 +131,7 @@ class DimShuffle(COp):
check_input = False check_input = False
__props__ = ("input_broadcastable", "new_order", "inplace") __props__ = ("input_broadcastable", "new_order", "inplace")
c_func_file = 'c_code/dimshuffle.c' c_func_file = 'c_code/dimshuffle.c'
c_func_name = 'cpu_dimshuffle' c_func_name = 'APPLY_SPECIFIC(cpu_dimshuffle)'
@property @property
def params_type(self): def params_type(self):
......
...@@ -7,7 +7,7 @@ import numpy as np ...@@ -7,7 +7,7 @@ import numpy as np
from numpy import (arange, array, common_type, complex64, complex128, float32, from numpy import (arange, array, common_type, complex64, complex128, float32,
float64, newaxis, shape, transpose, zeros) float64, newaxis, shape, transpose, zeros)
from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_almost_equal
from itertools import product
from six.moves import xrange from six.moves import xrange
import theano import theano
...@@ -373,6 +373,97 @@ class t_gemm(TestCase): ...@@ -373,6 +373,97 @@ class t_gemm(TestCase):
1, 0, 2)), dt='float32') 1, 0, 2)), dt='float32')
class TestGemmNoFlags(object):
gemm = gemm_no_inplace
M = 4
N = 5
K = 6
slice_step = 3
def setUp(self):
unittest_tools.seed_rng()
def get_variable(self, V, to_transpose, to_slice):
if to_transpose:
V = V.T
if to_slice:
V = V[::self.slice_step]
return V
def get_function(self, dtype,
transpose_A=False, transpose_B=False, transpose_C=False,
slice_A=False, slice_B=False, slice_C=False):
alpha = theano.tensor.scalar(dtype=dtype, name='alpha')
beta = theano.tensor.scalar(dtype=dtype, name='beta')
A = theano.tensor.matrix(dtype=dtype, name='A')
B = theano.tensor.matrix(dtype=dtype, name='B')
C = theano.tensor.matrix(dtype=dtype, name='C')
A1 = self.get_variable(A, transpose_A, slice_A)
B1 = self.get_variable(B, transpose_B, slice_B)
C1 = self.get_variable(C, transpose_C, slice_C)
return theano.function([alpha, A, B, beta, C], self.gemm(C1, alpha, A1, B1, beta))
def generate_value(self, dtype, width, height, to_transpose, to_slice):
if to_slice:
if to_transpose:
shape = (height, width * self.slice_step)
else:
shape = (width * self.slice_step, height)
else:
if to_transpose:
shape = (height, width)
else:
shape = (width, height)
return np.random.random(shape).astype(dtype)
def get_data(self, dtype, alpha, beta,
transpose_A=False, transpose_B=False, transpose_C=False,
slice_A=False, slice_B=False, slice_C=False):
A = self.generate_value(dtype, self.M, self.N, transpose_A, slice_A)
B = self.generate_value(dtype, self.N, self.K, transpose_B, slice_B)
C = self.generate_value(dtype, self.M, self.K, transpose_C, slice_C)
return (alpha, A, B, beta, C)
def get_value(self, V, to_transpose, to_slice):
if to_transpose:
V = V.T
if to_slice:
V = V[::self.slice_step]
return V
def compute_ref(self, alpha, A, B, beta, C,
transpose_A, transpose_B, transpose_C,
slice_A, slice_B, slice_C):
A = self.get_value(A, transpose_A, slice_A)
B = self.get_value(B, transpose_B, slice_B)
C = self.get_value(C, transpose_C, slice_C)
return alpha * np.dot(A, B) + beta * C
@theano.change_flags({'blas.ldflags': ''})
def run_gemm(self, dtype, ALPHA, BETA,
transpose_A, transpose_B, transpose_C,
slice_A, slice_B, slice_C):
f = self.get_function(dtype, transpose_A, transpose_B, transpose_C, slice_A, slice_B, slice_C)
values = self.get_data(dtype, ALPHA, BETA, transpose_A, transpose_B, transpose_C, slice_A, slice_B, slice_C)
assert any(isinstance(node.op, Gemm) for node in f.maker.fgraph.apply_nodes)
z_val = f(*values)
assert z_val.dtype == dtype
assert tuple(z_val.shape) == (self.M, self.K)
ref_val = self.compute_ref(*(values + (transpose_A, transpose_B, transpose_C, slice_A, slice_B, slice_C)))
unittest_tools.assert_allclose(ref_val, z_val)
def test_gemm(self):
dtypes = ('float32', 'float64')
scalars = (0, 1, -2)
booleans = (False, True)
# dtype, alpha, beta, transA, transB, transC, sliceA, sliceB, sliceC
iterables = [dtypes] + ([scalars] * 2) + ([booleans] * 6)
for dtype, alpha, beta, tA, tB, tC, sA, sB, sC in product(*iterables):
yield (self.run_gemm, dtype, alpha, beta, tA, tB, tC, sA, sB, sC)
def test_res_is_a(): def test_res_is_a():
X, Y, Z, a, b = XYZab() X, Y, Z, a, b = XYZab()
......
...@@ -344,7 +344,7 @@ class TestCGemvNoFlags(object): ...@@ -344,7 +344,7 @@ class TestCGemvNoFlags(object):
A_2 = A_1 A_2 = A_1
x_2 = x x_2 = x
y_2 = y y_2 = y
return theano.function([alpha, A, x, beta, y], self.gemv(y_2, alpha, A_2, x_2, beta)) return theano.function([alpha, A, x, beta, y], self.gemv(y_2, alpha, A_2, x_2, beta), mode=self.mode)
def get_data(self, dtype, alpha, beta, transpose_A=False, slice_tensors=False): def get_data(self, dtype, alpha, beta, transpose_A=False, slice_tensors=False):
if slice_tensors: if slice_tensors:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论