Commit 856b98d3 authored by Pascal Lamblin, committed by GitHub

Merge pull request #5076 from notoraptor/master

Issue 5008 fixed
/** C Implementation of [sd]gemm_ based on NumPy
* Used instead of blas when Theano config flag blas.ldflags is empty.
* This file contains the common code for [sd]gemm_.
* File alt_gemm_template.c contains template code for [sd]gemm_. **/
/* Print an optional message to stderr and abort the process.
 * The message is printed through a "%s" format string so that a message
 * containing '%' characters cannot be misinterpreted as printf directives
 * (the original passed `message` directly as the format string, which is
 * a classic format-string defect). */
#define alt_fatal_error(message) { if(message != NULL) fprintf(stderr, "%s", message); exit(-1); }
/* BLAS TRANS* argument convention: 'N'/'n' means "no transpose";
 * any other character ('T', 't', 'C', 'c', ...) requests a transposition. */
#define alt_trans_to_bool(trans) (*trans != 'N' && *trans != 'n')
/**Template code for [sd]gemm_ follows in file alt_gemm_template.c
* (as Python string to be used with old formatting).
* PARAMETERS:
* float_type: "float" for sgemm_, "double" for dgemm_.
* float_size: 4 for float32 (sgemm_), 8 for float64 (dgemm_).
* npy_float: "NPY_FLOAT32" for sgemm_, "NPY_FLOAT64" for dgemm_.
* name: "sgemm_" for sgemm_, "dgemm_" for dgemm_.
* See blas_headers.py for current use.**/
/** %(name)s **/
/* Scalar*Matrix function.
 * Computes in place, element by element: matrix = scalar * matrix.
 * Iterates with NumPy's NpyIter in external-loop mode: the iterator hands
 * back one contiguous-ish inner run at a time (base pointer, stride, count),
 * and we scale each element of that run. */
void alt_numpy_scale_matrix_inplace_%(float_type)s(const %(float_type)s* scalar, PyArrayObject* matrix) {
    NpyIter* it = NpyIter_New(matrix,
                              NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
                              NPY_KEEPORDER, NPY_NO_CASTING, NULL);
    if(it == NULL)
        alt_fatal_error("Unable to iterate over a matrix "
                        "for a scalar * matrix operation.");
    NpyIter_IterNextFunc* next = NpyIter_GetIterNext(it, NULL);
    /* These pointers are owned by the iterator and updated by next(). */
    char** base_ptr = NpyIter_GetDataPtrArray(it);
    npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(it);
    npy_intp* size_ptr = NpyIter_GetInnerLoopSizePtr(it);
    do {
        char* cursor = *base_ptr;
        npy_intp stride = *stride_ptr;
        for(npy_intp remaining = *size_ptr; remaining > 0; --remaining) {
            *((%(float_type)s*)cursor) *= *scalar;
            cursor += stride;
        }
    } while(next(it));
    NpyIter_Deallocate(it);
}
/* Matrix+Matrix function.
 * Computes in place: matrix2 = (scalar1 * matrix1) + (scalar2 * matrix2).
 * Uses a multi-operand NumPy iterator (no external loop), so each iteration
 * yields exactly one element from each matrix, visited in C order. */
void alt_numpy_matrix_extended_sum_inplace_%(float_type)s(
const %(float_type)s* scalar1, PyArrayObject* matrix1,
const %(float_type)s* scalar2, PyArrayObject* matrix2
) {
    PyArrayObject* operands[2] = {matrix1, matrix2};
    npy_uint32 operand_flags[2] = {NPY_ITER_READONLY, NPY_ITER_READWRITE};
    NpyIter* it = NpyIter_MultiNew(
        2, operands, 0, NPY_CORDER, NPY_NO_CASTING, operand_flags, NULL);
    if(it == NULL)
        alt_fatal_error("Unable to iterate over some matrices "
                        "for matrix + matrix operation.");
    NpyIter_IterNextFunc* next = NpyIter_GetIterNext(it, NULL);
    char** items = NpyIter_GetDataPtrArray(it);
    do {
        /* items[0] reads matrix1, items[1] reads and writes matrix2. */
        const %(float_type)s in1 = *(%(float_type)s*)items[0];
        %(float_type)s* out = (%(float_type)s*)items[1];
        *out = (*scalar1) * in1 + (*scalar2) * (*out);
    } while(next(it));
    NpyIter_Deallocate(it);
}
/* NumPy Wrapping function. Wraps raw data into a NumPy PyArrayObject
 * without copying (the returned array borrows M's memory).
 * By default, data is considered as a Fortran-style array (column by column).
 * If to_transpose, data is considered as a C-style array (row by row)
 * with dimensions reversed. */
PyObject* alt_op_%(float_type)s(int to_transpose, %(float_type)s* M, int nrow, int ncol, int LDM) {
    /* Byte stride along the leading dimension; %(float_size)d is the
     * element size in bytes. */
    const npy_intp ld_stride = LDM * %(float_size)d;
    const npy_intp elem_stride = %(float_size)d;
    npy_intp dims[2];
    npy_intp strides[2];
    if(to_transpose) {
        dims[0] = ncol;
        dims[1] = nrow;
        strides[0] = ld_stride;
        strides[1] = elem_stride;
    } else {
        dims[0] = nrow;
        dims[1] = ncol;
        strides[0] = elem_stride;
        strides[1] = ld_stride;
    }
    return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, M, 0, 0, NULL);
}
/* Special wrapping case used for matrix C in gemm implementation.
 * Wraps `matrix` as a writeable, Fortran-ordered (column-major)
 * (*nrow) x (*ncol) view without copying: consecutive rows are one element
 * apart, consecutive columns are (*LD) elements apart. The returned array
 * borrows `matrix`'s memory. */
inline PyObject* alt_wrap_fortran_writeable_matrix_%(float_type)s(
%(float_type)s* matrix, const int* nrow, const int* ncol, const int* LD
) {
/* Byte strides for column-major layout; %(float_size)d == sizeof(%(float_type)s). */
npy_intp dims[2] = {*nrow, *ncol};
npy_intp strides[2] = {%(float_size)d, (*LD) * %(float_size)d};
return PyArray_New(&PyArray_Type, 2, dims, %(npy_float)s, strides, matrix, 0, NPY_ARRAY_WRITEABLE, NULL);
}
/* %(name)s template code.
 *
 * Emulates the Fortran BLAS GEMM routine:
 *     C = ALPHA * op(A) * op(B) + BETA * C
 * where op(X) is X or its transpose depending on TRANSA/TRANSB, and
 * op(A) is M x K, op(B) is K x N, C is M x N, all Fortran-ordered
 * (column-major) with leading dimensions LDA, LDB, LDC.
 * The actual product is delegated to PyArray_MatrixProduct2. */
void %(name)s(
char* TRANSA, char* TRANSB, const int* M, const int* N, const int* K,
const %(float_type)s* ALPHA, %(float_type)s* A, const int* LDA,
%(float_type)s* B, const int* LDB, const %(float_type)s* BETA,
%(float_type)s* C, const int* LDC
) {
/* BLAS specifies all dimension/leading-dimension arguments as non-negative. */
if(*M < 0 || *N < 0 || *K < 0 || *LDA < 0 || *LDB < 0 || *LDC < 0)
alt_fatal_error("The integer arguments passed to %(name)s must all be at least 0.");
/* If M or N is null, there is nothing to do with C,
 * as C should contain M*N == 0 items. */
if(*M == 0 || *N == 0)
return;
/* Dimensions of A and B as laid out in memory, i.e. before the
 * transposition requested by TRANSA/TRANSB. */
int nrowa, ncola, nrowb, ncolb;
int to_transpose_A = alt_trans_to_bool(TRANSA);
int to_transpose_B = alt_trans_to_bool(TRANSB);
if(to_transpose_A) {
nrowa = *K;
ncola = *M;
} else {
nrowa = *M;
ncola = *K;
}
if(to_transpose_B) {
nrowb = *N;
ncolb = *K;
} else {
nrowb = *K;
ncolb = *N;
}
/* Destination for the product op(A)*op(B): either directly C (fast path)
 * or a temporary buffer that NumPy allocates (computation_pointer == NULL). */
int computation_flags;
void* computation_pointer;
npy_intp* computation_strides;
npy_intp computation_dims[2] = {*N, *M};
npy_intp default_computation_strides[2] = {(*LDC) * %(float_size)d, %(float_size)d};
if(*BETA == 0 && *LDC == *M) {
/* BETA == 0, so C is never read.
 * LDC == M, so C is contiguous in memory
 * (that condition is needed for the dot operation, see below).
 * Then we can compute ALPHA*op(A)*op(B) directly in C. */
computation_flags = NPY_ARRAY_WRITEABLE;
computation_pointer = C;
computation_strides = default_computation_strides;
} else {
/* Either BETA != 0 (C will be read)
 * or LDC != M (C is not read but is not contiguous in memory).
 * Then in both cases, we need to allocate a new memory
 * to compute ALPHA*op(A)*op(B). */
computation_flags = 0;
computation_pointer = NULL;
computation_strides = NULL;
}
/* The memory buffer used to compute op(A)*op(B) (either C or
 * new allocated buffer) will be considered as C-contiguous because
 * the 3rd parameter of PyArray_MatrixProduct2 (used below)
 * expects a C-contiguous array.
 * Also, to avoid some memory copy, transposition conditions
 * for A and B will be reversed, so that the buffer will contain
 * C-contiguous opB_transposed * opA_transposed (N*M matrix).
 * After that, the code that uses the buffer (either the code calling
 * this function, or this function if BETA != 0) just has to
 * consider the buffer as a F-contiguous M*N matrix, so that
 * it will get the transposed of op_B_transposed * op_A_transposed,
 * that is op_A * op_B (M*N matrix) as expected. */
PyObject* opA_transposed = alt_op_%(float_type)s(!to_transpose_A, A, nrowa, ncola, *LDA);
PyObject* opB_transposed = alt_op_%(float_type)s(!to_transpose_B, B, nrowb, ncolb, *LDB);
/* NOTE(review): the alt_op_* / PyArray_New results are not checked for
 * NULL before being passed to PyArray_MatrixProduct2 — confirm whether a
 * NumPy allocation failure can reach this point in practice. */
PyObject* opB_trans_dot_opA_trans = PyArray_New(&PyArray_Type, 2, computation_dims, %(npy_float)s, computation_strides, computation_pointer, 0, computation_flags, NULL);
PyArray_MatrixProduct2(opB_transposed, opA_transposed, (PyArrayObject*)opB_trans_dot_opA_trans);
if(*BETA == 0) {
/* BETA == 0: the product is already in C (fast path) or in the
 * temporary buffer; only ALPHA scaling and, possibly, a copy remain. */
if(*ALPHA != 1.0)
alt_numpy_scale_matrix_inplace_%(float_type)s(ALPHA, (PyArrayObject*)opB_trans_dot_opA_trans);
if(*LDC != *M) {
/* A buffer has been created to compute ALPHA*op(A)*op(B),
 * so we must copy it to the real output, that is C. */
PyObject* matrix_C = alt_wrap_fortran_writeable_matrix_%(float_type)s(C, M, N, LDC);
PyObject* alpha_opA_dot_opB = PyArray_Transpose((PyArrayObject*)opB_trans_dot_opA_trans, NULL);
if(0 != PyArray_CopyInto((PyArrayObject*)matrix_C, (PyArrayObject*)alpha_opA_dot_opB))
alt_fatal_error("NumPy %(name)s implementation: unable to copy ALPHA*op(A)*op(B) into C when BETA == 0.");
Py_XDECREF(alpha_opA_dot_opB);
Py_XDECREF(matrix_C);
}
} else {
/* C is read, so we must consider it as Fortran-style matrix. */
PyObject* matrix_C = alt_wrap_fortran_writeable_matrix_%(float_type)s(C, M, N, LDC);
PyObject* opA_dot_opB = PyArray_Transpose((PyArrayObject*)opB_trans_dot_opA_trans, NULL);
alt_numpy_matrix_extended_sum_inplace_%(float_type)s(ALPHA, (PyArrayObject*)opA_dot_opB, BETA, (PyArrayObject*)matrix_C);
Py_XDECREF(opA_dot_opB);
Py_XDECREF(matrix_C);
}
/* Release the wrapper views; the underlying A/B/C memory is caller-owned. */
Py_XDECREF(opB_trans_dot_opA_trans);
Py_XDECREF(opB_transposed);
Py_XDECREF(opA_transposed);
}
...@@ -1037,10 +1037,6 @@ class Gemm(GemmRelated): ...@@ -1037,10 +1037,6 @@ class Gemm(GemmRelated):
if node.inputs[0].type.dtype.startswith('complex'): if node.inputs[0].type.dtype.startswith('complex'):
raise utils.MethodNotDefined('%s.c_code' raise utils.MethodNotDefined('%s.c_code'
% self.__class__.__name__) % self.__class__.__name__)
if not config.blas.ldflags:
return super(Gemm, self).c_code(node, name,
(_z, _a, _x, _y, _b), (_zout, ),
sub)
full_code = self.build_gemm_call() % dict(locals(), **sub) full_code = self.build_gemm_call() % dict(locals(), **sub)
return full_code return full_code
...@@ -2154,10 +2150,6 @@ class BatchedDot(Op): ...@@ -2154,10 +2150,6 @@ class BatchedDot(Op):
_z, = out _z, = out
fail = sub["fail"] fail = sub["fail"]
if not config.blas.ldflags:
return super(BatchedDot, self).c_code(node, name,
inp, out, sub)
# generate contiguity condition # generate contiguity condition
def contiguous(var, ndim): def contiguous(var, ndim):
strides = "PyArray_STRIDES(%s)" % var strides = "PyArray_STRIDES(%s)" % var
......
...@@ -9,6 +9,7 @@ import logging ...@@ -9,6 +9,7 @@ import logging
import textwrap import textwrap
import sys import sys
import os import os
from os.path import dirname, normpath
from theano import config from theano import config
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
...@@ -729,6 +730,31 @@ def cblas_header_text(): ...@@ -729,6 +730,31 @@ def cblas_header_text():
def blas_header_text(): def blas_header_text():
"""C header for the fortran blas interface""" """C header for the fortran blas interface"""
gemm_code = ""
const = "const"
if not config.blas.ldflags:
# Include the Numpy version implementation of [sd]gemm_.
current_filedir = dirname(__file__)
gemm_common_filepath = normpath(current_filedir + "/alt_gemm_common.c")
gemm_template_filepath = normpath(current_filedir + "/alt_gemm_template.c")
common_code = ""
sgemm_code = ""
dgemm_code = ""
with open(gemm_common_filepath) as code:
common_code = code.read()
with open(gemm_template_filepath) as code:
template_code = code.read()
sgemm_code = template_code % {"float_type": "float", "float_size": 4, "npy_float": "NPY_FLOAT32", "name": "sgemm_"}
dgemm_code = template_code % {"float_type": "double", "float_size": 8, "npy_float": "NPY_FLOAT64", "name": "dgemm_"}
if not common_code or not sgemm_code:
raise IOError("Unable to load NumPy implementation of gemm code from C source files.")
else:
const = ""
gemm_code += common_code
gemm_code += sgemm_code
gemm_code += dgemm_code
header = """ header = """
extern "C" extern "C"
{ {
...@@ -890,7 +916,7 @@ def blas_header_text(): ...@@ -890,7 +916,7 @@ def blas_header_text():
/* Single Precision */ /* Single Precision */
void sgemm_(char*, char*, const int*, const int*, const int*, const float *, const float *, const int*, const float *, const int*, const float *, float *, const int*); void sgemm_(char*, char*, const int*, const int*, const int*, const float *, %(const)s float *, const int*, %(const)s float *, const int*, const float *, float *, const int*);
void ssymm_(char*, char*, const int*, const int*, const float *, const float *, const int*, const float *, const int*, const float *, float *, const int*); void ssymm_(char*, char*, const int*, const int*, const float *, const float *, const int*, const float *, const int*, const float *, float *, const int*);
void ssyrk_(char*, char*, const int*, const int*, const float *, const float *, const int*, const float *, float *, const int*); void ssyrk_(char*, char*, const int*, const int*, const float *, const float *, const int*, const float *, float *, const int*);
void ssyr2k_(char*, char*, const int*, const int*, const float *, const float *, const int*, const float *, const int*, const float *, float *, const int*); void ssyr2k_(char*, char*, const int*, const int*, const float *, const float *, const int*, const float *, const int*, const float *, float *, const int*);
...@@ -899,7 +925,7 @@ def blas_header_text(): ...@@ -899,7 +925,7 @@ def blas_header_text():
/* Double Precision */ /* Double Precision */
void dgemm_(char*, char*, const int*, const int*, const int*, const double *, const double *, const int*, const double *, const int*, const double *, double *, const int*); void dgemm_(char*, char*, const int*, const int*, const int*, const double *, %(const)s double *, const int*, %(const)s double *, const int*, const double *, double *, const int*);
void dsymm_(char*, char*, const int*, const int*, const double *, const double *, const int*, const double *, const int*, const double *, double *, const int*); void dsymm_(char*, char*, const int*, const int*, const double *, const double *, const int*, const double *, const int*, const double *, double *, const int*);
void dsyrk_(char*, char*, const int*, const int*, const double *, const double *, const int*, const double *, double *, const int*); void dsyrk_(char*, char*, const int*, const int*, const double *, const double *, const int*, const double *, double *, const int*);
void dsyr2k_(char*, char*, const int*, const int*, const double *, const double *, const int*, const double *, const int*, const double *, double *, const int*); void dsyr2k_(char*, char*, const int*, const int*, const double *, const double *, const int*, const double *, const int*, const double *, double *, const int*);
...@@ -958,7 +984,7 @@ def blas_header_text(): ...@@ -958,7 +984,7 @@ def blas_header_text():
} }
""") """)
return header return (header % {'const': const}) + gemm_code
def mkl_threads_text(): def mkl_threads_text():
......
...@@ -63,7 +63,8 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -63,7 +63,8 @@ class BaseCorrMM(gof.OpenMPOp):
self.filter_dilation = tuple(filter_dilation) self.filter_dilation = tuple(filter_dilation)
if not theano.config.blas.ldflags: if not theano.config.blas.ldflags:
raise NotImplementedError("C code for corrMM* classes need a blas library.") # Theano will use a NumPy C implementation of [sd]gemm_ instead.
self.blas_type = ''
else: else:
if 'openblas' in theano.config.blas.ldflags: if 'openblas' in theano.config.blas.ldflags:
self.blas_type = 'openblas' self.blas_type = 'openblas'
......
...@@ -63,7 +63,8 @@ class BaseCorr3dMM(gof.OpenMPOp): ...@@ -63,7 +63,8 @@ class BaseCorr3dMM(gof.OpenMPOp):
self.filter_dilation = tuple(filter_dilation) self.filter_dilation = tuple(filter_dilation)
if not theano.config.blas.ldflags: if not theano.config.blas.ldflags:
raise NotImplementedError("C code for corrMM* classes need a blas library.") # Theano will use a NumPy C implementation of [sd]gemm_ instead.
self.blas_type = ''
else: else:
if 'openblas' in theano.config.blas.ldflags: if 'openblas' in theano.config.blas.ldflags:
self.blas_type = 'openblas' self.blas_type = 'openblas'
......
...@@ -72,7 +72,9 @@ compile.optdb.register('local_inplace_sparse_block_outer', ...@@ -72,7 +72,9 @@ compile.optdb.register('local_inplace_sparse_block_outer',
# Conv opts # Conv opts
@local_optimizer([AbstractConv2d]) @local_optimizer([AbstractConv2d])
def local_abstractconv_gemm(node): def local_abstractconv_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags: # If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return return
if not isinstance(node.op, AbstractConv2d): if not isinstance(node.op, AbstractConv2d):
return None return None
...@@ -94,7 +96,9 @@ def local_abstractconv_gemm(node): ...@@ -94,7 +96,9 @@ def local_abstractconv_gemm(node):
@local_optimizer([AbstractConv3d]) @local_optimizer([AbstractConv3d])
def local_abstractconv3d_gemm(node): def local_abstractconv3d_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags: # If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return return
if not isinstance(node.op, AbstractConv3d): if not isinstance(node.op, AbstractConv3d):
return None return None
...@@ -116,7 +120,9 @@ def local_abstractconv3d_gemm(node): ...@@ -116,7 +120,9 @@ def local_abstractconv3d_gemm(node):
@local_optimizer([AbstractConv2d_gradWeights]) @local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweight_gemm(node): def local_abstractconv_gradweight_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags: # If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return return
if not isinstance(node.op, AbstractConv2d_gradWeights): if not isinstance(node.op, AbstractConv2d_gradWeights):
return None return None
...@@ -141,7 +147,9 @@ def local_abstractconv_gradweight_gemm(node): ...@@ -141,7 +147,9 @@ def local_abstractconv_gradweight_gemm(node):
@local_optimizer([AbstractConv3d_gradWeights]) @local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweight_gemm(node): def local_abstractconv3d_gradweight_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags: # If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return return
if not isinstance(node.op, AbstractConv3d_gradWeights): if not isinstance(node.op, AbstractConv3d_gradWeights):
return None return None
...@@ -166,7 +174,9 @@ def local_abstractconv3d_gradweight_gemm(node): ...@@ -166,7 +174,9 @@ def local_abstractconv3d_gradweight_gemm(node):
@local_optimizer([AbstractConv2d_gradInputs]) @local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(node): def local_abstractconv_gradinputs_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags: # If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return return
if not isinstance(node.op, AbstractConv2d_gradInputs): if not isinstance(node.op, AbstractConv2d_gradInputs):
return None return None
...@@ -189,7 +199,9 @@ def local_abstractconv_gradinputs_gemm(node): ...@@ -189,7 +199,9 @@ def local_abstractconv_gradinputs_gemm(node):
@local_optimizer([AbstractConv3d_gradInputs]) @local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(node): def local_abstractconv3d_gradinputs_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags: # If theano.config.blas.ldflags is empty, Theano will use
# a NumPy C implementation of [sd]gemm_.
if theano.config.cxx == "":
return return
if not isinstance(node.op, AbstractConv3d_gradInputs): if not isinstance(node.op, AbstractConv3d_gradInputs):
return None return None
...@@ -603,6 +615,5 @@ def local_abstractconv_check(node): ...@@ -603,6 +615,5 @@ def local_abstractconv_check(node):
node.op.__class__.__name__) node.op.__class__.__name__)
optdb.register('AbstractConvCheck', optdb.register('AbstractConvCheck',
opt.in2out(local_abstractconv_check, opt.in2out(local_abstractconv_check, name="AbstractConvCheck"),
name="AbstractConvCheck"),
48.7, 'fast_compile', 'fast_run') 48.7, 'fast_compile', 'fast_run')
...@@ -363,8 +363,7 @@ class BaseTestConv(object): ...@@ -363,8 +363,7 @@ class BaseTestConv(object):
class BaseTestConv2d(BaseTestConv): class BaseTestConv2d(BaseTestConv):
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
if theano.config.blas.ldflags == '': # This tests can run even when theano.config.blas.ldflags is empty.
raise SkipTest("BLAS required for reference")
cls.inputs_shapes = [(8, 1, 6, 6), (8, 1, 8, 8), (2, 1, 7, 7), cls.inputs_shapes = [(8, 1, 6, 6), (8, 1, 8, 8), (2, 1, 7, 7),
(6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)] (6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
cls.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3), cls.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
...@@ -414,14 +413,13 @@ class BaseTestConv2d(BaseTestConv): ...@@ -414,14 +413,13 @@ class BaseTestConv2d(BaseTestConv):
class TestCorrConv2d(BaseTestConv2d): class TestCorrConv2d(BaseTestConv2d):
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
if theano.config.blas.ldflags == "": # This tests can run even when theano.config.blas.ldflags is empty.
raise SkipTest()
BaseTestConv2d.setup_class() BaseTestConv2d.setup_class()
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)): def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
o = self.get_output_shape(i, f, s, b, fd) o = self.get_output_shape(i, f, s, b, fd)
if (not theano.config.blas.ldflags or # This tests can run even when theano.config.blas.ldflags is empty.
not theano.config.cxx or if (not theano.config.cxx or
theano.config.mode == "FAST_COMPILE"): theano.config.mode == "FAST_COMPILE"):
raise SkipTest("Need blas to test conv2d") raise SkipTest("Need blas to test conv2d")
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s, self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
...@@ -444,8 +442,7 @@ class TestCorrConv2d(BaseTestConv2d): ...@@ -444,8 +442,7 @@ class TestCorrConv2d(BaseTestConv2d):
class TestAbstractConvNoOptim(BaseTestConv2d): class TestAbstractConvNoOptim(BaseTestConv2d):
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
if theano.config.blas.ldflags == "": # This tests can run even when theano.config.blas.ldflags is empty.
raise SkipTest()
BaseTestConv2d.setup_class() BaseTestConv2d.setup_class()
cls.inputs_shapes = [(8, 1, 6, 6)] cls.inputs_shapes = [(8, 1, 6, 6)]
cls.filters_shapes = [(5, 1, 2, 2)] cls.filters_shapes = [(5, 1, 2, 2)]
...@@ -518,8 +515,7 @@ class TestCpuConv2d(BaseTestConv2d): ...@@ -518,8 +515,7 @@ class TestCpuConv2d(BaseTestConv2d):
gradinput_OK = False gradinput_OK = False
if fwd_OK: if fwd_OK:
if not theano.config.blas.ldflags: # This test can run even when theano.config.blas.ldflags is empty.
raise SkipTest("Need blas to test conv2d")
self.run_fwd(inputs_shape=i, filters_shape=f, self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=(gradweight_OK and gradinput_OK), subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
mode=mode, provide_shape=provide_shape, mode=mode, provide_shape=provide_shape,
...@@ -541,8 +537,7 @@ class TestCpuConv2d(BaseTestConv2d): ...@@ -541,8 +537,7 @@ class TestCpuConv2d(BaseTestConv2d):
filter_dilation=fd) filter_dilation=fd)
if gradweight_OK: if gradweight_OK:
if not theano.config.blas.ldflags: # This test can run even when theano.config.blas.ldflags is empty.
raise SkipTest("Need blas to test conv2d")
self.run_gradweight(inputs_shape=i, filters_shape=f, self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, output_shape=o, subsample=s,
verify_grad=False, mode=mode, verify_grad=False, mode=mode,
...@@ -567,8 +562,7 @@ class TestCpuConv2d(BaseTestConv2d): ...@@ -567,8 +562,7 @@ class TestCpuConv2d(BaseTestConv2d):
filter_dilation=fd) filter_dilation=fd)
if gradinput_OK: if gradinput_OK:
if not theano.config.blas.ldflags: # This test can run even when theano.config.blas.ldflags is empty.
raise SkipTest("Need blas to test conv2d")
self.run_gradinput(inputs_shape=i, filters_shape=f, self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, output_shape=o, subsample=s,
verify_grad=False, mode=mode, verify_grad=False, mode=mode,
...@@ -596,8 +590,7 @@ class TestCpuConv2d(BaseTestConv2d): ...@@ -596,8 +590,7 @@ class TestCpuConv2d(BaseTestConv2d):
class BaseTestConv3d(BaseTestConv): class BaseTestConv3d(BaseTestConv):
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
if theano.config.blas.ldflags == '': # This tests can run even when theano.config.blas.ldflags is empty.
raise SkipTest("BLAS required for reference")
cls.inputs_shapes = [(2, 1, 5, 5, 5), (1, 2, 7, 5, 6)] cls.inputs_shapes = [(2, 1, 5, 5, 5), (1, 2, 7, 5, 6)]
cls.filters_shapes = [(2, 1, 2, 2, 2), (1, 2, 2, 1, 3)] cls.filters_shapes = [(2, 1, 2, 2, 2), (1, 2, 2, 1, 3)]
cls.subsamples = [(1, 1, 1), (2, 2, 2), (1, 2, 3)] cls.subsamples = [(1, 1, 1), (2, 2, 2), (1, 2, 3)]
...@@ -645,14 +638,13 @@ class BaseTestConv3d(BaseTestConv): ...@@ -645,14 +638,13 @@ class BaseTestConv3d(BaseTestConv):
class TestCorrConv3d(BaseTestConv3d): class TestCorrConv3d(BaseTestConv3d):
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
if theano.config.blas.ldflags == "": # This tests can run even when theano.config.blas.ldflags is empty.
raise SkipTest()
BaseTestConv3d.setup_class() BaseTestConv3d.setup_class()
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)): def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
o = self.get_output_shape(i, f, s, b, fd) o = self.get_output_shape(i, f, s, b, fd)
if (not theano.config.blas.ldflags or # This test can run even when theano.config.blas.ldflags is empty.
not theano.config.cxx or if (not theano.config.cxx or
theano.config.mode == "FAST_COMPILE"): theano.config.mode == "FAST_COMPILE"):
raise SkipTest("Need blas to test conv3d") raise SkipTest("Need blas to test conv3d")
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s, self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
...@@ -699,8 +691,7 @@ class TestCpuConv3d(BaseTestConv3d): ...@@ -699,8 +691,7 @@ class TestCpuConv3d(BaseTestConv3d):
gradinput_OK = False gradinput_OK = False
if fwd_OK: if fwd_OK:
if not theano.config.blas.ldflags: # This test can run even when theano.config.blas.ldflags is empty.
raise SkipTest("Need blas to test conv3d")
self.run_fwd(inputs_shape=i, filters_shape=f, self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=(gradweight_OK and gradinput_OK), subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
mode=mode, provide_shape=provide_shape, mode=mode, provide_shape=provide_shape,
...@@ -722,8 +713,7 @@ class TestCpuConv3d(BaseTestConv3d): ...@@ -722,8 +713,7 @@ class TestCpuConv3d(BaseTestConv3d):
filter_dilation=fd) filter_dilation=fd)
if gradweight_OK: if gradweight_OK:
if not theano.config.blas.ldflags: # This test can run even when theano.config.blas.ldflags is empty.
raise SkipTest("Need blas to test conv3d")
self.run_gradweight(inputs_shape=i, filters_shape=f, self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, output_shape=o, subsample=s,
verify_grad=False, mode=mode, verify_grad=False, mode=mode,
...@@ -748,8 +738,7 @@ class TestCpuConv3d(BaseTestConv3d): ...@@ -748,8 +738,7 @@ class TestCpuConv3d(BaseTestConv3d):
filter_dilation=fd) filter_dilation=fd)
if gradinput_OK: if gradinput_OK:
if not theano.config.blas.ldflags: # This test can run even when theano.config.blas.ldflags is empty.
raise SkipTest("Need blas to test conv3d")
self.run_gradinput(inputs_shape=i, filters_shape=f, self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, output_shape=o, subsample=s,
verify_grad=False, mode=mode, verify_grad=False, mode=mode,
...@@ -913,13 +902,13 @@ class TestConvTypes(unittest.TestCase): ...@@ -913,13 +902,13 @@ class TestConvTypes(unittest.TestCase):
class TestBilinearUpsampling(unittest.TestCase): class TestBilinearUpsampling(unittest.TestCase):
# If BLAS is not available on CPU, then we accept the fallback to the # If theano.config.blas.ldflags is empty, Theano will use
# slow Python implementation for that test. # a NumPy C implementation of [sd]gemm_.
compile_mode = theano.compile.mode.get_default_mode() compile_mode = theano.compile.mode.get_default_mode()
if theano.config.mode == "FAST_COMPILE": if theano.config.mode == "FAST_COMPILE":
compile_mode = compile_mode.excluding("conv_gemm") compile_mode = compile_mode.excluding("conv_gemm")
compile_mode = compile_mode.excluding('AbstractConvCheck') compile_mode = compile_mode.excluding('AbstractConvCheck')
elif not theano.config.blas.ldflags or not theano.config.cxx: elif not theano.config.cxx:
compile_mode = compile_mode.excluding('AbstractConvCheck') compile_mode = compile_mode.excluding('AbstractConvCheck')
def numerical_kernel_1D(self, ratio): def numerical_kernel_1D(self, ratio):
......
...@@ -27,8 +27,7 @@ class TestCorr2D(utt.InferShapeTester): ...@@ -27,8 +27,7 @@ class TestCorr2D(utt.InferShapeTester):
self.filters.name = 'default_filters' self.filters.name = 'default_filters'
if not conv.imported_scipy_signal and theano.config.cxx == "": if not conv.imported_scipy_signal and theano.config.cxx == "":
raise SkipTest("CorrMM tests need SciPy or a c++ compiler") raise SkipTest("CorrMM tests need SciPy or a c++ compiler")
if not theano.config.blas.ldflags: # This tests can run even when theano.config.blas.ldflags is empty.
raise SkipTest("CorrMM tests need a BLAS")
def validate(self, image_shape, filter_shape, def validate(self, image_shape, filter_shape,
border_mode='valid', subsample=(1, 1), border_mode='valid', subsample=(1, 1),
...@@ -131,7 +130,7 @@ class TestCorr2D(utt.InferShapeTester): ...@@ -131,7 +130,7 @@ class TestCorr2D(utt.InferShapeTester):
icol:icol + dil_fil_shape2d[1]:filter_dilation[1]] * filter2d[::-1, ::-1] icol:icol + dil_fil_shape2d[1]:filter_dilation[1]] * filter2d[::-1, ::-1]
).sum() ).sum()
utt.assert_allclose(theano_output, ref_output) utt.assert_allclose(ref_output, theano_output)
# TEST GRADIENT # TEST GRADIENT
if verify_grad: if verify_grad:
......
...@@ -27,8 +27,7 @@ class TestCorr3D(utt.InferShapeTester): ...@@ -27,8 +27,7 @@ class TestCorr3D(utt.InferShapeTester):
self.filters.name = 'default_filters' self.filters.name = 'default_filters'
if not conv.imported_scipy_signal and theano.config.cxx == "": if not conv.imported_scipy_signal and theano.config.cxx == "":
raise SkipTest("Corr3dMM tests need SciPy or a c++ compiler") raise SkipTest("Corr3dMM tests need SciPy or a c++ compiler")
if not theano.config.blas.ldflags: # This tests can run even when theano.config.blas.ldflags is empty.
raise SkipTest("Corr3dMM tests need a BLAS")
def validate(self, image_shape, filter_shape, def validate(self, image_shape, filter_shape,
border_mode='valid', subsample=(1, 1, 1), border_mode='valid', subsample=(1, 1, 1),
......
...@@ -95,10 +95,11 @@ class t_gemm(TestCase): ...@@ -95,10 +95,11 @@ class t_gemm(TestCase):
cmp_linker(copy(z), a, x, y, b, 'c|py') cmp_linker(copy(z), a, x, y, b, 'c|py')
cmp_linker(copy(z), a, x, y, b, 'py') cmp_linker(copy(z), a, x, y, b, 'py')
if (config.blas.ldflags and not dtype.startswith("complex")
if (not dtype.startswith("complex")
and theano.config.cxx): and theano.config.cxx):
# If blas.ldflags is equal to '', the C code will not # If theano.config.blas.ldflags is empty, Theano will use
# be generated # a NumPy C implementation of [sd]gemm_.
cmp_linker(copy(z), a, x, y, b, 'c') cmp_linker(copy(z), a, x, y, b, 'c')
def test0a(self): def test0a(self):
...@@ -2160,6 +2161,24 @@ class TestBlasStrides(TestCase): ...@@ -2160,6 +2161,24 @@ class TestBlasStrides(TestCase):
self.cmp_ger((1, 0), 1, 0) self.cmp_ger((1, 0), 1, 0)
self.cmp_ger((0, 0), 0, 0) self.cmp_ger((0, 0), 0, 0)
def test_gemm_non_contiguous(self):
"""test_gemm_non_contiguous: Test if GEMM works well with non-contiguous matrices."""
aval = numpy.ones((6, 2))
bval = numpy.ones((2, 7))
cval = numpy.arange(7) + numpy.arange(0, .6, .1)[:, numpy.newaxis]
a = theano.shared(aval[:3], borrow=True)
b = theano.shared(bval[:, :5], borrow=True)
c = theano.shared(cval[:3, :5], borrow=True)
s = theano.tensor.scalar()
upd_c = s * c + theano.tensor.dot(a, b)
f = theano.function([s], [], updates={c: upd_c})
f(0)
ref_output = numpy.ones((3, 5)) * 2
unittest_tools.assert_allclose(c.get_value(), ref_output)
class test_infer_shape(unittest_tools.InferShapeTester): class test_infer_shape(unittest_tools.InferShapeTester):
def test_dot22(self): def test_dot22(self):
......
...@@ -26,13 +26,18 @@ mode_blas_opt = theano.compile.get_default_mode().including( ...@@ -26,13 +26,18 @@ mode_blas_opt = theano.compile.get_default_mode().including(
'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas') 'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas')
def skip_if_blas_ldflags_empty(*functions_detected):
    """Skip the current test when Theano is not linked against a BLAS library.

    When ``theano.config.blas.ldflags`` is empty, Theano falls back to a
    NumPy-based C implementation of [sd]gemm_ only, so tests that need real
    BLAS functions beyond [sd]gemm_ must be skipped.

    :param functions_detected: optional names of the BLAS functions the test
        relies on; they are appended to the skip message for clarity.
    :raises SkipTest: if ``theano.config.blas.ldflags`` is empty.
    """
    if theano.config.blas.ldflags == "":
        functions_string = ""
        if functions_detected:
            functions_string = " (at least " + ", ".join(functions_detected) + ")"
        # Fixed grammar of the original message ("can access to BLAS").
        raise SkipTest("This test is useful only when Theano can access "
                       "BLAS functions" + functions_string +
                       " other than [sd]gemm_.")
class TestCGer(TestCase, TestOptimizationMixin): class TestCGer(TestCase, TestOptimizationMixin):
def setUp(self, dtype='float64'): def setUp(self, dtype='float64'):
if theano.config.blas.ldflags == "": # This tests can run even when theano.config.blas.ldflags is empty.
raise SkipTest("This test is useful only when Theano"
" is directly linked to blas.")
self.dtype = dtype self.dtype = dtype
self.mode = theano.compile.get_default_mode().including('fast_run') self.mode = theano.compile.get_default_mode().including('fast_run')
self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False)) self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
...@@ -76,11 +81,13 @@ class TestCGer(TestCase, TestOptimizationMixin): ...@@ -76,11 +81,13 @@ class TestCGer(TestCase, TestOptimizationMixin):
self.assertTrue(hash(CGer(False)) != hash(CGer(True))) self.assertTrue(hash(CGer(False)) != hash(CGer(True)))
def test_optimization_pipeline(self): def test_optimization_pipeline(self):
skip_if_blas_ldflags_empty()
f = self.function([self.x, self.y], tensor.outer(self.x, self.y)) f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
self.assertFunctionContains(f, CGer(destructive=True)) self.assertFunctionContains(f, CGer(destructive=True))
f(self.xval, self.yval) # DebugMode tests correctness f(self.xval, self.yval) # DebugMode tests correctness
def test_optimization_pipeline_float(self): def test_optimization_pipeline_float(self):
skip_if_blas_ldflags_empty()
self.setUp('float32') self.setUp('float32')
f = self.function([self.x, self.y], tensor.outer(self.x, self.y)) f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
self.assertFunctionContains(f, CGer(destructive=True)) self.assertFunctionContains(f, CGer(destructive=True))
...@@ -93,12 +100,14 @@ class TestCGer(TestCase, TestOptimizationMixin): ...@@ -93,12 +100,14 @@ class TestCGer(TestCase, TestOptimizationMixin):
self.assertFunctionContains0(f, CGer(destructive=False)) self.assertFunctionContains0(f, CGer(destructive=False))
def test_A_plus_outer(self): def test_A_plus_outer(self):
skip_if_blas_ldflags_empty()
f = self.function([self.A, self.x, self.y], f = self.function([self.A, self.x, self.y],
self.A + tensor.outer(self.x, self.y)) self.A + tensor.outer(self.x, self.y))
self.assertFunctionContains(f, CGer(destructive=False)) self.assertFunctionContains(f, CGer(destructive=False))
self.run_f(f) # DebugMode tests correctness self.run_f(f) # DebugMode tests correctness
def test_A_plus_scaled_outer(self): def test_A_plus_scaled_outer(self):
skip_if_blas_ldflags_empty()
f = self.function([self.A, self.x, self.y], f = self.function([self.A, self.x, self.y],
self.A + 0.1 * tensor.outer(self.x, self.y)) self.A + 0.1 * tensor.outer(self.x, self.y))
self.assertFunctionContains(f, CGer(destructive=False)) self.assertFunctionContains(f, CGer(destructive=False))
...@@ -113,9 +122,7 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -113,9 +122,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
""" """
def setUp(self, dtype='float64'): def setUp(self, dtype='float64'):
if theano.config.blas.ldflags == "": # This tests can run even when theano.config.blas.ldflags is empty.
raise SkipTest("This test is useful only when Theano"
" is directly linked to blas.")
self.dtype = dtype self.dtype = dtype
self.mode = theano.compile.get_default_mode().including('fast_run') self.mode = theano.compile.get_default_mode().including('fast_run')
# matrix # matrix
...@@ -144,6 +151,7 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -144,6 +151,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
assert not numpy.isnan(zval).any() assert not numpy.isnan(zval).any()
def test_optimizations_vm(self): def test_optimizations_vm(self):
skip_if_blas_ldflags_empty()
''' Test vector dot matrix ''' ''' Test vector dot matrix '''
f = theano.function([self.x, self.A], f = theano.function([self.x, self.A],
theano.dot(self.x, self.A), theano.dot(self.x, self.A),
...@@ -165,6 +173,7 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -165,6 +173,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
numpy.dot(self.xval, self.Aval[::-1, ::-1])) numpy.dot(self.xval, self.Aval[::-1, ::-1]))
def test_optimizations_mv(self): def test_optimizations_mv(self):
skip_if_blas_ldflags_empty()
''' Test matrix dot vector ''' ''' Test matrix dot vector '''
f = theano.function([self.A, self.y], f = theano.function([self.A, self.y],
theano.dot(self.A, self.y), theano.dot(self.A, self.y),
...@@ -235,6 +244,7 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -235,6 +244,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
numpy.dot(m.get_value(), v1.get_value()) + v2_orig) numpy.dot(m.get_value(), v1.get_value()) + v2_orig)
def test_gemv1(self): def test_gemv1(self):
skip_if_blas_ldflags_empty()
self.t_gemv1((3, 2)) self.t_gemv1((3, 2))
self.t_gemv1((1, 2)) self.t_gemv1((1, 2))
self.t_gemv1((0, 2)) self.t_gemv1((0, 2))
...@@ -269,6 +279,7 @@ class TestCGemv(TestCase, TestOptimizationMixin): ...@@ -269,6 +279,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
self.assertRaises(ValueError, f, A_val, ones_4, ones_6) self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
def test_multiple_inplace(self): def test_multiple_inplace(self):
skip_if_blas_ldflags_empty()
x = tensor.dmatrix('x') x = tensor.dmatrix('x')
y = tensor.dvector('y') y = tensor.dvector('y')
z = tensor.dvector('z') z = tensor.dvector('z')
...@@ -292,9 +303,7 @@ class TestCGemvFloat32(TestCase, BaseGemv, TestOptimizationMixin): ...@@ -292,9 +303,7 @@ class TestCGemvFloat32(TestCase, BaseGemv, TestOptimizationMixin):
gemv_inplace = CGemv(inplace=True) gemv_inplace = CGemv(inplace=True)
def setUp(self): def setUp(self):
if theano.config.blas.ldflags == "": skip_if_blas_ldflags_empty()
raise SkipTest("This test is useful only when Theano"
" is directly linked to blas.")
class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin): class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
...@@ -304,9 +313,7 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin): ...@@ -304,9 +313,7 @@ class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
gemv_inplace = CGemv(inplace=True) gemv_inplace = CGemv(inplace=True)
def setUp(self): def setUp(self):
if theano.config.blas.ldflags == "": skip_if_blas_ldflags_empty()
raise SkipTest("This test is useful only when Theano"
" is directly linked to blas.")
class TestBlasStridesC(TestBlasStrides): class TestBlasStridesC(TestBlasStrides):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论