Commit d12a03be authored by Frédéric Bastien

Merge pull request #1936 from daemonmaker/issue1569

Added a test to determine the behavior of the BLAS and initialize the me...
--- a/theano/tensor/blas_c.py
+++ b/theano/tensor/blas_c.py
+import numpy
 from theano import config
 from theano.tensor.opt import in2out
@@ -5,6 +7,8 @@ from theano.tensor.blas import ldflags, blas_header_text, blas_header_version
 from theano.tensor.blas import blas_optdb, optdb, local_optimizer, EquilibriumOptimizer
 from theano.tensor.blas import Ger, ger, ger_destructive
 from theano.tensor.blas import Gemv, gemv_inplace, gemv_no_inplace
+from theano.tensor import basic as T
+import theano.compile
 class BaseBLAS(object):
@@ -280,13 +284,13 @@ def make_c_ger_destructive(node):
 ####### ####### #######
-def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
+def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail, force_init_beta=False):
     """
     zz <- beta * aa + alpha * dot(xx, yy)
     where xx is a matrix, yy and aa are vectors (ergo zz is vector)
     """
-    return """
+    code = """
     int elemsize ;
     float fbeta;
@@ -378,7 +382,7 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
             zoutdata[Zi*i] = fbeta * zdata[Ai*i];
         }
     }
-    else if (PyArray_DESCR(%(xx)s)->type_num == NPY_DOUBLE)
+    else if (PyArray_DESCR(%(zz)s)->type_num == NPY_DOUBLE)
     {
         double * zoutdata = (double*) PyArray_DATA(%(zz)s);
         const double * zdata = (double*)PyArray_DATA(%(aa)s);
@@ -397,6 +401,40 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
         }
         fbeta = dbeta = 1.0;
     }
+    else if (%(force_init_beta)d)
+    {
+        if (PyArray_CHKFLAGS(%(zz)s, NPY_ARRAY_C_CONTIGUOUS))
+        {
+            memset((void *)PyArray_DATA(%(zz)s), 0, PyArray_SIZE(%(zz)s)*PyArray_ITEMSIZE(%(zz)s));
+        }
+        else
+        {
+            if (PyArray_DESCR(%(zz)s)->type_num == NPY_FLOAT)
+            {
+                float *zoutdata = (float *)PyArray_DATA(%(zz)s);
+                int Zi = PyArray_STRIDES(%(zz)s)[0]/sizeof(float);
+                for (int i = 0; i < PyArray_DIMS(%(aa)s)[0]; ++i)
+                {
+                    zoutdata[Zi*i] = 0.0f;
+                }
+            }
+            else if (PyArray_DESCR(%(zz)s)->type_num == NPY_DOUBLE)
+            {
+                double *zoutdata = (double *)PyArray_DATA(%(zz)s);
+                int Zi = PyArray_STRIDES(%(zz)s)[0]/sizeof(double);
+                for (int i = 0; i < PyArray_DIMS(%(aa)s)[0]; ++i)
+                {
+                    zoutdata[Zi*i] = 0.0;
+                }
+            }
+            else
+            {
+                PyErr_SetString(PyExc_AssertionError,
+                                "neither float nor double dtype");
+                %(fail)s
+            }
+        }
+    }
 }
 else
 {
@@ -566,24 +604,81 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
     }
     }
-    """ % locals()
+    """
+    return code % locals()
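
Reviewer note: the contract that gemv_c_code emits C for can be stated in plain NumPy. This is a minimal sketch, not part of the patch (gemv_reference is an illustrative name); it also shows why uninitialized output memory is dangerous when beta is 0, since 0.0 * NaN is NaN:

import numpy

def gemv_reference(aa, alpha, xx, yy, beta):
    # zz <- beta * aa + alpha * dot(xx, yy), as in the docstring above.
    # A BLAS that evaluates beta * aa literally propagates garbage from
    # an uninitialized aa, because 0.0 * NaN == NaN.
    return beta * aa + alpha * numpy.dot(xx, yy)

aa = numpy.full(2, numpy.nan)  # stands in for uninitialized memory
print(gemv_reference(aa, 1.0, numpy.ones((2, 2)), 2 * numpy.ones(2), 0.0))
# -> [ nan  nan ] rather than [ 4.  4. ]
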
 class CGemv(BaseBLAS, Gemv):
+    def __init__(self, inplace, force_init_beta=False):
+        super(CGemv, self).__init__(inplace)
+        self.force_init_beta = force_init_beta
     def c_code(self, node, name, inp, out, sub):
         aa, alpha, xx, yy, beta = inp
         zz, = out
         code = gemv_c_code(
             aa, xx, yy, zz, alpha, beta,
             destructive=int(self.inplace),
-            fail=sub['fail'])
+            fail=sub['fail'],
+            force_init_beta=self.force_init_beta
+        )
         return code
     def c_code_cache_version(self):
-        return (10, blas_header_version())
+        return (11, blas_header_version())
 cgemv_inplace = CGemv(inplace=True)
 cgemv_no_inplace = CGemv(inplace=False)
+def check_force_gemv_init():
+    if check_force_gemv_init._force_init_beta is None:
+        """
+        Test issue 1569.
+        Namely, when evaluating
+            beta*aa + alpha*dot(xx, yy)
+        where we set aa = beta = zeros of the correct dimensions, we do not
+        actually set aa = zeros and instead let the BLAS perform beta*aa with
+        uninitialized memory for speed. Occasionally the memory contains
+        values that are equivalent to NaN, in which case the product beta*aa
+        contains NaNs for correctly implemented BLAS libraries. In this
+        situation, since we are introducing the NaNs, we need to test whether
+        the BLAS performs the multiplication. If it *does*, i.e. it actually
+        computes beta*aa, which will put NaNs in the result, then we need to
+        initialize the memory to zeros.
+        """
+        aa = T.vector('aa')
+        yy = T.vector('yy')
+        xx = T.matrix('xx')
+
+        f = theano.function(
+            [aa, yy, xx],
+            gemv_no_inplace(aa, 1., xx, yy, 0.),
+            theano.compile.Mode(optimizer='fast_compile')
+        )
+
+        # Here we introduce NaNs into the data; if they are returned by the
+        # BLAS then we want gemv_c_code to initialize the memory to 0 so
+        # that we don't inadvertently introduce NaNs into the user's data.
+        aa_data = numpy.array(
+            float('NaN')*numpy.ones((2,)),
+            dtype=theano.config.floatX
+        )
+        yy_data = numpy.array(
+            numpy.ones((2,))*2,
+            dtype=theano.config.floatX
+        )
+        xx_data = numpy.array(
+            numpy.ones((2, 2)),
+            dtype=theano.config.floatX
+        )
+        zz = f(aa_data, yy_data, xx_data)
+
+        check_force_gemv_init._force_init_beta = numpy.isnan(zz).any()
+
+    return check_force_gemv_init._force_init_beta
+check_force_gemv_init._force_init_beta = None
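
Reviewer note: the verdict is memoized on the function object itself, so the probe graph is built and compiled at most once per process. The same pattern in isolation, as a minimal sketch (probe_blas and its body are illustrative, not from the patch):

def probe_blas():
    # First call runs the (expensive) probe; later calls return the
    # cached verdict stored on the function object.
    if probe_blas._result is None:
        print("compiling probe...")  # placeholder for the real probe
        probe_blas._result = True
    return probe_blas._result

probe_blas._result = None

probe_blas()  # runs the probe
probe_blas()  # served from the cache
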
 @local_optimizer([gemv_inplace, gemv_no_inplace])
 def use_c_gemv(node):
@@ -592,7 +687,29 @@ def use_c_gemv(node):
     # Only float32 and float64 are supported for now.
     if (node.op == gemv_no_inplace and
             node.outputs[0].dtype in ['float32', 'float64']):
-        return [CGemv(inplace=False)(*node.inputs)]
+        """
+        We want to maintain the behavior of any operation that the user
+        adds, even if it results in NaNs. However, we do not want
+        optimizations to introduce NaNs.
+        GEMV is not implemented consistently across BLAS libraries.
+        Some implementations skip the multiplication by beta when beta is
+        0; others perform it. This can cause problems for the inplace GEMV
+        implementation if NaNs happen to be in the newly allocated but
+        uninitialized memory. When the multiplication is not done, we do
+        not need to initialize the output memory, resulting in a speed up.
+        Otherwise we must initialize the memory to avoid introducing NaNs
+        in the output that weren't in the original graph.
+        The following check determines whether the output memory needs to
+        be initialized. It is done here, rather than at module scope,
+        because the setup is not complete at import time, so the check
+        cannot run then.
+        """
+        force_init_beta = check_force_gemv_init()
+        return [CGemv(inplace=False, force_init_beta=force_init_beta)(*node.inputs)]
     if (node.op == gemv_inplace and
             node.outputs[0].dtype in ['float32', 'float64']):
         return [CGemv(inplace=True)(*node.inputs)]
...
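
Reviewer note: the docstring in use_c_gemv refers to two legitimate GEMV strategies found in the wild. Sketched in NumPy terms (both function names are illustrative): an implementation that special-cases beta == 0 never reads the output buffer, while a literal one does, which is exactly what check_force_gemv_init detects:

import numpy

def gemv_literal(aa, alpha, xx, yy, beta):
    # Always reads aa: NaNs in uninitialized aa leak into the result.
    return beta * aa + alpha * numpy.dot(xx, yy)

def gemv_shortcut(aa, alpha, xx, yy, beta):
    # Skips the beta scaling when beta == 0, so aa is never read and the
    # output buffer does not need to be zero-initialized.
    if beta == 0:
        return alpha * numpy.dot(xx, yy)
    return beta * aa + alpha * numpy.dot(xx, yy)
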
--- a/theano/tensor/tests/test_blas_c.py
+++ b/theano/tensor/tests/test_blas_c.py
@@ -15,6 +15,8 @@ from theano.tensor.blas_c import CGemv
 from theano.tensor.blas_scipy import ScipyGer
 from theano.tensor.blas import Gemv
+from theano.tensor.blas_c import check_force_gemv_init
 from theano.tests import unittest_tools
 from theano.tests.unittest_tools import TestOptimizationMixin
@@ -137,7 +139,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
         # Assert that the dot was optimized somehow
         self.assertFunctionContains0(f, tensor.dot)
-        self.assertFunctionContains1(f, CGemv(True))
+        self.assertFunctionContains1(
+            f,
+            CGemv(inplace=True, force_init_beta=True)
+        )
         # Assert they produce the same output
         assert numpy.allclose(f(self.xval, self.Aval),
@@ -155,7 +160,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
         # Assert that the dot was optimized somehow
         self.assertFunctionContains0(f, tensor.dot)
-        self.assertFunctionContains1(f, CGemv(True))
+        self.assertFunctionContains1(
+            f,
+            CGemv(inplace=True, force_init_beta=True)
+        )
         # Assert they produce the same output
         assert numpy.allclose(f(self.Aval, self.yval),
@@ -164,6 +172,14 @@ class TestCGemv(TestCase, TestOptimizationMixin):
         assert numpy.allclose(f(self.Aval[::-1, ::-1], self.yval),
                               numpy.dot(self.Aval[::-1, ::-1], self.yval))
+    def test_force_gemv_init(self):
+        if check_force_gemv_init():
+            sys.stderr.write(
+                "WARNING: The current BLAS requires Theano to initialize"
+                + " memory for some GEMV calls which will result in a minor"
+                + " degradation in performance for such calls."
+            )
     def t_gemv1(self, m_shp):
         ''' test vector2 + dot(matrix, vector1) '''
         rng = numpy.random.RandomState(unittest_tools.fetch_seed())
...