Commit d12a03be authored by Frédéric Bastien

Merge pull request #1936 from daemonmaker/issue1569

Added a test to determine the behavior of the BLAS and initialize the me...
--- a/theano/tensor/blas_c.py
+++ b/theano/tensor/blas_c.py
+import numpy
 from theano import config
 from theano.tensor.opt import in2out
@@ -5,6 +7,8 @@ from theano.tensor.blas import ldflags, blas_header_text, blas_header_version
 from theano.tensor.blas import blas_optdb, optdb, local_optimizer, EquilibriumOptimizer
 from theano.tensor.blas import Ger, ger, ger_destructive
 from theano.tensor.blas import Gemv, gemv_inplace, gemv_no_inplace
+from theano.tensor import basic as T
+import theano.compile
 class BaseBLAS(object):
@@ -280,13 +284,13 @@ def make_c_ger_destructive(node):
 ####### ####### #######
-def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
+def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail, force_init_beta=False):
     """
     zz <- beta * aa + alpha * dot(xx, yy)
     where xx is a matrix, yy and aa are vectors (ergo zz is vector)
     """
-    return """
+    code = """
     int elemsize ;
     float fbeta;
@@ -378,7 +382,7 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
             zoutdata[Zi*i] = fbeta * zdata[Ai*i];
         }
     }
-    else if (PyArray_DESCR(%(xx)s)->type_num == NPY_DOUBLE)
+    else if (PyArray_DESCR(%(zz)s)->type_num == NPY_DOUBLE)
     {
         double * zoutdata = (double*) PyArray_DATA(%(zz)s);
         const double * zdata = (double*)PyArray_DATA(%(aa)s);
@@ -397,6 +401,40 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
         }
         fbeta = dbeta = 1.0;
     }
+    else if (%(force_init_beta)d)
+    {
+        if (PyArray_CHKFLAGS(%(zz)s, NPY_ARRAY_C_CONTIGUOUS))
+        {
+            memset((void *)PyArray_DATA(%(zz)s), 0, PyArray_SIZE(%(zz)s)*PyArray_ITEMSIZE(%(zz)s));
+        }
+        else
+        {
+            if (PyArray_DESCR(%(zz)s)->type_num == NPY_FLOAT)
+            {
+                float *zoutdata = (float *)PyArray_DATA(%(zz)s);
+                int Zi = PyArray_STRIDES(%(zz)s)[0]/sizeof(float);
+                for (int i = 0; i < PyArray_DIMS(%(aa)s)[0]; ++i)
+                {
+                    zoutdata[Zi*i] = 0.0f;
+                }
+            }
+            else if (PyArray_DESCR(%(zz)s)->type_num == NPY_DOUBLE)
+            {
+                double *zoutdata = (double *)PyArray_DATA(%(zz)s);
+                int Zi = PyArray_STRIDES(%(zz)s)[0]/sizeof(double);
+                for (int i = 0; i < PyArray_DIMS(%(aa)s)[0]; ++i)
+                {
+                    zoutdata[Zi*i] = 0.0;
+                }
+            }
+            else
+            {
+                PyErr_SetString(PyExc_AssertionError,
+                                "neither float nor double dtype");
+                %(fail)s
+            }
+        }
+    }
 }
 else
 {
@@ -566,24 +604,81 @@ def gemv_c_code(aa, xx, yy, zz, alpha, beta, destructive, fail):
     }
     }
-    """ % locals()
+    """
+    return code % locals()
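
Reviewer note: the contract that gemv_c_code emits C for can be stated in plain NumPy. This is a minimal sketch, not part of the patch (gemv_reference is an illustrative name); it also shows why uninitialized output memory is dangerous when beta is 0, since 0.0 * NaN is NaN:

import numpy

def gemv_reference(aa, alpha, xx, yy, beta):
    # zz <- beta * aa + alpha * dot(xx, yy), as in the docstring above.
    # A BLAS that evaluates beta * aa literally propagates garbage from
    # an uninitialized aa, because 0.0 * NaN == NaN.
    return beta * aa + alpha * numpy.dot(xx, yy)

aa = numpy.full(2, numpy.nan)  # stands in for uninitialized memory
print(gemv_reference(aa, 1.0, numpy.ones((2, 2)), 2 * numpy.ones(2), 0.0))
# -> [ nan  nan ] rather than [ 4.  4. ]
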
 class CGemv(BaseBLAS, Gemv):
+    def __init__(self, inplace, force_init_beta=False):
+        super(CGemv, self).__init__(inplace)
+        self.force_init_beta = force_init_beta
     def c_code(self, node, name, inp, out, sub):
         aa, alpha, xx, yy, beta = inp
         zz, = out
         code = gemv_c_code(
             aa, xx, yy, zz, alpha, beta,
             destructive=int(self.inplace),
-            fail=sub['fail'])
+            fail=sub['fail'],
+            force_init_beta=self.force_init_beta
+        )
         return code
     def c_code_cache_version(self):
-        return (10, blas_header_version())
+        return (11, blas_header_version())
 cgemv_inplace = CGemv(inplace=True)
 cgemv_no_inplace = CGemv(inplace=False)
+def check_force_gemv_init():
+    if check_force_gemv_init._force_init_beta is None:
+        """
+        Test issue 1569.
+        Namely, when evaluating
+            beta*aa + alpha*dot(xx, yy)
+        where we set aa = beta = zeros of the correct dimensions, we do not
+        actually set aa = zeros and instead let the BLAS perform beta*aa with
+        uninitialized memory for speed. Occasionally the memory contains
+        values that are equivalent to NaN, in which case the product beta*aa
+        contains NaNs for correctly implemented BLAS libraries. In this
+        situation, since we are introducing the NaNs, we need to test whether
+        the BLAS performs the multiplication. If it *does*, i.e. it actually
+        computes beta*aa, which will put NaNs in the result, then we need to
+        initialize the memory to zeros.
+        """
+        aa = T.vector('aa')
+        yy = T.vector('yy')
+        xx = T.matrix('xx')
+
+        f = theano.function(
+            [aa, yy, xx],
+            gemv_no_inplace(aa, 1., xx, yy, 0.),
+            theano.compile.Mode(optimizer='fast_compile')
+        )
+
+        # Here we introduce NaNs into the data; if they are returned by the
+        # BLAS then we want gemv_c_code to initialize the memory to 0 so
+        # that we don't inadvertently introduce NaNs into the user's data.
+        aa_data = numpy.array(
+            float('NaN')*numpy.ones((2,)),
+            dtype=theano.config.floatX
+        )
+        yy_data = numpy.array(
+            numpy.ones((2,))*2,
+            dtype=theano.config.floatX
+        )
+        xx_data = numpy.array(
+            numpy.ones((2, 2)),
+            dtype=theano.config.floatX
+        )
+        zz = f(aa_data, yy_data, xx_data)
+
+        check_force_gemv_init._force_init_beta = numpy.isnan(zz).any()
+
+    return check_force_gemv_init._force_init_beta
+check_force_gemv_init._force_init_beta = None
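
Reviewer note: the verdict is memoized on the function object itself, so the probe graph is built and compiled at most once per process. The same pattern in isolation, as a minimal sketch (probe_blas and its body are illustrative, not from the patch):

def probe_blas():
    # First call runs the (expensive) probe; later calls return the
    # cached verdict stored on the function object.
    if probe_blas._result is None:
        print("compiling probe...")  # placeholder for the real probe
        probe_blas._result = True
    return probe_blas._result

probe_blas._result = None

probe_blas()  # runs the probe
probe_blas()  # served from the cache
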
 @local_optimizer([gemv_inplace, gemv_no_inplace])
 def use_c_gemv(node):
@@ -592,7 +687,29 @@ def use_c_gemv(node):
     # Only float32 and float64 are supported for now.
     if (node.op == gemv_no_inplace and
             node.outputs[0].dtype in ['float32', 'float64']):
-        return [CGemv(inplace=False)(*node.inputs)]
+        """
+        We want to maintain the behavior of any operation that the user
+        adds, even if it results in NaNs. However, we do not want
+        optimizations to introduce NaNs.
+        GEMV is not implemented consistently across BLAS libraries.
+        Some implementations skip the multiplication by beta when beta is
+        0; others perform it. This can cause problems for the inplace GEMV
+        implementation if NaNs happen to be in the newly allocated but
+        uninitialized memory. When the multiplication is not done, we do
+        not need to initialize the output memory, resulting in a speed up.
+        Otherwise we must initialize the memory to avoid introducing NaNs
+        in the output that weren't in the original graph.
+        The following check determines whether the output memory needs to
+        be initialized. It is done here, rather than at module scope,
+        because the setup is not complete at import time, so the check
+        cannot run then.
+        """
+        force_init_beta = check_force_gemv_init()
+        return [CGemv(inplace=False, force_init_beta=force_init_beta)(*node.inputs)]
     if (node.op == gemv_inplace and
             node.outputs[0].dtype in ['float32', 'float64']):
         return [CGemv(inplace=True)(*node.inputs)]
...
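
Reviewer note: the docstring in use_c_gemv refers to two legitimate GEMV strategies found in the wild. Sketched in NumPy terms (both function names are illustrative): an implementation that special-cases beta == 0 never reads the output buffer, while a literal one does, which is exactly what check_force_gemv_init detects:

import numpy

def gemv_literal(aa, alpha, xx, yy, beta):
    # Always reads aa: NaNs in uninitialized aa leak into the result.
    return beta * aa + alpha * numpy.dot(xx, yy)

def gemv_shortcut(aa, alpha, xx, yy, beta):
    # Skips the beta scaling when beta == 0, so aa is never read and the
    # output buffer does not need to be zero-initialized.
    if beta == 0:
        return alpha * numpy.dot(xx, yy)
    return beta * aa + alpha * numpy.dot(xx, yy)
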
--- a/theano/tensor/tests/test_blas_c.py
+++ b/theano/tensor/tests/test_blas_c.py
@@ -15,6 +15,8 @@ from theano.tensor.blas_c import CGemv
 from theano.tensor.blas_scipy import ScipyGer
 from theano.tensor.blas import Gemv
+from theano.tensor.blas_c import check_force_gemv_init
 from theano.tests import unittest_tools
 from theano.tests.unittest_tools import TestOptimizationMixin
@@ -137,7 +139,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
         # Assert that the dot was optimized somehow
         self.assertFunctionContains0(f, tensor.dot)
-        self.assertFunctionContains1(f, CGemv(True))
+        self.assertFunctionContains1(
+            f,
+            CGemv(inplace=True, force_init_beta=True)
+        )
         # Assert they produce the same output
         assert numpy.allclose(f(self.xval, self.Aval),
@@ -155,7 +160,10 @@ class TestCGemv(TestCase, TestOptimizationMixin):
         # Assert that the dot was optimized somehow
         self.assertFunctionContains0(f, tensor.dot)
-        self.assertFunctionContains1(f, CGemv(True))
+        self.assertFunctionContains1(
+            f,
+            CGemv(inplace=True, force_init_beta=True)
+        )
         # Assert they produce the same output
         assert numpy.allclose(f(self.Aval, self.yval),
@@ -164,6 +172,14 @@ class TestCGemv(TestCase, TestOptimizationMixin):
         assert numpy.allclose(f(self.Aval[::-1, ::-1], self.yval),
                               numpy.dot(self.Aval[::-1, ::-1], self.yval))
+    def test_force_gemv_init(self):
+        if check_force_gemv_init():
+            sys.stderr.write(
+                "WARNING: The current BLAS requires Theano to initialize"
+                + " memory for some GEMV calls which will result in a minor"
+                + " degradation in performance for such calls."
+            )
     def t_gemv1(self, m_shp):
         ''' test vector2 + dot(matrix, vector1) '''
         rng = numpy.random.RandomState(unittest_tools.fetch_seed())
...