Merge pull request #604 from lamblin/debugmode_preallocated_output

Improved testing of preallocated outputs in DebugMode

Merge pull request #604 from lamblin/debugmode_preallocated_output
421b712f · nouiz · 8b90c9ae · 270ffede · 421b712f · 421b712f
--- a/doc/library/compile/debugmode.txt
+++ b/doc/library/compile/debugmode.txt
@@ -63,7 +63,10 @@ Reference

    This mode catches several kinds of internal error:

-    - inconsistent c_code and perform implementations (see `BadCLinkerOutput`)
+    - inconsistent outputs when calling the same Op twice with the same
+      inputs, for instance if the c_code and perform implementations are
+      inconsistent, or in case of incorrect handling of output memory
+      (see `BadThunkOutput`)

    - a variable replacing another when their runtime values don't match.  This is a symptom of
      an incorrect optimization step, or faulty Op implementation (raises `BadOptimization`)
@@ -144,11 +147,17 @@ There following are DebugMode exceptions you might encounter:



-.. class:: BadCLinkerOutput(DebugModeError)
+.. class:: BadThunkOutput(DebugModeError)

-    This exception means that python (``perform``) and c (``c_code``) for an Op
-    didn't compute the same thing like they were supposed to.
-    The problem might be a bug in either ``perform`` or ``c_code`` (or both).
+    This exception means that different calls to the same Op with the same
+    inputs did not compute the same thing, as they were supposed to.
+    For instance, it can happen if the python (``perform``) and c (``c_code``)
+    implementations of the Op are inconsistent (the problem might be a bug in
+    either ``perform`` or ``c_code`` (or both)).  It can also happen if
+    ``perform`` or ``c_code`` does not correctly handle output memory that
+    has been preallocated (for instance, if it did not clear the memory before
+    accumulating into it, or if it assumed the memory layout was C-contiguous
+    when it was not).




--- a/doc/library/config.txt
+++ b/doc/library/config.txt
@@ -385,6 +385,8 @@ import theano and print the config variable, as in:
    A list of kinds of preallocated memory to use as output buffers for
    each Op's computations, separated by ``:``. Implemented modes are:

+    * ``"initial"``: initial storage present in the storage map
+      (this can happen, for instance, in the inner function of Scan),
    * ``"previous"``: reuse previously-returned memory,
    * ``"c_contiguous"``: newly-allocated C-contiguous memory,
    * ``"f_contiguous"``: newly-allocated Fortran-contiguous memory,
@@ -394,6 +396,15 @@ import theano and print the config variable, as in:

    In order not to test with preallocated memory, use an empty string, ``""``.

+.. attribute:: config.DebugMode.check_preallocated_output_ndim
+
+    Positive int value, default: 4.
+
+    When testing with "strided" preallocated output memory, test
+    all combinations of strides over that number of (inner-most)
+    dimensions. You may want to reduce that number to reduce memory or
+    time usage, but it is advised to keep a minimum of 2.
+
 .. attribute:: config.DebugMode.warn_input_not_reused

    Bool value, default: True

--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
--- a/theano/compile/tests/test_debugmode.py
+++ b/theano/compile/tests/test_debugmode.py
+from nose.plugins.skip import SkipTest
+import unittest
+
 import numpy

 from theano import config
@@ -7,7 +10,6 @@ import theano.tensor
 from theano.compile import debugmode
 import theano.compile
 from theano.tests import unittest_tools as utt
-import unittest


 def test0():
@@ -194,7 +196,7 @@ wb1i = WeirdBrokenOp('times1_inplace')
 wb1 = WeirdBrokenOp('times1')


-def test_badclinkeroutput():
+def test_badthunkoutput():

    a = theano.tensor.dvector()
    b = theano.tensor.dvector()
@@ -210,7 +212,7 @@ def test_badclinkeroutput():
    f_good([1.0, 2.0, 3.0], [2, 3, 4])
    try:
        f_inconsistent([1.0, 2.0, 3.0], [2, 3, 4])
-    except debugmode.BadCLinkerOutput, e:
+    except debugmode.BadThunkOutput, e:
        #print repr(e)
        assert e.r.owner.op is inconsistent
        return  # TEST PASS
@@ -651,7 +653,48 @@ class BrokenCImplementationAdd(gof.Op):
        """ % dict(locals(), **sub)


+class VecAsRowAndCol(gof.Op):
+    """
+    Transforms a vector into a row and a column.
+
+    This Op exists to check everything is correct when an Op has
+    two outputs with different broadcasting patterns.
+    """
+
+    def __eq__(self, other):
+        return type(self) == type(other)
+
+    def __hash__(self):
+        return hash(type(self))
+
+    def make_node(self, v):
+        if not isinstance(v, gof.Variable):
+            v = theano.tensor.as_tensor_variable(v)
+        assert v.type.ndim == 1
+        type_class = type(v.type)
+        out_r_type = type_class(dtype=v.dtype, broadcastable=(True, False))
+        out_c_type = type_class(dtype=v.dtype, broadcastable=(False, True))
+        return gof.Apply(self, [v], [out_r_type(), out_c_type()])
+
+    def perform(self, node, inp, out):
+        v, = inp
+        r, c = out
+        lv = v.shape[0]
+        if (r[0] is None) or (r[0].shape != (1, lv)):
+            r[0] = node.outputs[0].type.value_zeros((1, lv))
+
+        if (c[0] is None) or (c[0].shape != (lv, 1)):
+            c[0] = node.outputs[1].type.value_zeros((lv, 1))
+
+        # Python loop because CudaNdarrays do not support newaxis
+        for i in range(lv):
+            r[0][0, i] = v[i]
+            c[0][i, 0] = v[i]
+
+
 class Test_preallocated_output(unittest.TestCase):
+    def setUp(self):
+        self.rng = numpy.random.RandomState(seed=utt.fetch_seed())

    def test_f_contiguous(self):
        a = theano.tensor.fmatrix('a')
@@ -660,30 +703,42 @@ class Test_preallocated_output(unittest.TestCase):
        # Needed so that z is not the output of the graph
        out = theano.tensor.dot(z, numpy.eye(7))

-        rng = numpy.random.RandomState(seed=utt.fetch_seed())
-        a_val = rng.randn(7, 7).astype('float32')
-        b_val = rng.randn(7, 7).astype('float32')
+        a_val = self.rng.randn(7, 7).astype('float32')
+        b_val = self.rng.randn(7, 7).astype('float32')

-        init_conf_val = config.DebugMode.check_preallocated_output
-        try:
-            # Should work
-            config.DebugMode.check_preallocated_output = 'c_contiguous'
-
-            f = theano.function([a, b], out, mode='DEBUG_MODE')
-            out_val = f(a_val, b_val)
-            #print 'out_val =', out_val
-            #print out_val.strides
-
-            # Should work for now (0.4.0), because the C thunk does not care
-            # at all of what is in storage_map initially.
-            # When it changes, the call to f should raise an Exception,
-            # since the output buffer is used incorrectly.
-            config.DebugMode.check_preallocated_output = 'f_contiguous'
-
-            f = theano.function([a, b], out, mode='DEBUG_MODE')
-            out_val = f(a_val, b_val)
-            #print 'out_val =', out_val
-            #print out_val.strides
-
-        finally:
-            config.DebugMode.check_preallocated_output = init_conf_val
+        # Should work
+        mode = debugmode.DebugMode(
+                check_preallocated_output=['c_contiguous'])
+
+        f = theano.function([a, b], out, mode=mode)
+        out_val = f(a_val, b_val)
+        #print 'out_val =', out_val
+        #print out_val.strides
+
+        # Should raise an Exception, since the output buffer is
+        # used incorrectly.
+        mode = debugmode.DebugMode(
+                check_preallocated_output=['f_contiguous'])
+
+        f = theano.function([a, b], out, mode=mode)
+        self.assertRaises(debugmode.BadThunkOutput, f, a_val, b_val)
+
+    def test_output_broadcast_tensor(self):
+        v = theano.tensor.fvector('v')
+        c, r = VecAsRowAndCol()(v)
+        f = theano.function([v], [c, r])
+
+        v_val = self.rng.randn(5).astype('float32')
+        f(v_val)
+
+    def test_output_broadcast_cuda(self):
+        from theano.sandbox import cuda
+        if not cuda.cuda_available:
+            raise SkipTest("Optional package Cuda disabled")
+
+        v = cuda.fvector('v')
+        c, r = VecAsRowAndCol()(v)
+        f = theano.function([v], [c, r])
+
+        v_val = cuda.CudaNdarray(self.rng.randn(5).astype('float32'))
+        f(v_val)