Commit 421b712f authored by nouiz

Merge pull request #604 from lamblin/debugmode_preallocated_output

Improved testing of preallocated outputs in DebugMode
......@@ -63,7 +63,10 @@ Reference
This mode catches several kinds of internal error:
- inconsistent c_code and perform implementations (see `BadCLinkerOutput`)
- inconsistent outputs when calling the same Op twice with the same
inputs, for instance if the c_code and perform implementations are
inconsistent, or in case of incorrect handling of output memory
(see `BadThunkOutput`)
- a variable replacing another when their runtime values don't match. This is a symptom of
an incorrect optimization step, or faulty Op implementation (raises `BadOptimization`)
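For instance, compiling a small function under this mode (as the tests in
this changeset do) runs all these checks on every Apply node::

    import theano
    import theano.tensor

    x = theano.tensor.dvector('x')
    # Each Op is executed and cross-checked while f runs.
    f = theano.function([x], 2 * x, mode='DEBUG_MODE')
    f([1.0, 2.0, 3.0])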
......@@ -144,11 +147,17 @@ The following are DebugMode exceptions you might encounter:
.. class:: BadCLinkerOutput(DebugModeError)
.. class:: BadThunkOutput(DebugModeError)
This exception means that the python (``perform``) and c (``c_code``)
implementations of an Op didn't compute the same thing as they were supposed to.
The problem might be a bug in either ``perform`` or ``c_code`` (or both).
This exception means that different calls to the same Op with the same
inputs did not compute the same thing as they were supposed to.
For instance, it can happen if the python (``perform``) and c (``c_code``)
implementations of the Op are inconsistent (the problem might be a bug in
either ``perform`` or ``c_code``, or both). It can also happen if
``perform`` or ``c_code`` does not correctly handle output memory that
has been preallocated (for instance, if it did not clear the memory before
accumulating into it, or if it assumed the memory layout was C-contiguous
even when it is not).
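As an illustration, here is a minimal sketch of a hypothetical ``perform``
(not part of this changeset) with the kind of bug this exception reports::

    import numpy

    def perform(self, node, inputs, output_storage):
        x, = inputs
        out = output_storage[0]
        if out[0] is None or out[0].shape != x.shape:
            out[0] = numpy.zeros(x.shape, dtype=x.dtype)
        # BUG: accumulates into the (possibly preallocated, non-zero)
        # buffer; it should overwrite it, e.g. ``out[0][...] = 2 * x``.
        out[0] += 2 * x

With a freshly-allocated (zeroed) buffer the result is correct, but when
DebugMode runs the thunk again with a preallocated buffer filled with a
default value, the two runs disagree and ``BadThunkOutput`` is raised.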
......
......@@ -385,6 +385,8 @@ import theano and print the config variable, as in:
A list of kinds of preallocated memory to use as output buffers for
each Op's computations, separated by ``:``. Implemented modes are:
* ``"initial"``: initial storage present in storage map
(for instance, it can happen in the inner function of Scan),
* ``"previous"``: reuse previously-returned memory,
* ``"c_contiguous"``: newly-allocated C-contiguous memory,
* ``"f_contiguous"``: newly-allocated Fortran-contiguous memory,
......@@ -394,6 +396,15 @@ import theano and print the config variable, as in:
In order not to test with preallocated memory, use an empty string, ``""``.
.. attribute:: config.DebugMode.check_preallocated_output_ndim
Positive int value, default: 4.
When testing with "strided" preallocated output memory, test
all combinations of strides over that number of (inner-most)
dimensions. You may want to reduce that number to reduce memory or
time usage, but it is advised to keep a minimum of 2.
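For example, to test only C-contiguous and strided buffers, with stride
combinations restricted to the two innermost dimensions (the same
config-based pattern the tests in this changeset use), one might set::

    from theano import config

    config.DebugMode.check_preallocated_output = 'c_contiguous:strided'
    config.DebugMode.check_preallocated_output_ndim = 2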
.. attribute:: config.DebugMode.warn_input_not_reused
Bool value, default: True
......
......@@ -16,7 +16,7 @@ from theano import gof
from theano.gof import Env, graph, utils, link, ops_with_inner_function
from theano.gof.link import raise_with_op
from theano.gof.cc import CLinker
from theano.gof.python25 import any, product as itertools_product
from theano.gof.python25 import all, any, product as itertools_product
from theano.configparser import (config, AddConfigVar, BoolParam, IntParam,
StrParam)
from theano.compile.function_module import (FunctionMaker,
......@@ -64,7 +64,7 @@ AddConfigVar('DebugMode.warn_input_not_reused',
def is_valid_check_preallocated_output_param(param):
if not isinstance(param, basestring):
return False
valid = ["previous", "c_contiguous", "f_contiguous",
valid = ["initial", "previous", "c_contiguous", "f_contiguous",
"strided", "wrong_size", "ALL", ""]
for p in param.split(":"):
if p not in valid:
......@@ -74,6 +74,7 @@ def is_valid_check_preallocated_output_param(param):
AddConfigVar('DebugMode.check_preallocated_output',
('Test thunks with pre-allocated memory as output storage. '
'This is a list of strings separated by ":". Valid values are: '
'"initial" (initial storage in storage map, happens with Scan),'
'"previous" (previously-returned memory), '
'"c_contiguous", "f_contiguous", '
'"strided" (positive and negative strides), '
......@@ -82,6 +83,15 @@ AddConfigVar('DebugMode.check_preallocated_output',
StrParam('', is_valid=is_valid_check_preallocated_output_param),
in_c_key=False)
AddConfigVar('DebugMode.check_preallocated_output_ndim',
('When testing with "strided" preallocated output memory, '
'test all combinations of strides over that number of '
'(inner-most) dimensions. You may want to reduce that number '
'to reduce memory or time usage, but it is advised to keep a '
'minimum of 2.'),
IntParam(4, lambda i: i > 0),
in_c_key=False)
import logging
_logger = logging.getLogger("theano.compile.debugmode")
_logger.setLevel(logging.WARNING)
......@@ -114,24 +124,35 @@ class DebugModeError(Exception):
pass
class BadCLinkerOutput(DebugModeError):
"""Exception: an Op's c_code and perform implementations don't agree."""
class BadThunkOutput(DebugModeError):
"""
Exception: Calling the same Op twice gives inconsistent outputs.
It can be raised, for instance, if an Op's c_code and perform methods
do not agree, or if one of these methods does not give the same result
when called twice with the same inputs (but different memory layouts
for the output).
"""
r = None
"""The `Variable` instance for which conflicting values were computed"""
val_py = None
"""The value computed by `r.owner.op.perform`"""
thunk1 = ''
val1 = None
"""The value computed by `thunk1`"""
val_c = None
"""The value computed by `r.owner.op.c_code`"""
thunk2 = ''
val2 = None
"""The value computed by `thunk2`"""
def __init__(self, r, val_py, val_c):
def __init__(self, r, thunk1, val1, thunk2, val2):
"""Initialize members"""
DebugModeError.__init__(self) # to be compatible with python2.4
self.r = r
self.val_py = val_py
self.val_c = val_c
self.thunk1 = thunk1
self.val1 = val1
self.thunk2 = thunk2
self.val2 = val2
def offending_op(self):
"""Return the Op class whose c_code and perform
......@@ -145,45 +166,47 @@ class BadCLinkerOutput(DebugModeError):
"""Return a pretty multiline string representating the cause
of the exception"""
sio = StringIO()
print >> sio, "BadCLinkerOutput"
print >> sio, " variable:", self.r
print >> sio, " Outputs Type :", self.r.type
print >> sio, " Inputs Type:", [i.type for i in self.r.owner.inputs]
print >> sio, "BadThunkOutput"
print >> sio, " variable :", self.r
print >> sio, " Outputs Type:", self.r.type
print >> sio, " Inputs Type :", [i.type for i in self.r.owner.inputs]
print >> sio, " Apply :", self.r.owner
print >> sio, " val_py :", self.val_py
print >> sio, " val_c :", self.val_c
print >> sio, " thunk1 :", self.thunk1
print >> sio, " thunk2 :", self.thunk2
print >> sio, " val1 :", self.val1
print >> sio, " val2 :", self.val2
print >> sio, " op :", self.offending_op()
try:
ssio = StringIO()
print >> ssio, " PyValue shape, dtype, strides, min, max, n_inf, n_nan:",
print >> ssio, self.val_py.shape,
print >> ssio, self.val_py.dtype,
print >> ssio, self.val_py.strides,
print >> ssio, self.val_py.min(),
print >> ssio, self.val_py.max(),
print >> ssio, numpy.isinf(self.val_py).sum(),
print >> ssio, numpy.isnan(self.val_py).sum(),
print >> ssio, " Value 1 : shape, dtype, strides, min, max, n_inf, n_nan:",
print >> ssio, self.val1.shape,
print >> ssio, self.val1.dtype,
print >> ssio, self.val1.strides,
print >> ssio, self.val1.min(),
print >> ssio, self.val1.max(),
print >> ssio, numpy.isinf(self.val1).sum(),
print >> ssio, numpy.isnan(self.val1).sum(),
# only if all succeed do we add anything to sio
print >> sio, ssio.getvalue()
except Exception:
pass
try:
ssio = StringIO()
print >> ssio, " CValue shape, dtype, strides, min, max, n_inf, n_nan:",
print >> ssio, self.val_c.shape,
print >> ssio, self.val_c.dtype,
print >> ssio, self.val_c.strides,
print >> ssio, self.val_c.min(),
print >> ssio, self.val_c.max(),
print >> ssio, numpy.isinf(self.val_c).sum(),
print >> ssio, numpy.isnan(self.val_c).sum(),
print >> ssio, " Value 2 : shape, dtype, strides, min, max, n_inf, n_nan:",
print >> ssio, self.val2.shape,
print >> ssio, self.val2.dtype,
print >> ssio, self.val2.strides,
print >> ssio, self.val2.min(),
print >> ssio, self.val2.max(),
print >> ssio, numpy.isinf(self.val2).sum(),
print >> ssio, numpy.isnan(self.val2).sum(),
# only if all succeed do we add anything to sio
print >> sio, ssio.getvalue()
except Exception:
pass
try:
ov = numpy.asarray(self.val_c)
nv = numpy.asarray(self.val_py)
ov = numpy.asarray(self.val1)
nv = numpy.asarray(self.val2)
ssio = StringIO()
absdiff = numpy.absolute(nv - ov)
print >> ssio, " Max Abs Diff: ", numpy.max(absdiff)
......@@ -670,18 +693,27 @@ def _optcheck_env(input_specs, output_specs, accept_inplace=False):
def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
clobber_dr_vals=True,
perform=None, warn_input_not_reused=True):
"""Raise BadDestroyMap if necessary, update dr_vals"""
"""
Raise BadDestroyMap if necessary, update dr_vals
Returns a list of output variables that actually worked inplace
(their value is aliased to the value of at least one input).
"""
destroyed_idx_list = []
destroy_map = getattr(node.op, 'destroy_map', {})
for o_pos, i_pos_list in destroy_map.iteritems():
destroyed_idx_list.extend(i_pos_list)
destroyed_res_list = [node.inputs[i] for i in destroyed_idx_list]
if warn_input_not_reused and destroyed_res_list:
dmap = getattr(node.op, 'destroy_map', {})
for oo, ii in dmap.iteritems():
out_var = storage_map[node.outputs[oo]][0]
in_var = storage_map[node.inputs[ii[0]]][0]
actually_inplace_outputs = []
dmap = getattr(node.op, 'destroy_map', {})
for oo, ii in dmap.iteritems():
out_var = storage_map[node.outputs[oo]][0]
in_var = storage_map[node.inputs[ii[0]]][0]
if _may_share_memory(out_var, in_var):
actually_inplace_outputs.append(node.outputs[oo])
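# Note: an Op may declare an output in its destroy_map and still
# return freshly-allocated memory; only outputs whose value really
# aliases an input are collected here, so that the preallocated-output
# tests can skip them later.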
if warn_input_not_reused and destroyed_res_list:
if isinstance(node.op, theano.compile.mode.OutputGuard):
# The point of OutputGuard is to be declared as destructive
# while not destroying anything
......@@ -691,11 +723,14 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
"as destroyed was not changed for node '%s'",
ii[0], str(node))
if warn_input_not_reused:
vmap = getattr(node.op, 'view_map', {})
for oo, ii in vmap.iteritems():
out_var = storage_map[node.outputs[oo]][0]
in_var = storage_map[node.inputs[ii[0]]][0]
vmap = getattr(node.op, 'view_map', {})
for oo, ii in vmap.iteritems():
out_var = storage_map[node.outputs[oo]][0]
in_var = storage_map[node.inputs[ii[0]]][0]
if _may_share_memory(out_var, in_var):
actually_inplace_outputs.append(node.outputs[oo])
if warn_input_not_reused:
# We don't try to optimize simple scalars and empty ndarrays,
# as this is not worth our time. This happens at least in
# Subtensor when the output is a scalar. But this depends on
......@@ -727,6 +762,8 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
raise BadDestroyMap(node, r_idx, r_vals[r],
storage_map[r][0], perform)
return actually_inplace_outputs
def _check_viewmap(node, storage_map):
"""
......@@ -994,7 +1031,8 @@ _find_bad_optimizations = _find_bad_optimizations0
def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
storage_map, r_vals, dr_vals, perform, active_order_set):
storage_map, r_vals, dr_vals, perform, active_order_set,
inplace_outs, init_outputs):
'''Preallocate outputs in different memory layouts'''
# To avoid circular imports
......@@ -1004,21 +1042,49 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
from theano.sandbox.cuda import CudaNdarray
from theano.sandbox.cuda import dimshuffle as cuda_dimshuffle
# TODO: Sparse, Scalar
# TODO: Sparse? Scalar does not really make sense.
# Do not preallocate memory for outputs that actually work inplace
considered_outputs = []
for r in node.outputs:
if r not in inplace_outs:
considered_outputs.append(r)
# Output storage that was initially present in the storage_map
if 'initial' in prealloc_modes or 'ALL' in prealloc_modes:
initial_outputs = {}
for r in considered_outputs:
if r in init_outputs:
initial_outputs[r] = init_outputs[r]
if initial_outputs:
yield ('initial', initial_outputs)
# reuse_output: use a copy of the same storage returned the first time
# TODO: optimization warning if the storage in reuse_outputs
# is not reused
if 'previous' in prealloc_modes or 'ALL' in prealloc_modes:
reuse_outputs = {}
for r in node.outputs:
for r in considered_outputs:
# We want to reuse the exact same memory buffer,
# so we keep the copy in r_vals
new_r = _lessbroken_deepcopy(r_vals[r])
reuse_outputs[r] = r_vals[r]
r_vals[r] = new_r
# Sometimes, outputs can be aliased together.
# I'm not sure why it is legitimate, but there are tests about it.
# So, we cannot fill r_vals[r] with def_val yet; we have to wait
# until all output values are deepcopied.
for r in considered_outputs:
# There is no risk of overwriting inputs, since r does not work
# inplace.
if isinstance(r.type, (TensorType, CudaNdarrayType)):
reuse_outputs[r][...] = numpy.asarray(
def_val).astype(r.type.dtype)
yield ('previous', reuse_outputs)
if reuse_outputs:
yield ('previous', reuse_outputs)
# clear memory that is not needed any more
del reuse_outputs
......@@ -1026,13 +1092,13 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
# (for TensorType and CudaNdarray, else None)
if 'c_contiguous' in prealloc_modes or 'ALL' in prealloc_modes:
c_cont_outputs = {}
for r in node.outputs:
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
# Build a C-contiguous buffer
new_buf = r.type.value_zeros(r_vals[r].shape)
# CudaNdarrays don't have a flags field
# assert new_buf.flags["C_CONTIGUOUS"]
new_buf += numpy.asarray(def_val).astype(r.type.dtype)
new_buf[...] = numpy.asarray(def_val).astype(r.type.dtype)
c_cont_outputs[r] = new_buf
......@@ -1044,13 +1110,13 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
# (for TensorType, only)
if 'f_contiguous' in prealloc_modes or 'ALL' in prealloc_modes:
f_cont_outputs = {}
for r in node.outputs:
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
new_buf = numpy.zeros(
shape=r_vals[r].shape,
dtype=r_vals[r].dtype,
order='F')
new_buf += def_val
new_buf[...] = def_val
if isinstance(r.type, CudaNdarrayType):
# When the CudaNdarray is built, the underlying memory
# is c-contiguous, so we transpose it before and after.
......@@ -1067,34 +1133,79 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
# We assume that the different outputs of the same Op will behave
# independently, and there is no need to test over all combinations
# of outputs (the time taken is prohibitive).
# When all outputs on a certain dimension are broadcastable, the Op
# can assume that the shape is 1 on that dimension, and stride testing
# is less relevant.
# Dimensions should be aligned on the innermost index, so we iterate
# from the end of the shapes.
max_ndim = 0
for r in node.outputs:
rev_out_broadcastable = []
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
max_ndim = max(max_ndim, r.ndim)
if max_ndim < r.ndim:
rev_out_broadcastable += [True] * (r.ndim - max_ndim)
max_ndim = r.ndim
assert len(rev_out_broadcastable) == max_ndim
for i, b in enumerate(r.broadcastable[::-1]):
rev_out_broadcastable[i] = rev_out_broadcastable[i] and b
out_broadcastable = rev_out_broadcastable[::-1]
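# Illustrative example: with one output broadcastable as (True, False)
# (a row) and another as (False, True) (a column), as with the
# VecAsRowAndCol Op in the tests, out_broadcastable ends up
# [False, False]: a dimension stays broadcastable only if it is
# broadcastable in every considered output.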
if 'strided' in prealloc_modes or 'ALL' in prealloc_modes:
check_ndim = config.DebugMode.check_preallocated_output_ndim
# Initial allocation
init_strided = {}
for r in node.outputs:
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
# Create a buffer twice as large in every dimension
new_buf = r.type.value_zeros(
[(s * 2) for s in r_vals[r].shape])
# Create a buffer twice as large in every dimension,
# except if broadcastable, or for dimensions above
# config.DebugMode.check_preallocated_output_ndim
buf_shape = []
for s, b in zip(r_vals[r].shape, r.broadcastable):
if b or ((r.ndim - len(buf_shape)) > check_ndim):
buf_shape.append(s)
else:
buf_shape.append(s * 2)
new_buf = r.type.value_zeros(buf_shape)
new_buf[...] = numpy.asarray(def_val).astype(r.type.dtype)
init_strided[r] = new_buf
for step_signs in itertools_product((-1, 1), repeat=max_ndim):
# The number of combinations is exponential in the number of
# dimensions, and some ops can have tens of outputs. To prevent
# tests from lasting days, we use the same strides for all
# dimensions but the last check_ndim ones.
# Moreover, to avoid memory problems, we do not test with strides
# 2 and -2 on those dimensions.
step_signs_list = []
for b in out_broadcastable[-check_ndim:]:
if b:
step_signs_list.append((1,))
else:
step_signs_list.append((-1, 1))
# Use the same step on all dimensions before the last check_ndim.
if all(out_broadcastable[:-check_ndim]):
step_signs_list = [(1,)] + step_signs_list
else:
step_signs_list = [(-1, 1)] + step_signs_list
for step_signs in itertools_product(*step_signs_list):
for step_size in (1, 2):
strided = {}
steps = [s * step_size for s in step_signs]
# First, the dimensions above check_ndim, then the other ones
# Do not test with 2 or -2 for dimensions above check_ndim
steps = [step_signs[0]] * len(out_broadcastable[:-check_ndim])
steps += [s * step_size for s in step_signs[1:]]
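# Illustrative example: with max_ndim == 6 and check_ndim == 4,
# step_signs == (1, -1, 1, 1, -1) and step_size == 2 give
# steps == [1, 1, -2, 2, 2, -2]: one shared sign for the two
# outermost dimensions, full sign/size combinations for the four
# innermost ones.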
name = 'strided%s' % str(tuple(steps))
for r in node.outputs:
for r in considered_outputs:
if r in init_strided:
# Build lists of slices, for strides and shapes
strides = []
shapes = []
for i, size in enumerate(r_vals[r].shape):
strides.append(slice(None, None, steps[i]))
shapes.append(slice(None, size, None))
strides.append(slice(None, None, steps[i]))
r_buf = init_strided[r]
......@@ -1103,15 +1214,19 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
assert r_buf.shape == r_vals[r].shape
r_buf[...] = numpy.asarray(def_val).astype(r_buf.dtype)
strided[r] = r_buf
yield (name, strided)
if strided:
yield (name, strided)
del strided
if 'wrong_size' in prealloc_modes or 'ALL' in prealloc_modes:
# For each dimension, try size-1, size, size+1
for dim in xrange(max_ndim):
for dim, b in enumerate(out_broadcastable):
if b:
# The shape has to be 1
continue
shape_diff = [0] * max_ndim
for diff in (-1, 1):
shape_diff[dim] = diff
......@@ -1119,22 +1234,25 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
wrong_size = {}
name = 'wrong_size%s' % str(tuple(shape_diff))
for r in node.outputs:
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
r_shape_diff = shape_diff[:r.ndim]
out_shape = [max((s + sd), 0)
for s, sd in zip(r_vals[r].shape,
r_shape_diff)]
new_buf = r.type.value_zeros(r_vals[r].shape)
new_buf += numpy.asarray(def_val).astype(r.type.dtype)
new_buf = r.type.value_zeros(out_shape)
new_buf[...] = numpy.asarray(
def_val).astype(r.type.dtype)
wrong_size[r] = new_buf
yield (name, wrong_size)
if wrong_size:
yield (name, wrong_size)
del wrong_size
def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
storage_map, r_vals, dr_vals, perform, active_order_set):
storage_map, r_vals, dr_vals, perform, active_order_set,
inplace_outs, init_outputs):
'''Try to apply thunk() on different output storages'''
# If node has an inner compiled Theano function with mode DebugMode,
......@@ -1163,17 +1281,33 @@ def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
changed_inner_mode = True
_logger.info('changing inner mode')
# Set of inputs that are marked as destroyed or viewed
aliased_inputs = set()
dmap = getattr(node.op, 'destroy_map', {})
vmap = getattr(node.op, 'view_map', {})
for i, r in enumerate(node.inputs):
if any(i in v for v in (dmap.values() + vmap.values())):
aliased_inputs.add(r)
_logger.debug('starting preallocated output checking')
for (name, out_map) in _get_preallocated_maps(
node, thunk, prealloc_modes, def_val, storage_map, r_vals,
dr_vals, perform, active_order_set):
dr_vals, perform, active_order_set, inplace_outs,
init_outputs):
_logger.debug(' name = %s', name)
# Copy the inputs over, if they were marked as destroyed
dmap = getattr(node.op, 'destroy_map', {})
for i, r in enumerate(node.inputs):
if any(i in v for v in dmap.values()):
storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])
thunk_name = '%s with %s output' % (perform, name)
if not out_map:
# Map is empty, there is no need to execute thunk() again
_logger.warn('%s: out_map is empty', name)
continue
# Copy the inputs over, if they were marked as destroyed or viewed
# (we will overwrite the output at some point, which can in turn
# destroy the input it aliases)
for r in aliased_inputs:
storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])
# Get the appropriate output storages
# (no copy)
......@@ -1186,13 +1320,13 @@ def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
for r in node.outputs:
if not r.type.is_valid_value(storage_map[r][0]):
raise InvalidValueError(r, storage_map[r][0],
hint='%s with %s output' % (perform, name),
hint=thunk_name,
specific_hint=r.type.value_validity_msg(
storage_map[r][0]))
_check_inputs(node, storage_map, r_vals, dr_vals, active_order_set,
clobber_dr_vals=False,
perform='%s with output %s' % (perform, name),
perform=thunk_name,
warn_input_not_reused=False)
_check_viewmap(node, storage_map)
......@@ -1200,8 +1334,9 @@ def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
for r in node.outputs:
if not r.type.values_eq_approx(r_vals[r], storage_map[r][0]):
# TODO: indicate it is not a C/Py problem
raise BadCLinkerOutput(r, val_py=r_vals[r],
val_c=storage_map[r][0])
raise BadThunkOutput(r,
thunk1='Reference value', val1=r_vals[r],
thunk2=thunk_name, val2=storage_map[r][0])
# Clear storage_map
for r in node.outputs:
......@@ -1617,11 +1752,14 @@ class _Linker(gof.link.LocalLinker):
storage_map[r][0] = None
r_vals_initialized.append(r)
# TODO: store them in another map, and test the thunks on
# store preallocated outputs in another map, and test the thunks on
# them as output storages.
init_outputs = {}
for r in storage_map:
if r in env.outputs:
storage_map[r][0] = None
if storage_map[r][0] is not None:
init_outputs[r] = storage_map[r][0]
storage_map[r][0] = None
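# For instance, the inner function of Scan can receive output storage
# that is already allocated; keeping it in init_outputs lets the
# 'initial' preallocation mode test that case.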
#####
# Precondition: the storage map is empty, transferred
......@@ -1673,11 +1811,11 @@ class _Linker(gof.link.LocalLinker):
raise InvalidValueError(r, storage_map[r][0],
hint='perform output',
specific_hint=hint2)
_check_inputs(node, storage_map, r_vals, dr_vals,
active_order_set,
clobber_dr_vals=True, perform='py',
warn_input_not_reused=config.DebugMode.warn_input_not_reused)
py_inplace_outs = _check_inputs(
node, storage_map, r_vals, dr_vals,
active_order_set,
clobber_dr_vals=True, perform='py',
warn_input_not_reused=config.DebugMode.warn_input_not_reused)
_check_viewmap(node, storage_map)
......@@ -1705,7 +1843,9 @@ class _Linker(gof.link.LocalLinker):
r_vals=r_vals,
dr_vals=dr_vals,
perform='py',
active_order_set=active_order_set)
active_order_set=active_order_set,
inplace_outs=py_inplace_outs,
init_outputs=init_outputs)
# print >> sys.stderr, i, "DEBUGMODE thunk_py %100s %50s %30s" % (node,
#[(id(o), numpy.asarray(storage_map[o][0])[0,0]) for o in node.inputs],
......@@ -1717,6 +1857,7 @@ class _Linker(gof.link.LocalLinker):
clobber = True
if thunk_py:
dmap = getattr(node.op, 'destroy_map', {})
vmap = getattr(node.op, 'view_map', {})
for i, r in enumerate(node.inputs):
# if thunk_py ran, and we still got this far,
# it means that the destroy_map of the Op (and view_map) are
......@@ -1725,7 +1866,10 @@ class _Linker(gof.link.LocalLinker):
# fact not been destroyed.
# Therefore... we only need to overwrite inputs that *have*
# been marked as destroyed.
if any(i in v for v in dmap.values()):
# Inputs marked as viewed are unsafe too,
# because the corresponding output can
# be destroyed.
if any(i in v for v in (dmap.values() + vmap.values())):
storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])
clobber = False
......@@ -1750,10 +1894,11 @@ class _Linker(gof.link.LocalLinker):
self.maker.mode.require_matching_strides,
node.op)
_check_inputs(node, storage_map, r_vals,
dr_vals, active_order_set,
clobber_dr_vals=clobber, perform='c',
warn_input_not_reused=config.DebugMode.warn_input_not_reused)
c_inplace_outs = _check_inputs(
node, storage_map, r_vals,
dr_vals, active_order_set,
clobber_dr_vals=clobber, perform='c',
warn_input_not_reused=config.DebugMode.warn_input_not_reused)
_check_viewmap(node, storage_map)
......@@ -1766,7 +1911,9 @@ class _Linker(gof.link.LocalLinker):
if not r.type.values_eq_approx(r_vals[r], storage_map[r][0]):
#import pdb; pdb.set_trace()
#r.type.values_eq_approx(r_vals[r], storage_map[r][0])
raise BadCLinkerOutput(r, val_py=r_vals[r], val_c=storage_map[r][0])
raise BadThunkOutput(r,
thunk1='perform', val1=r_vals[r],
thunk2='c_code', val2=storage_map[r][0])
else:
#print >> sys.stderr, i, "DEBUGMODE storing reference output %x" % id(storage_map[r][0])
#retrieve each output from the storage_map
......@@ -1793,7 +1940,9 @@ class _Linker(gof.link.LocalLinker):
r_vals=r_vals,
dr_vals=dr_vals,
perform='c code',
active_order_set=active_order_set)
active_order_set=active_order_set,
inplace_outs=c_inplace_outs,
init_outputs=init_outputs)
# print >> sys.stderr, i, "DEBUGMODE thunk_c %100s %50s %30s" % (node,
#[(id(o), numpy.asarray(storage_map[o][0])[0,0]) for o in node.inputs],
......@@ -2176,7 +2325,10 @@ class DebugMode(Mode):
This mode catches several kinds of internal error:
- inconsistent c_code and perform implementations (see `BadCLinkerOutput`)
- inconsistent outputs when calling the same Op twice with the same
inputs, for instance if the c_code and perform implementations are
inconsistent, or in case of incorrect handling of output memory
(see `BadThunkOutput`),
- a variable replacing another when their runtime values don't
match. This is a symptom of an incorrect optimization step, or
......
from nose.plugins.skip import SkipTest
import unittest
import numpy
from theano import config
......@@ -7,7 +10,6 @@ import theano.tensor
from theano.compile import debugmode
import theano.compile
from theano.tests import unittest_tools as utt
import unittest
def test0():
......@@ -194,7 +196,7 @@ wb1i = WeirdBrokenOp('times1_inplace')
wb1 = WeirdBrokenOp('times1')
def test_badclinkeroutput():
def test_badthunkoutput():
a = theano.tensor.dvector()
b = theano.tensor.dvector()
......@@ -210,7 +212,7 @@ def test_badclinkeroutput():
f_good([1.0, 2.0, 3.0], [2, 3, 4])
try:
f_inconsistent([1.0, 2.0, 3.0], [2, 3, 4])
except debugmode.BadCLinkerOutput, e:
except debugmode.BadThunkOutput, e:
#print repr(e)
assert e.r.owner.op is inconsistent
return # TEST PASS
......@@ -651,7 +653,48 @@ class BrokenCImplementationAdd(gof.Op):
""" % dict(locals(), **sub)
class VecAsRowAndCol(gof.Op):
"""
Transforms a vector into a row and a column.
This Op exists to check everything is correct when an Op has
two outputs with different broadcasting patterns.
"""
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, v):
if not isinstance(v, gof.Variable):
v = theano.tensor.as_tensor_variable(v)
assert v.type.ndim == 1
type_class = type(v.type)
out_r_type = type_class(dtype=v.dtype, broadcastable=(True, False))
out_c_type = type_class(dtype=v.dtype, broadcastable=(False, True))
return gof.Apply(self, [v], [out_r_type(), out_c_type()])
def perform(self, node, inp, out):
v, = inp
r, c = out
lv = v.shape[0]
if (r[0] is None) or (r[0].shape != (1, lv)):
r[0] = node.outputs[0].type.value_zeros((1, lv))
if (c[0] is None) or (c[0].shape != (lv, 1)):
c[0] = node.outputs[1].type.value_zeros((lv, 1))
# Python loop because CudaNdarrays do not support newaxis
for i in range(lv):
r[0][0, i] = v[i]
c[0][i, 0] = v[i]
class Test_preallocated_output(unittest.TestCase):
def setUp(self):
self.rng = numpy.random.RandomState(seed=utt.fetch_seed())
def test_f_contiguous(self):
a = theano.tensor.fmatrix('a')
......@@ -660,30 +703,42 @@ class Test_preallocated_output(unittest.TestCase):
# Needed so that z is not the output of the graph
out = theano.tensor.dot(z, numpy.eye(7))
rng = numpy.random.RandomState(seed=utt.fetch_seed())
a_val = rng.randn(7, 7).astype('float32')
b_val = rng.randn(7, 7).astype('float32')
a_val = self.rng.randn(7, 7).astype('float32')
b_val = self.rng.randn(7, 7).astype('float32')
init_conf_val = config.DebugMode.check_preallocated_output
try:
# Should work
config.DebugMode.check_preallocated_output = 'c_contiguous'
f = theano.function([a, b], out, mode='DEBUG_MODE')
out_val = f(a_val, b_val)
#print 'out_val =', out_val
#print out_val.strides
# Should work for now (0.4.0), because the C thunk does not care
# at all about what is in storage_map initially.
# When it changes, the call to f should raise an Exception,
# since the output buffer is used incorrectly.
config.DebugMode.check_preallocated_output = 'f_contiguous'
f = theano.function([a, b], out, mode='DEBUG_MODE')
out_val = f(a_val, b_val)
#print 'out_val =', out_val
#print out_val.strides
finally:
config.DebugMode.check_preallocated_output = init_conf_val
# Should work
mode = debugmode.DebugMode(
check_preallocated_output=['c_contiguous'])
f = theano.function([a, b], out, mode=mode)
out_val = f(a_val, b_val)
#print 'out_val =', out_val
#print out_val.strides
# Should raise an Exception, since the output buffer is
# used incorrectly.
mode = debugmode.DebugMode(
check_preallocated_output=['f_contiguous'])
f = theano.function([a, b], out, mode=mode)
self.assertRaises(debugmode.BadThunkOutput, f, a_val, b_val)
def test_output_broadcast_tensor(self):
v = theano.tensor.fvector('v')
c, r = VecAsRowAndCol()(v)
f = theano.function([v], [c, r])
v_val = self.rng.randn(5).astype('float32')
f(v_val)
def test_output_broadcast_cuda(self):
from theano.sandbox import cuda
if not cuda.cuda_available:
raise SkipTest("Optional package Cuda disabled")
v = cuda.fvector('v')
c, r = VecAsRowAndCol()(v)
f = theano.function([v], [c, r])
v_val = cuda.CudaNdarray(self.rng.randn(5).astype('float32'))
f(v_val)