Merge pull request #562 from lamblin/test_preallocated_output_rebase

Checks for preallocated output memory, take 2

Merge pull request #562 from lamblin/test_preallocated_output_rebase
eace991b · nouiz · 1fcd0905 · f94d63f4 · eace991b · eace991b
--- a/doc/library/config.txt
+++ b/doc/library/config.txt
@@ -380,7 +380,7 @@ import theano and print the config variable, as in:

 .. attribute:: config.DebugMode.check_preallocated_output

-    Default: ``'ALL'``
+    Default: ``''``

    A list of kinds of preallocated memory to use as output buffers for
    each Op's computations, separated by ``:``. Implemented modes are:
@@ -388,6 +388,8 @@ import theano and print the config variable, as in:
    * ``"previous"``: reuse previously-returned memory,
    * ``"c_contiguous"``: newly-allocated C-contiguous memory,
    * ``"f_contiguous"``: newly-allocated Fortran-contiguous memory,
+    * ``"strided"``: non-contiguous memory with various stride patterns,
+    * ``"wrong_size"``: memory with bigger or smaller dimensions,
    * ``"ALL"``: placeholder for all of the above.

    In order not to test with preallocated memory, use an empty string, ``""``.

--- a/theano/compile/builders.py
+++ b/theano/compile/builders.py
 from theano import gof
 from theano import gradient as G
-from function_module import orig_function
+from theano.compile.function_module import orig_function
+from theano.gof import ops_with_inner_function


 class OpFromGraph(gof.Op):
@@ -99,3 +100,7 @@ class OpFromGraph(gof.Op):
            return [go(*(inputs + output_grads)) for go in self.grad_ops]
        else:
            raise NotImplementedError
+
+# Since OpFromGraph contains a Theano compiled function, we should let
+# DebugMode know about it
+ops_with_inner_function[OpFromGraph] = 'fn'
--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -13,9 +13,10 @@ import numpy

 import theano
 from theano import gof
-from theano.gof import Env, graph, utils, link
+from theano.gof import Env, graph, utils, link, ops_with_inner_function
 from theano.gof.link import raise_with_op
 from theano.gof.cc import CLinker
+from theano.gof.python25 import product as itertools_product
 from theano.configparser import (config, AddConfigVar, BoolParam, IntParam,
        StrParam)
 from theano.compile.function_module import (FunctionMaker,
@@ -64,7 +65,7 @@ def is_valid_check_preallocated_output_param(param):
    if not isinstance(param, basestring):
        return False
    valid = ["previous", "c_contiguous", "f_contiguous",
-             "neg_strides", "ALL", ""]
+             "strided", "wrong_size", "ALL", ""]
    for p in param.split(":"):
        if p not in valid:
            return False
@@ -75,9 +76,10 @@ AddConfigVar('DebugMode.check_preallocated_output',
         'This is a list of strings separated by ":". Valid values are: '
         '"previous" (previously-returned memory), '
         '"c_contiguous", "f_contiguous", '
-         '"neg_strides" (negative strides), and '
+         '"strided" (positive and negative strides), '
+         '"wrong_size" (larger and smaller dimensions), and '
         '"ALL" (all of the above).'),
-        StrParam('ALL', is_valid=is_valid_check_preallocated_output_param),
+        StrParam('', is_valid=is_valid_check_preallocated_output_param),
        in_c_key=False)

 import logging
@@ -988,20 +990,18 @@ def _find_bad_optimizations2(order, reasons, r_vals):
 _find_bad_optimizations = _find_bad_optimizations0


-def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
+def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
        storage_map, r_vals, dr_vals, perform, active_order_set):
-    '''Try to apply thunk() on different output storages'''
+    '''Preallocate outputs in different memory layouts'''

    # To avoid circular imports
    from theano.tensor import TensorType
    from theano.sandbox.cuda import cuda_available, CudaNdarrayType
    if cuda_available:
        from theano.sandbox.cuda import CudaNdarray
+        from theano.sandbox.cuda import dimshuffle as cuda_dimshuffle

-    # List of (name, map) pairs of the settings to test
-    prealloc_maps = []
    # TODO: Sparse, Scalar
-    # TODO: wrong shape, more stride patterns

    # reuse_output: use a copy of the same storage returned the first time
    # TODO: optimization warning if the storage in reuse_outputs
@@ -1015,7 +1015,9 @@ def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
            reuse_outputs[r] = r_vals[r]
            r_vals[r] = new_r

-        prealloc_maps.append(('previous', reuse_outputs))
+        yield ('previous', reuse_outputs)
+        # clear memory that is not needed any more
+        del reuse_outputs

    # c_cont_output: use a c-continuous array
    # (for TensorType and CudaNdarray, else None)
@@ -1034,65 +1036,194 @@ def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
                c_cont_outputs[r] = new_buf

        if len(c_cont_outputs):
-            prealloc_maps.append(('c_contiguous', c_cont_outputs))
+            yield ('c_contiguous', c_cont_outputs)
+            del c_cont_outputs

    # f_cont_output: use a fortran-continuous ndarray
    # (for TensorType, only)
    if 'f_contiguous' in prealloc_modes or 'ALL' in prealloc_modes:
        f_cont_outputs = {}
        for r in node.outputs:
-            if isinstance(r.type, TensorType):
+            if isinstance(r.type, (TensorType, CudaNdarrayType)):
                new_buf = numpy.zeros(
                        shape=r_vals[r].shape,
                        dtype=r_vals[r].dtype,
                        order='F')
                new_buf += def_val
+                if isinstance(r.type, CudaNdarrayType):
+                    # When the CudaNdarray is built, the underlying memory
+                    # is c-contiguous, so we transpose it before and after.
+                    new_buf = CudaNdarray(new_buf.T)
+                    new_buf = cuda_dimshuffle(new_buf,
+                            range(new_buf.ndim)[::-1])
+
                f_cont_outputs[r] = new_buf

        if len(f_cont_outputs):
-            prealloc_maps.append(('f_contiguous', f_cont_outputs))
+            yield ('f_contiguous', f_cont_outputs)
+            del f_cont_outputs
+
+    # We assume that the different outputs of the same Op will behave
+    # independently, and there is no need to test over all combinations
+    # of outputs (the time taken is prohibitive).
+    max_ndim = 0
+    for r in node.outputs:
+        if isinstance(r.type, (TensorType, CudaNdarrayType)):
+            max_ndim = max(max_ndim, r.ndim)
+
+    if 'strided' in prealloc_modes or 'ALL' in prealloc_modes:
+        # Initial allocation
+        init_strided = {}
+        for r in node.outputs:
+            if isinstance(r.type, (TensorType, CudaNdarrayType)):
+                # Create a buffer twice as large in every dimension
+                new_buf = r.type.value_zeros(
+                        [(s * 2) for s in r_vals[r].shape])
+                init_strided[r] = new_buf
+
+        for step_signs in itertools_product((-1, 1), repeat=max_ndim):
+            for step_size in (1, 2):
+                strided = {}
+                steps = [s * step_size for s in step_signs]
+                name = 'strided%s' % str(tuple(steps))
+                for r in node.outputs:
+                    if r in init_strided:
+                        # Build lists of slices, for strides and shapes
+                        strides = []
+                        shapes = []
+                        for i, size in enumerate(r_vals[r].shape):
+                            strides.append(slice(None, None, steps[i]))
+                            shapes.append(slice(None, size, None))
+
+                        r_buf = init_strided[r]
+                        if r_buf.ndim > 0:
+                            r_buf = r_buf[tuple(strides)][tuple(shapes)]
+                        assert r_buf.shape == r_vals[r].shape
+
+                        if isinstance(r.type, CudaNdarrayType):
+                            # It seems stupid, but we need to allocate a
+                            # new ndarray and copy it into the GPU one.
+                            # TODO: When it is possible to simply do
+                            # r_buf[...] = def_val, do so.
+                            new_rbuf = numpy.zeros(r_vals[r].shape,
+                                    dtype=r.dtype)
+                            new_rbuf += def_val
+                            r_buf[...] = CudaNdarray(new_rbuf)
+                        else:
+                            r_buf[...] = def_val

-    if 'neg_strides' in prealloc_maps:
-        raise NotImplementedError('Negative strides in'
-                                  ' check_preallocated_output')
+                        strided[r] = r_buf

-    for (name, out_map) in prealloc_maps:
-        # _logger.debug('name = %s, perform = %s', name, perform)
-        # Copy the inputs over again
-        for r in node.inputs:
-            storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])
+                yield (name, strided)
+                del strided

-        # Get the appropriate output storages
-        # (no copy)
-        for r in node.outputs:
-            storage_map[r][0] = out_map.get(r, None)
+    if 'wrong_size' in prealloc_modes or 'ALL' in prealloc_modes:
+        # For each dimension, try size-1 and size+1
+        for dim in xrange(max_ndim):
+            shape_diff = [0] * max_ndim
+            for diff in (-1, 1):
+                shape_diff[dim] = diff

-        thunk()
+                wrong_size = {}
+                name = 'wrong_size%s' % str(tuple(shape_diff))

-        # Check outputs
-        for r in node.outputs:
-            if not r.type.is_valid_value(storage_map[r][0]):
-                raise InvalidValueError(r, storage_map[r][0],
-                        hint='%s with %s output' % (perform, name),
-                        specific_hint=r.type.value_validity_msg(
-                        storage_map[r][0]))
+                for r in node.outputs:
+                    if isinstance(r.type, (TensorType, CudaNdarrayType)):
+                        r_shape_diff = shape_diff[:r.ndim]
+                        out_shape = [max((s + sd), 0)
+                                for s, sd in zip(r_vals[r].shape,
+                                                 r_shape_diff)]
+                        new_buf = numpy.zeros(
+                                shape=out_shape,
+                                dtype=r.dtype)
+                        new_buf += def_val
+                        if isinstance(r.type, CudaNdarrayType):
+                            new_buf = CudaNdarray(new_buf)
+                        wrong_size[r] = new_buf

-        _check_inputs(node, storage_map, r_vals, dr_vals, active_order_set,
-                      clobber_dr_vals=False,
-                      perform='%s with output %s' % (perform, name),
-                      warn_input_not_reused=False)
+                yield (name, wrong_size)
+                del wrong_size

-        _check_viewmap(node, storage_map)

-        for r in node.outputs:
-            if not r.type.values_eq_approx(r_vals[r], storage_map[r][0]):
-                # TODO: indicate it is not a C/Py problem
-                raise BadCLinkerOutput(r, val_py=r_vals[r],
-                                       val_c=storage_map[r][0])
+def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
+        storage_map, r_vals, dr_vals, perform, active_order_set):
+    '''Try to apply thunk() on different output storages'''

-        # Clear storage_map
-        for r in node.outputs:
-            storage_map[r][0] = None
+    # If node has an inner compiled Theano function with mode DebugMode,
+    # disable memory checks in that mode, since they were already run.
+    try:
+        changed_inner_mode = False
+        if type(getattr(node, 'op', None)) in ops_with_inner_function:
+            fn_attr_name = ops_with_inner_function[type(node.op)]
+            fn = getattr(node.op, fn_attr_name, None)
+            if (not fn
+                    or not hasattr(fn, 'maker')
+                    or not hasattr(fn.maker, 'mode')):
+                _logger.warn('Expected theano function not found in %s.%s',
+                        node.op, fn_attr_name)
+            else:
+                if isinstance(fn.maker.mode, DebugMode):
+                    backup_mode = fn.maker.mode
+                    new_mode = copy.copy(backup_mode)
+                    # Deactivate as many checks as possible
+                    new_mode.check_py_code = False
+                    new_mode.check_isfinite = False
+                    new_mode.require_matching_strides = 0
+                    new_mode.check_preallocated_output = []
+                    new_mode.stability_patience = 1
+                    fn.maker.mode = new_mode
+                    changed_inner_mode = True
+                    _logger.info('changing inner mode')
+
+        _logger.debug('starting preallocated output checking')
+        for (name, out_map) in _get_preallocated_maps(
+                node, thunk, prealloc_modes, def_val, storage_map, r_vals,
+                dr_vals, perform, active_order_set):
+            _logger.debug('  name = %s', name)
+
+            # Copy the inputs over, if they were marked as destroyed
+            dmap = getattr(node.op, 'destroy_map', {})
+            for i, r in enumerate(node.inputs):
+                if any(i in v for v in dmap.values()):
+                    storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])
+
+            # Get the appropriate output storages
+            # (no copy)
+            for r in node.outputs:
+                storage_map[r][0] = out_map.get(r, None)
+
+            thunk()
+
+            # Check outputs
+            for r in node.outputs:
+                if not r.type.is_valid_value(storage_map[r][0]):
+                    raise InvalidValueError(r, storage_map[r][0],
+                            hint='%s with %s output' % (perform, name),
+                            specific_hint=r.type.value_validity_msg(
+                            storage_map[r][0]))
+
+            _check_inputs(node, storage_map, r_vals, dr_vals, active_order_set,
+                          clobber_dr_vals=False,
+                          perform='%s with output %s' % (perform, name),
+                          warn_input_not_reused=False)
+
+            _check_viewmap(node, storage_map)
+
+            for r in node.outputs:
+                if not r.type.values_eq_approx(r_vals[r], storage_map[r][0]):
+                    # TODO: indicate it is not a C/Py problem
+                    raise BadCLinkerOutput(r, val_py=r_vals[r],
+                                           val_c=storage_map[r][0])
+
+            # Clear storage_map
+            for r in node.outputs:
+                storage_map[r][0] = None
+
+        _logger.debug('finished preallocated output checking')
+    finally:
+        if changed_inner_mode:
+            _logger.info('changing mode back')
+            fn.maker.mode = backup_mode


 class _EnvEvent(object):
@@ -1435,7 +1566,6 @@ class _Linker(gof.link.LocalLinker):
                            if r not in env.inputs]

        # Precompute some things for storage pre-allocation
-        prealloc_modes = config.DebugMode.check_preallocated_output.split(':')
        try:
            def_val = int(config.unittests.rseed)
        except ValueError:
@@ -1451,6 +1581,8 @@ class _Linker(gof.link.LocalLinker):
            # for now.
            #####
            _logger.debug("starting a DebugMode call")
+            _logger.debug("self.maker.mode.check_preallocated_output: %s",
+                    self.maker.mode.check_preallocated_output)
            for x in no_recycling:
                x[0] = None

@@ -1568,7 +1700,9 @@ class _Linker(gof.link.LocalLinker):
                            # clear the storage_map of outputs for the thunk_c
                            storage_map[r][0] = None

-                        if config.DebugMode.check_preallocated_output:
+                        if self.maker.mode.check_preallocated_output:
+                            prealloc_modes = \
+                                    self.maker.mode.check_preallocated_output
                            _logger.debug(
                                    '%i - calling _check_preallocated_output '
                                    'with thunk_py', i)
@@ -1592,7 +1726,8 @@ class _Linker(gof.link.LocalLinker):

                        clobber = True
                        if thunk_py:
-                            for r in node.inputs:
+                            dmap = getattr(node.op, 'destroy_map', {})
+                            for i, r in enumerate(node.inputs):
                                # if thunk_py ran, and we still got this far,
                                # it means that the destroy_map of the Op (and view_map) are
                                # accurate
@@ -1600,15 +1735,8 @@ class _Linker(gof.link.LocalLinker):
                                # fact not been destroyed.
                                # Therefore... we only need to overwrite inputs that *have*
                                # been marked as destroyed.
-
-                                #TODO: The following was tried on revision 6c613932a63c,
-                                # and made lots of tests fail, some complaining about
-                                # AttributeError: 'Env' object has no attribute 'destroyers'
-                                # some giving plain wrong numerical results.
-                                #if env.destroyers(r):
-                                #    storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])
-
-                                storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])
+                                if any(i in v for v in dmap.values()):
+                                    storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])

                            clobber = False

@@ -1655,7 +1783,9 @@ class _Linker(gof.link.LocalLinker):
                                r_vals[r] = storage_map[r][0]
                            storage_map[r][0] = None #clear the storage_map for the thunk_c

-                        if config.DebugMode.check_preallocated_output:
+                        if self.maker.mode.check_preallocated_output:
+                            prealloc_modes = \
+                                    self.maker.mode.check_preallocated_output
                            def thunk():
                                try:
                                    thunk_c()
@@ -2111,6 +2241,16 @@ class DebugMode(Mode):
    but is generally overly strict.) 0 no check, 1 warn, 2 err.
    """

+    check_preallocated_output = config.DebugMode.check_preallocated_output
+    check_preallocated_output = check_preallocated_output.split(':')
+    """
+    List of strings representing ways to pre-allocate output memory in
+    tests.  Valid values are: "previous" (previously-returned memory),
+    "c_contiguous", "f_contiguous", "strided" (positive and negative
+    strides), "wrong_size" (larger and smaller dimensions), and "ALL"
+    (all of the above).
+    """
+
    # This function will be used to create a FunctionMaker in
    # function_module.function
    def function_maker(self, i, o, m, *args, **kwargs):
@@ -2126,6 +2266,7 @@ class DebugMode(Mode):
            check_c_code=None,
            check_py_code=None,
            check_isfinite=None,
+            check_preallocated_output=None,
            require_matching_strides=None,
            linker=None):

@@ -2157,6 +2298,10 @@ class DebugMode(Mode):
        if check_isfinite is not None:
            self.check_isfinite = check_isfinite

+        if check_preallocated_output is not None:
+            # Copy to avoid sharing the same list across different instances
+            self.check_preallocated_output = check_preallocated_output[:]
+
        if require_matching_strides is not None:
            self.require_matching_strides = require_matching_strides

@@ -2164,4 +2309,8 @@ class DebugMode(Mode):
            raise ValueError('DebugMode has to check at least one of c and py '
                             'code')

+    def __str__(self):
+        return "DebugMode(linker=%s, optimizer=%s)" % (
+                self.provided_linker, self.provided_optimizer)
+
 register_mode('DEBUG_MODE', DebugMode(optimizer='fast_run'))
--- a/theano/compile/tests/test_debugmode.py
+++ b/theano/compile/tests/test_debugmode.py
@@ -264,7 +264,10 @@ def test_stochasticoptimization():
    try:
        theano.function([a, b],
                theano.tensor.add(a, b),
-                mode=debugmode.DebugMode(optimizer=opt, check_c_code=True))
+                mode=debugmode.DebugMode(
+                    optimizer=opt,
+                    check_c_code=True,
+                    stability_patience=max(2, config.DebugMode.patience)))
    except debugmode.StochasticOrder:
        return  # TEST PASS
    assert False

--- a/theano/gof/__init__.py
+++ b/theano/gof/__init__.py
@@ -18,7 +18,7 @@ from link import \
    Container, Linker, LocalLinker, PerformLinker, WrapLinker, WrapLinkerMany

 from op import \
-    Op, PureOp
+    Op, PureOp, ops_with_inner_function

 from opt import (Optimizer, optimizer, SeqOptimizer,
    MergeOptimizer, MergeOptMerge, 

--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -717,3 +717,17 @@ def get_debug_values(*args):
        return rval

    return [tuple(rval)]
+
+
+ops_with_inner_function = {}
+"""
+Registry of Ops that have an inner compiled Theano function.
+
+The keys are Op classes (not instances), and values are the name of the
+attribute that contains the function. For instance, if the function is
+self.fn, the value will be 'fn'.
+
+We need this registry to avoid running debug checks a number of times that
+is exponential in the nesting level of those ops.
+For instance, Scan will be registered here.
+"""
--- a/theano/sandbox/cuda/tests/test_blas.py
+++ b/theano/sandbox/cuda/tests/test_blas.py
@@ -37,11 +37,6 @@ def my_rand(*shape):
    return theano._asarray(numpy.random.rand(*shape), dtype='float32')


-def transpose(cuda_mat):
-    # The easiest way to transpose a cuda matrix for now
-    return tcn.dimshuffle(cuda_mat, [1, 0])
-
-
 def test_dot22():
    def cmp(a_shp, b_shp):
        a0 = my_rand(*a_shp)

--- a/theano/sandbox/cuda/type.py
+++ b/theano/sandbox/cuda/type.py
@@ -54,6 +54,11 @@ class CudaNdarrayType(Type):
    A cyclic dependency is avoided by not hardcoding this class.
    """

+    value_zeros = staticmethod(cuda.CudaNdarray.zeros)
+    """
+    Create a CudaNdarray full of 0 values
+    """
+
    def __init__(self, broadcastable, name=None, dtype=None):
        if dtype != None and dtype != 'float32':
            raise TypeError('%s only supports dtype float32 for now. Tried '

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -278,8 +278,8 @@ class Scan(PureOp):
                                           str(outer_mitsot),
                                           argoffset + idx,
                                           outer_mitsot.type.dtype,
-                                           otuer_mitsot.type.ndim,
-                                           str(inner_mitsot[ipos + k]),
+                                           outer_mitsot.type.ndim,
+                                           str(inner_mitsots[ipos + k]),
                                           inner_mitsots[ipos + k].type.dtype,
                                           inner_mitsots[ipos + k].type.ndim))
            ipos += len(itaps)
@@ -1676,6 +1676,11 @@ class Scan(PureOp):
        return final_outs


+# Since Scan is an op that contains a Theano compiled function, it is
+# useful to let DebugMode know about it.
+gof.ops_with_inner_function[Scan] = 'fn'
+
+
 @theano.compile.profilemode.register_profiler_printer
 def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
                    apply_time, apply_cimpl, message, outputs_size,

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -1024,6 +1024,13 @@ class TensorType(Type):
        else:
            return ()

+    def value_zeros(self, shape):
+        """
+        Create a numpy ndarray full of 0 values.
+        """
+        return numpy.zeros(shape, dtype=self.dtype)
+
+
 # Register CudaNdarrayType to the OutputGuard list of known types
 # to have OutputGuard generate C code for this type.
 theano.compile.mode.register_OutputGuard_c_code(TensorType)

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -742,34 +742,45 @@ class Elemwise(Op):
                    raise ValueError('\n'.join(msg_chunks))
                else:
                    raise ValueError(base_exc_str)
-                # Other mismatches will be caught by the ufunc
+
+        # Determine the shape of outputs
+        out_shape = []
+        for values in zip(*[input.shape for input in inputs]):
+            if numpy.prod(values) == 0:
+                # All non-broadcasted dimensions should be zero
+                assert max(values) <= 1
+                out_shape.append(0)
+            else:
+                out_shape.append(max(values))
+        out_shape = tuple(out_shape)

        if not self.inplace_pattern:
            for output, storage in zip(node.outputs, output_storage):
                odat = storage[0]
-                shape = [max(values)
-                        for values in zip(*[input.shape for input in inputs])]
                if odat is not None:
-                    # reuse storage if we can
-                    odat.resize(shape, refcheck=0)
-                else:
-                    odat = numpy.ndarray(shape, dtype=output.type.dtype)
+                    if odat.shape != out_shape:
+                        # It is unsafe to try to resize odat,
+                        # we have to allocate output storage.
+                        odat = None
+                if odat is None:
+                    odat = numpy.ndarray(out_shape, dtype=output.type.dtype)
                storage[0] = odat
        else:
-            for i, (output, storage) in enumerate(zip(node.outputs,
-                    output_storage)):
+            for i, (output, storage) in enumerate(
+                    zip(node.outputs, output_storage)):
                #i is an output idx
                if i in self.inplace_pattern:
                    odat = inputs[self.inplace_pattern[i]]
                else:
                    odat = storage[0]
-                    shape = [max(values)
-                             for values in zip(*[input.shape
-                                 for input in inputs])]
                    if odat is not None:
-                        odat.resize(shape, refcheck=0)
-                    else:
-                        odat = numpy.ndarray(shape, dtype=output.type.dtype)
+                        if odat.shape != out_shape:
+                            # It is unsafe to try to resize odat,
+                            # we have to allocate output storage.
+                            odat = None
+                    if odat is None:
+                        odat = numpy.ndarray(out_shape,
+                                dtype=output.type.dtype)
                storage[0] = odat

        ufunc_args = inputs  # + output_storage
@@ -825,21 +836,16 @@ class Elemwise(Op):
                # always return an ndarray with dtype object
                variable = numpy.asarray(variable, dtype=nout.dtype)

-            if (hasattr(variable, 'shape')
-                    and storage[0].shape != variable.shape):
-                if numpy.prod(variable.shape) == 0:
-                    # numpy don't resize from a shape (1,5) to (0,5)
-                    # This bypass the inplace...
-                    # But I it is important in this case.
-                    storage[0] = variable
-                    continue
-                storage[0].resize(variable.shape)
-
-            if storage[0].shape:
-                storage[0][:] = variable
+            # The storage has been resized earlier.
+            if hasattr(variable, 'shape'):
+                assert storage[0].shape == variable.shape
            else:
-                storage[0].itemset(variable)
+                # If variable has no shape, then it is a scalar.
+                assert numpy.prod(storage[0].shape) == 1
+
+            storage[0][...] = variable
            assert str(storage[0].dtype) != 'object'
+
        # the following should be used instead of the previous loop,
        # unfortunately it tends to segfault
        # self.ufunc(*(ufunc_args+[s[0] for s in output_storage]))

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -521,7 +521,7 @@ class MakeVector(T.Op):
    def perform(self, node, inputs, out_):
        out, = out_
        # not calling theano._asarray as optimization
-        if out[0] is None:
+        if (out[0] is None) or (out[0].size != len(inputs)):
            out[0] = theano._asarray(inputs, dtype=node.outputs[0].dtype)
        else:
            # assume that out has correct dtype. there is no cheap way to check