Merge pull request #3380 from nouiz/mixed2

Mixed2

Merge pull request #3380 from nouiz/mixed2
d219054e · abergeron · 88eac16c · 2e4f475a · d219054e · d219054e
--- a/doc/library/sandbox/blocksparse.txt
+++ b/doc/library/sandbox/blocksparse.txt
 .. _libdoc_blocksparse:
-===================================================================
+===========================================================================
 :mod:`sandbox.blocksparse` --  Block sparse dot operations (gemv and outer)
-===================================================================
+===========================================================================
 .. module:: sandbox.blocksparse
   :platform: Unix, Windows

--- a/doc/library/sandbox/cuda/dnn.txt
+++ b/doc/library/sandbox/cuda/dnn.txt
@@ -24,6 +24,13 @@ There are at least three possible ways of doing so:
  ``LD_LIBRARY_PATH``, ``LIBRARY_PATH`` and ``CPATH`` to the directory
  extracted from the download. If needed, separate multiple directories
  with ``:`` as in the ``PATH`` environment variable.
+  example::
+      export LD_LIBRARY_PATH=/home/user/path_to_CUDNN_folder/lib64:$LD_LIBRARY_PATH
+      export CPATH=/home/user/path_to_CUDNN_folder/include:$CPATH
+      export LIBRARY_PATH=/home/user/path_to_CUDNN_folder/lib64:$LIBRARY_PATH
 - And as a third way, also on Linux, you can copy the ``*.h`` files
  to ``/usr/include`` and the ``*.so*`` files to ``/lib64``.

--- a/doc/library/sandbox/cuda/op.txt
+++ b/doc/library/sandbox/cuda/op.txt
@@ -19,6 +19,7 @@ Blas Op
 .. automodule:: theano.sandbox.cuda.blas
    :members:
+.. autofunction:: theano.sandbox.cuda.blas.batched_dot
 Nnet Op
 =======

--- a/theano/compile/builders.py
+++ b/theano/compile/builders.py
@@ -78,8 +78,8 @@ class OpFromGraph(gof.Op):
            if not isinstance(i, gof.Variable):
                raise TypeError(
                    'inputs and outputs must be Variable instances', i)
-        if 'updates' in kwargs:
+        if 'updates' in kwargs or 'givens' in kwargs:
-            raise TypeError('updates are not allowed in kwargs')
+            raise TypeError('updates and givens are not allowed in kwargs')
        # To support correctly shared variables the inner fct should
        # not see them. Otherwise their is problem with the gradient.

--- a/theano/compile/nanguardmode.py
+++ b/theano/compile/nanguardmode.py
+from __future__ import print_function
 import collections
 import logging
+from six.moves import StringIO
 import numpy as np
 import theano
-from theano.configparser import config, AddConfigVar, BoolParam
+from theano.configparser import config, AddConfigVar, BoolParam, EnumStr
 import theano.tensor as T
 import theano.sandbox.cuda as cuda
 from theano.compile import Mode
@@ -24,6 +26,11 @@ AddConfigVar('NanGuardMode.big_is_error',
             BoolParam(True),
             in_c_key=False)
+AddConfigVar('NanGuardMode.action',
+             "What NanGuardMode does when it finds a problem",
+             EnumStr('raise', 'warn', 'pdb'),
+             in_c_key=False)
 logger = logging.getLogger("theano.compile.nanguardmode")
@@ -55,13 +62,15 @@ def flatten(l):
    return rval
-def contains_nan(arr):
+def contains_nan(arr, node=None):
    """
    Test whether a numpy.ndarray contains any `np.nan` values.
    Parameters
    ----------
-    arr : np.ndarray
+    arr : np.ndarray or output of any Theano op
+    node : None or an Apply instance.
+        If arr is the output of a Theano op, the node associated to it.
    Returns
    -------
@@ -80,16 +89,31 @@ def contains_nan(arr):
        return False
    elif isinstance(arr, np.random.mtrand.RandomState):
        return False
+    elif arr.size == 0:
+        return False
+    elif cuda.cuda_available and isinstance(arr, cuda.CudaNdarray):
+        if (hasattr(theano.sandbox, 'rng_mrg') and
+            isinstance(
+                node.op,
+                # It stores ints in a float container
+                theano.sandbox.rng_mrg.GPU_mrg_uniform)):
+            return False
+        else:
+            compile_gpu_func(True, False, False)
+            return np.isnan(f_gpumin(arr.reshape(arr.size)))
    return np.isnan(np.min(arr))
-def contains_inf(arr):
+def contains_inf(arr, node=None):
    """
    Test whether a numpy.ndarray contains any `np.inf` values.
    Parameters
    ----------
-    arr : np.ndarray
+    arr : np.ndarray or output of any Theano op
+    node : None or an Apply instance.
+        If arr is the output of a Theano op, the node associated to it.
    Returns
    -------
@@ -109,8 +133,69 @@ def contains_inf(arr):
        return False
    elif isinstance(arr, np.random.mtrand.RandomState):
        return False
+    elif arr.size == 0:
+        return False
+    elif cuda.cuda_available and isinstance(arr, cuda.CudaNdarray):
+        if (hasattr(theano.sandbox, 'rng_mrg') and
+            isinstance(
+                node.op,
+                # It stores ints in a float container
+                theano.sandbox.rng_mrg.GPU_mrg_uniform)):
+            return False
+        else:
+            compile_gpu_func(False, True, False)
+            return (np.isinf(f_gpumin(arr.reshape(arr.size))) or
+                    np.isinf(f_gpumax(arr.reshape(arr.size))))
    return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
+f_gpumin = None
+f_gpumax = None
+f_gpuabsmax = None
+def compile_gpu_func(nan_is_error, inf_is_error, big_is_error):
+    """ compile utility function used by contains_nan and contains_inf
+    """
+    global f_gpumin, f_gpumax, f_gpuabsmax
+    if not cuda.cuda_available:
+        return
+    guard_input = cuda.fvector('nan_guard')
+    cuda_compile_failed = False
+    if (nan_is_error or inf_is_error) and f_gpumin is None:
+        try:
+            f_gpumin = theano.function(
+                [guard_input], T.min(guard_input),
+                mode='FAST_RUN'
+            )
+        except RuntimeError:
+            # This can happen if cuda is available, but the
+            # device is in exclusive mode and used by another
+            # process.
+            cuda_compile_failed = True
+    if inf_is_error and not cuda_compile_failed and f_gpumax is None:
+        try:
+            f_gpumax = theano.function(
+                [guard_input], T.max(guard_input),
+                mode='FAST_RUN'
+            )
+        except RuntimeError:
+            # This can happen if cuda is available, but the
+            # device is in exclusive mode and used by another
+            # process.
+            cuda_compile_failed = True
+    if big_is_error and not cuda_compile_failed and f_gpuabsmax is None:
+        try:
+            f_gpuabsmax = theano.function(
+                [guard_input], T.max(T.abs_(guard_input)),
+                mode='FAST_RUN'
+                )
+        except RuntimeError:
+            # This can happen if cuda is available, but the
+            # device is in exclusive mode and used by another
+            # process.
+            cuda_compile_failed = True
 class NanGuardMode(Mode):
    """
@@ -137,7 +222,6 @@ class NanGuardMode(Mode):
    def __init__(self, nan_is_error=None, inf_is_error=None, big_is_error=None,
                 optimizer=None, linker=None):
        self.provided_optimizer = optimizer
-        cuda_compile_failed = False
        if nan_is_error is None:
            nan_is_error = config.NanGuardMode.nan_is_error
        if inf_is_error is None:
@@ -146,42 +230,7 @@ class NanGuardMode(Mode):
            big_is_error = config.NanGuardMode.big_is_error
        assert nan_is_error or inf_is_error or big_is_error
+        compile_gpu_func(nan_is_error, inf_is_error, big_is_error)
-        if cuda.cuda_available:
-            self.guard_input = cuda.fvector('nan_guard')
-            if nan_is_error or inf_is_error:
-                try:
-                    self.gpumin = theano.function(
-                        [self.guard_input], T.min(self.guard_input),
-                        mode='FAST_RUN'
-                    )
-                except RuntimeError:
-                    # This can happen if cuda is available, but the
-                    # device is in exclusive mode and used by another
-                    # process.
-                    cuda_compile_failed = True
-            if inf_is_error and not cuda_compile_failed:
-                try:
-                    self.gpumax = theano.function(
-                        [self.guard_input], T.max(self.guard_input),
-                        mode='FAST_RUN'
-                    )
-                except RuntimeError:
-                    # This can happen if cuda is available, but the
-                    # device is in exclusive mode and used by another
-                    # process.
-                    cuda_compile_failed = True
-            if big_is_error and not cuda_compile_failed:
-                try:
-                    self.gpuabsmax = theano.function(
-                        [self.guard_input], T.max(T.abs_(self.guard_input)),
-                        mode='FAST_RUN'
-                    )
-                except RuntimeError:
-                    # This can happen if cuda is available, but the
-                    # device is in exclusive mode and used by another
-                    # process.
-                    cuda_compile_failed = True
        def do_check_on(var, nd, f, is_input):
            """
@@ -203,32 +252,21 @@ class NanGuardMode(Mode):
            """
            error = False
+            sio = StringIO()
            if nan_is_error:
-                err = False
+                if contains_nan(var, nd):
-                if cuda.cuda_available and isinstance(var, cuda.CudaNdarray):
+                    print('NaN detected', file=sio)
-                    if not isinstance(nd.op,
-                                      # It store ints in float container
-                                      theano.sandbox.rng_mrg.GPU_mrg_uniform):
-                        err = np.isnan(self.gpumin(var.reshape(var.size)))
-                else:
-                    err = contains_nan(var)
-                if err:
-                    logger.error('NaN detected')
                    error = True
            if inf_is_error:
-                err = False
+                if contains_inf(var, nd):
-                if cuda.cuda_available and isinstance(var, cuda.CudaNdarray):
+                    print('Inf detected', file=sio)
-                    err = (np.isinf(self.gpumin(var.reshape(var.size))) or
-                           np.isinf(self.gpumax(var.reshape(var.size))))
-                else:
-                    err = contains_inf(var)
-                if err:
-                    logger.error('Inf detected')
                    error = True
            if big_is_error:
                err = False
-                if cuda.cuda_available and isinstance(var, cuda.CudaNdarray):
+                if var.size == 0:
-                    err = (self.gpuabsmax(var.reshape(var.size)) > 1e10)
+                    err = False
+                elif cuda.cuda_available and isinstance(var, cuda.CudaNdarray):
+                    err = (f_gpuabsmax(var.reshape(var.size)) > 1e10)
                elif isinstance(var, theano.gof.type.CDataType._cdata_type):
                    err = False
                elif isinstance(var, np.random.mtrand.RandomState):
@@ -236,23 +274,29 @@ class NanGuardMode(Mode):
                else:
                    err = (np.abs(var).max() > 1e10)
                if err:
-                    logger.error('Big value detected')
+                    print('Big value detected', file=sio)
                    error = True
            if error:
-                if is_input:
+                if not is_input:
-                    logger.error('In an input')
+                    print("NanGuardMode found an error in the"
+                          " output of a node in this variable:", file=sio)
+                    print(theano.printing.debugprint(nd, file='str'), file=sio)
                else:
-                    logger.error('In an output')
+                    print("NanGuardMode found an error in an"
-                logger.error('Inputs: ')
+                          " input of this node.", file=sio)
-                for ivar, ival in zip(nd.inputs, f.inputs):
+                    print('Node:', file=sio)
-                    logger.error('var')
+                    print(nd, file=sio)
-                    logger.error(ivar)
+                    print("The input variable that cause problem:", file=sio)
-                    logger.error(theano.printing.min_informative_str(ivar))
+                    print(theano.printing.debugprint(nd, file='str'), file=sio)
-                    logger.error('val')
+                msg = sio.getvalue()
-                    logger.error(ival)
+                if config.NanGuardMode.action == 'raise':
-                logger.error('Node:')
+                    raise AssertionError(msg)
-                logger.error(nd)
+                elif config.NanGuardMode.action == 'pdb':
-                assert False
+                    print(msg)
+                    import pdb
+                    pdb.set_trace()
+                elif config.NanGuardMode.action == 'warn':
+                    logger.error(msg)
        def nan_check(i, node, fn):
            """
@@ -270,14 +314,16 @@ class NanGuardMode(Mode):
            """
            inputs = fn.inputs
-            # TODO: figure out why individual inputs are themselves lists
+            for x, var in zip(inputs, node.inputs):
-            # sometimes
+                # If the input is the result of computation, then we
-            for x in flatten(inputs):
+                # don't need to check it. It is already done after the
-                do_check_on(x, node, fn, True)
+                # computation.
+                if var.owner is not None:
+                    do_check_on(x[0], node, fn, True)
            fn()
            outputs = fn.outputs
-            for j, x in enumerate(flatten(outputs)):
+            for x in outputs:
-                do_check_on(x, node, fn, False)
+                do_check_on(x[0], node, fn, False)
        wrap_linker = theano.gof.WrapLinker([theano.gof.OpWiseCLinker()],
                                            nan_check)

--- a/theano/gof/link.py
+++ b/theano/gof/link.py
@@ -302,8 +302,15 @@ def raise_with_op(node, thunk=None, exc_info=None, storage_map=None):
            "HINT: Use the Theano flag 'exception_verbosity=high'"
            " for a debugprint and storage map footprint of this apply node.")
-    exc_value = exc_type(str(exc_value) + detailed_err_msg +
+    try:
-                         '\n' + '\n'.join(hints))
+        exc_value = exc_type(str(exc_value) + detailed_err_msg +
+                             '\n' + '\n'.join(hints))
+    except TypeError:
+        print("WARNING: %s error does not allow us to add extra error message" %
+              str(exc_type))
+        # Some exceptions need extra parameters in their constructor, so
+        # forget the extra long error message in that case.
+        pass
    reraise(exc_type, exc_value, exc_trace)

--- a/theano/ifelse.py
+++ b/theano/ifelse.py
@@ -395,7 +395,13 @@ def ifelse(condition, then_branch, else_branch, name=None):
 @gof.local_optimizer([IfElse])
 def cond_make_inplace(node):
    op = node.op
-    if isinstance(op, IfElse) and not op.as_view:
+    if (isinstance(op, IfElse) and
+        not op.as_view and
+        # For big graphs, do not make scalar outputs inplace, to speed up
+        # optimization.
+        (len(node.fgraph.apply_nodes) < 500 or
+         not all([getattr(o.type, 'ndim', -1) == 0
+                  for o in node.outputs]))):
        return IfElse(n_outs=op.n_outs,
                      as_view=True,
                      gpu=op.gpu,

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -14,8 +14,8 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
                                           gpu_contiguous)
 from theano.tensor import as_tensor_variable
-class BatchedDotOp(GpuOp):
+class BatchedDotOp(GpuOp):
    __props__ = ()
    def make_node(self, inp1, inp2):
@@ -213,6 +213,10 @@ class BatchedDotOp(GpuOp):
        return (1,)
 batched_dot = BatchedDotOp()
+"""
+Call cublasSgemmBatched. Takes two 3D tensors as input.
+"""
 class GpuDot22(GpuOp):
    """

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -81,20 +81,28 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
                                         " from one version, but we link with"
                                         " a different version %s" % str(v))
                    raise RuntimeError(dnn_available.msg)
-                if version() == -1:
+                if v == -1:
                    dnn_available.avail = False
                    dnn_available.msg = (
                        "CuDNN v1 detected. This version is no longer "
                        "supported by Theano. Update your CuDNN installation "
                        "to a more recent version")
                    raise RuntimeError(dnn_available.msg)
-                if version() == (20, 20):
+                if v == (20, 20):
                    dnn_available.avail = False
                    dnn_available.msg = (
                        "You have installed a release candidate of CuDNN v2."
                        " This isn't supported anymore."
                        " Update to CuDNN v2 final version.")
                    raise RuntimeError(dnn_available.msg)
+                if v[0] >= 3000 and v[0] < 3007:
+                    # 3007 is the final release of cudnn v3
+                    dnn_available.avail = False
+                    dnn_available.msg = (
+                        "You have installed a release candidate of CuDNN v3."
+                        " This isn't supported anymore."
+                        " Update to CuDNN v3 final version.")
+                    raise RuntimeError(dnn_available.msg)
    return dnn_available.avail
@@ -2380,8 +2388,7 @@ if True:
              isinstance(node.inputs[0].owner.op, HostFromGpu)) or
             (node.inputs[1].owner and
                 isinstance(node.inputs[1].owner.op, HostFromGpu)))):
-            if not dnn_available() or version() != (2000, 2000):
+            if not dnn_available():
-                # Softmax grad is broken in v3 rc1 for this case
                return
            ins = []
            for n in node.inputs:

--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
@@ -66,7 +66,7 @@ class NaiveAlgo(object):
    def cache_version(self):
        ver = self.scalar_op.c_code_cache_version()
        if ver:
-            return (19, self.verbose, self.sync, ver)
+            return (20, self.verbose, self.sync, ver)
        else:
            return ver
@@ -86,7 +86,9 @@ class NaiveAlgo(object):
    def c_src_kernel(self, node, nodename, nd):
        sio = StringIO()
        # print 'C_SRC_KERNEL', sio.getvalue()
+        print("// %s" % str(node.op), file=sio)
+        print("// node.op.destroy_map=%s" % str(
+            getattr(node.op, 'destroy_map', None)), file=sio)
        for ipos, i in enumerate(node.inputs):
            print("//    Input  ", ipos, str(i.type), file=sio)
        for ipos, i in enumerate(node.outputs):
@@ -202,6 +204,9 @@ class NaiveAlgo(object):
        if nd in (4,):
            # print some leading comments to make the code easier to read
+            print("// %s" % str(node.op), file=sio)
+            print("// node.op.destroy_map=%s" % str(
+                getattr(node.op, 'destroy_map', None)), file=sio)
            for ipos, i in enumerate(node.inputs):
                print("//    Input  ", ipos, str(i.type), file=sio)
            for ipos, i in enumerate(node.outputs):
@@ -307,6 +312,9 @@ class NaiveAlgo(object):
            return sio.getvalue()
        # print some leading comments to make the code easier to read
+        print("// %s" % str(node.op), file=sio)
+        print("// node.op.destroy_map=%s" % str(
+            getattr(node.op, 'destroy_map', None)), file=sio)
        for ipos, i in enumerate(node.inputs):
            print("//    Input  ", ipos, str(i.type), file=sio)
        for ipos, i in enumerate(node.outputs):
@@ -456,6 +464,9 @@ class NaiveAlgo(object):
        sio = StringIO()
        # print 'C_SRC_KERNEL', sio.getvalue()
+        print("// %s" % str(node.op), file=sio)
+        print("// node.op.destroy_map=%s" % str(
+            getattr(node.op, 'destroy_map', None)), file=sio)
        for ipos, i in enumerate(node.inputs):
            print("//    Input  ", ipos, str(i.type), file=sio)
        for ipos, i in enumerate(node.outputs):

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -795,6 +795,11 @@ def local_gpu_careduce(node):
            replace = False
            if x.owner and isinstance(x.owner.op, HostFromGpu):
                replace = True
+            # If this is a useless reduce, remove it as
+            # local_cut_useless_reduce does.  This is needed as the code
+            # below does not support x.ndim == 0.
+            if x.type == node.outputs[0].type:
+                return [x]
            elif (all([c != "output" and isinstance(c.op, GpuFromHost)
                      for c, i in node.outputs[0].clients])
                  and x.owner and x.owner.op.__class__ in

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -296,6 +296,12 @@ def inplace_elemwise_optimizer_op(OP):
            # gpuarray GpuElemwise inherit from Elemwise
            if not type(op) == OP:
                continue
+            # If the graph is big and the outputs are scalars, do not make
+            # the node inplace.
+            if (check_each_change != 1 and
+                all([getattr(o.type, 'ndim', -1) == 0
+                     for o in node.outputs])):
+                continue
            baseline = op.inplace_pattern
            protected_inputs = [
@@ -4188,28 +4194,29 @@ def local_sum_prod_mul_by_scalar(node):
    """
    # TODO: if the the thing inside the Sum is a division,
    # we should get at the numerator....
-    if isinstance(node.op, T.Sum) or isinstance(node.op, T.elemwise.Prod):
+    if isinstance(node.op, (T.Sum, T.elemwise.Prod)):
        node_inps, = node.inputs
        if node_inps.owner and node_inps.owner.op == T.mul:
            terms = node_inps.owner.inputs
            scalars = [t.dimshuffle() for t in terms if
                       numpy.all(t.type.broadcastable)]
-            non_scalars = [t for t in terms if not numpy.all(t.broadcastable)]
            if len(scalars) == 0:
                # Nothing to optimize here
                return
+            non_scalars = [t for t in terms if not numpy.all(t.broadcastable)]
            # Perform the op only on the non-scalar inputs, if applicable
            if len(non_scalars) == 0:
                new_op_input_nb_elements = 1
                new_op_output = 1
            elif len(non_scalars) == 1:
-                new_op_input_nb_elements = T.prod(non_scalars[0].shape)
+                new_op_input_nb_elements = non_scalars[0].size
                new_op_output = node.op(non_scalars[0])
            else:
                new_op_input = T.mul(*non_scalars)
-                new_op_input_nb_elements = T.prod(new_op_input.shape)
+                new_op_input_nb_elements = new_op_input.size
                new_op_output = node.op(new_op_input)
            # If node.op is a T.elemwise.Prod, then the scalars need to be
@@ -4226,7 +4233,10 @@ def local_sum_prod_mul_by_scalar(node):
            if new_op_input_nb_elements != 1:
                mul_inputs.append(new_op_output)
-            return [T.mul(*mul_inputs)]
+            if len(mul_inputs) == 1:
+                return mul_inputs
+            else:
+                return [T.mul(*mul_inputs)]
        if isinstance(node.op, T.Sum) and node_inps.owner and node_inps.owner.op == T.neg:
            return [T.neg(node.op(node_inps.owner.inputs[0]))]
@@ -4453,25 +4463,25 @@ def local_sum_prod_div_dimshuffle(node):
                    if isinstance(node.op, T.Sum):
                        op_on_compatible_dims = T.sum(
                            numerator, axis=compatible_dims)
-                        div_op = T.true_div(
+                        rval = T.true_div(
                            op_on_compatible_dims,
                            optimized_dimshuffle)
-                        op_on_incompatible_dims = T.sum(
+                        if len(reordered_incompatible_dims) > 0:
-                            div_op,
+                            rval = T.sum(rval,
-                            axis=reordered_incompatible_dims)
+                                         axis=reordered_incompatible_dims)
                    elif isinstance(node.op, T.elemwise.Prod):
                        op_on_compatible_dims = T.prod(
                            numerator, axis=compatible_dims)
                        dtype = numerator.dtype
-                        div_op = T.true_div(
+                        rval = T.true_div(
                            op_on_compatible_dims,
                            (optimized_dimshuffle **
                                T.prod([numerator.shape[ax].astype(dtype)
                                        for ax in compatible_dims])))
-                        op_on_incompatible_dims = T.prod(
+                        if len(reordered_incompatible_dims) > 0:
-                            div_op,
+                            rval = T.prod(rval,
-                            axis=reordered_incompatible_dims)
+                                          axis=reordered_incompatible_dims)
-                    return [op_on_incompatible_dims]
+                    return [rval]
 @register_canonicalize

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -4810,7 +4810,7 @@ class T_local_sum_prod(unittest.TestCase):
        # Case 2
        test_reduction_opt([vect, scalar1], [v_val, s1_val], T.elemwise.Prod,
-                           (s1_val * v_val).prod(), 2)
+                           (s1_val * v_val).prod(), 1)
        # Case 3
        test_reduction_opt([vect, mat, scalar1], [v_val, m_val, s1_val],
@@ -4823,7 +4823,7 @@ class T_local_sum_prod(unittest.TestCase):
        # Case 5
        test_reduction_opt([vect, scalar1, scalar2], [v_val, s1_val, s2_val],
                           T.elemwise.Prod, (s1_val * s2_val * v_val).prod(),
-                           2)
+                           1)
        # Case 6
        test_reduction_opt([vect, mat, scalar1, scalar2],

--- a/theano/tensor/var.py
+++ b/theano/tensor/var.py
@@ -280,7 +280,8 @@ class _tensor_py_operators:
    shape = property(lambda self: theano.tensor.basic.shape(self))
-    size = property(lambda self: theano.tensor.basic.prod(self.shape))
+    size = property(lambda self: self.shape[0] if self.ndim == 1 else
+                    theano.tensor.basic.prod(self.shape))
    # We can't implement __len__ to provide a better error message.
    def any(self, axis=None, keepdims=False):

--- a/theano/tests/test_flake8.py
+++ b/theano/tests/test_flake8.py
@@ -30,7 +30,6 @@ whitelist_flake8 = [
    "tests/test_gradient.py",
    "tests/test_config.py",
    "tests/diverse_tests.py",
-    "tests/test_ifelse.py",
    "tests/test_rop.py",
    "tests/test_2nd_order_grads.py",
    "tests/run_tests_in_batch.py",

--- a/theano/tests/test_ifelse.py
+++ b/theano/tests/test_ifelse.py
@@ -3,20 +3,22 @@
 """
 from __future__ import print_function
-__docformat__ = 'restructedtext en'
-__authors__ = ("Razvan Pascanu ")
-__copyright__ = "(c) 2010, Universite de Montreal"
-__contact__ = "Razvan Pascanu <r.pascanu@gmail>"
 import unittest
 import numpy
 from nose.plugins.skip import SkipTest
+from six.moves import reduce
 import theano
 from theano import tensor
 import theano.ifelse
 from theano.ifelse import IfElse, ifelse
-from theano.tests  import unittest_tools as utt
+from theano.tests import unittest_tools as utt
+__docformat__ = 'restructedtext en'
+__authors__ = ("Razvan Pascanu ")
+__copyright__ = "(c) 2010, Universite de Montreal"
+__contact__ = "Razvan Pascanu <r.pascanu@gmail>"
 class test_ifelse(unittest.TestCase, utt.TestOptimizationMixin):
@@ -51,6 +53,32 @@ class test_ifelse(unittest.TestCase, utt.TestOptimizationMixin):
        assert numpy.allclose(vx, f(1, vx, vy))
        assert numpy.allclose(vy, f(0, vx, vy))
+    def test_not_lazy_if_inplace(self):
+        # Tests that if the outputs are scalars and the graph is big,
+        # we disable the inplace opt to speed up optimization
+        x = tensor.vector('x', dtype=self.dtype)
+        y = tensor.vector('y', dtype=self.dtype)
+        c = tensor.iscalar('c')
+        mode = theano.compile.get_mode(self.mode).excluding(
+            # Disable many optimizations to keep the graph big enough to
+            # disable the inplace optimization.
+            'fusion', 'local_add_canonizer',
+            'inplace', 'constant_folding', 'constant_folding')
+        y2 = reduce(lambda x, y: x + y, [y] + list(range(200)))
+        f = theano.function([c, x, y], ifelse(c, x, y2), mode=mode)
+        # For not inplace ifelse
+        self.assertFunctionContains1(f, IfElse(1))
+        rng = numpy.random.RandomState(utt.fetch_seed())
+        xlen = rng.randint(200)
+        ylen = rng.randint(200)
+        vx = numpy.asarray(rng.uniform(size=(xlen,)), self.dtype)
+        vy = numpy.asarray(rng.uniform(size=(ylen,)), self.dtype)
+        assert numpy.allclose(vx, f(1, vx, vy))
+        assert numpy.allclose(vy + sum(range(200)), f(0, vx, vy))
    def test_mixed_dtype(self):
        x1 = tensor.vector('x1', dtype='int32')
        x2 = tensor.vector('x2', dtype=self.dtype)
@@ -65,9 +93,9 @@ class test_ifelse(unittest.TestCase, utt.TestOptimizationMixin):
        xlen = rng.randint(200)
        ylen = rng.randint(200)
-        vx1 = numpy.asarray(rng.uniform(size=(xlen,))*3, 'int32')
+        vx1 = numpy.asarray(rng.uniform(size=(xlen,)) * 3, 'int32')
        vx2 = numpy.asarray(rng.uniform(size=(xlen,)), self.dtype)
-        vy1 = numpy.asarray(rng.uniform(size=(ylen,))*3, 'int32')
+        vy1 = numpy.asarray(rng.uniform(size=(ylen,)) * 3, 'int32')
        vy2 = numpy.asarray(rng.uniform(size=(ylen,)), self.dtype)
        o1, o2 = f(1, vx1, vx2, vy1, vy2)
@@ -288,8 +316,8 @@ class test_ifelse(unittest.TestCase, utt.TestOptimizationMixin):
        z2 = ifelse(c, x + 2, y + 2)
        z = z1 + z2
        f = theano.function([c, x, y], z)
-        assert len([x for x in f.maker.fgraph.toposort()
+        assert len([n for n in f.maker.fgraph.toposort()
-                    if isinstance(x.op, IfElse)]) == 1
+                    if isinstance(n.op, IfElse)]) == 1
    def test_remove_useless_inputs1(self):
        raise SkipTest("Optimization temporarily disabled")
@@ -299,8 +327,8 @@ class test_ifelse(unittest.TestCase, utt.TestOptimizationMixin):
        z = ifelse(c, (x, x), (y, y))
        f = theano.function([c, x, y], z)
-        ifnode = [x for x in f.maker.fgraph.toposort()
+        ifnode = [n for n in f.maker.fgraph.toposort()
-                  if isinstance(x.op, IfElse)][0]
+                  if isinstance(n.op, IfElse)][0]
        assert len(ifnode.inputs) == 3
    def test_remove_useless_inputs2(self):
@@ -418,12 +446,12 @@ class test_ifelse(unittest.TestCase, utt.TestOptimizationMixin):
        c = tensor.iscalar('c')
        out = ifelse(c,
-            ifelse(c, x1, x2) + ifelse(c, y1, y2) + w1,
+                     ifelse(c, x1, x2) + ifelse(c, y1, y2) + w1,
-            ifelse(c, x1, x2) + ifelse(c, y1, y2) + w2)
+                     ifelse(c, x1, x2) + ifelse(c, y1, y2) + w2)
        f = theano.function([x1, x2, y1, y2, w1, w2, c], out,
                            allow_input_downcast=True)
        assert len([x for x in f.maker.fgraph.toposort()
-                if isinstance(x.op, IfElse)]) == 1
+                    if isinstance(x.op, IfElse)]) == 1
        rng = numpy.random.RandomState(utt.fetch_seed())
        vx1 = rng.uniform()