Commit fd7875ad authored by bscellier, committed by GitHub

Merge branch 'master' into import_numpy_gpuarray

......@@ -13,5 +13,5 @@ echo "===== Testing theano core"
# Test theano core
PARTS="theano -e cuda -e gpuarray"
THEANO_PARAM="${PARTS} --with-timer --timer-top-n 10 --with-xunit --xunit-file=theanocore_tests.xml"
FLAGS="mode=FAST_RUN,floatX=float32"
FLAGS="mode=FAST_RUN,floatX=float32,on_opt_error=raise,on_shape_error=raise"
THEANO_FLAGS=${FLAGS} bin/theano-nose ${THEANO_PARAM}
......@@ -76,5 +76,5 @@ THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_parallel \
theano/sandbox/tests/test_rng_mrg.py:test_GPUA_full_fill \
theano/scan_module/tests/test_scan.py:T_Scan_Gpuarray"
FLAGS="init_gpu_device=$DEVICE,gpuarray.preallocate=1000,mode=FAST_RUN"
FLAGS="init_gpu_device=$DEVICE,gpuarray.preallocate=1000,mode=FAST_RUN,on_opt_error=raise,on_shape_error=raise"
THEANO_FLAGS=${FLAGS} time nosetests -v --with-xunit --xunit-file=theanogpuarray_tests.xml ${THEANO_GPUARRAY_TESTS}
......@@ -5,11 +5,11 @@ import os
import sys
if sys.platform == 'win32':
config_cxx = 'cxx='
config_for_theano_cache_script = 'cxx=,device=cpu'
theano_flags = os.environ['THEANO_FLAGS'] if 'THEANO_FLAGS' in os.environ else ''
if theano_flags:
theano_flags += ','
theano_flags += config_cxx
theano_flags += config_for_theano_cache_script
os.environ['THEANO_FLAGS'] = theano_flags
import theano
......
......@@ -64,11 +64,18 @@ The documentation will be automatically regenerated in the next few hours.
Generate and upload the package
===============================
For release candidates, only upload on PyPI.
On PyPI
-------
Set your umask to ``0022`` to ensure that the package file will be readable by other people.
To check your umask::
umask
To set your umask::
umask 0022
Now change ``ISRELEASED`` in ``setup.py`` to ``True``.
Finally, use setuptools to register and upload the release::
......@@ -84,8 +91,8 @@ UnicodeDecodeError if there are non-ASCII characters in NEWS.txt. You
would need to change NEWS.txt so it contains only ASCII characters (the
problem usually comes from diacritics in people's names).
On mloss.org
------------
On mloss.org (for final releases only)
--------------------------------------
Project page is at http://mloss.org/software/view/241/.
Account jaberg is listed as submitter.
......@@ -138,8 +145,10 @@ then run the script.
Announce the release
====================
Generate an e-mail from the template in in ``EMAIL.txt``, including content
from ``NEWS.txt``, and send it to the following mailing lists:
Generate an e-mail from the template in ``EMAIL.txt``, including content
from ``NEWS.txt``.
For final releases, send the e-mail to the following mailing lists:
* theano-users
* theano-announce
......@@ -152,3 +161,8 @@ For release candidates, only e-mail:
* theano-announce
* theano-dev
* theano-users
For alpha and beta releases, only e-mail:
* theano-dev
* theano-users
......@@ -19,11 +19,34 @@
The user-friendly constructor is :func:`shared`
.. attribute:: value
Read/write access to the [non-symbolic] value/data associated with this SharedVariable.
Changes to this value will be visible to all functions using this SharedVariable.
.. method:: get_value(self, borrow=False, return_internal_type=False)
:param borrow: True to permit returning of an object aliased to internal memory.
:type borrow: bool
:param return_internal_type: True to permit the returning of an arbitrary type object used
internally to store the shared variable.
:type return_internal_type: bool
By default, return a copy of the data. If ``borrow=True`` (and
``return_internal_type=False``), maybe it will return a copy.
For tensor, it will always return a ndarray by default, so if
the data is on the GPU, it will return a copy, but if the data
is on the CPU, it will return the original data. If you do
``borrow=True`` and ``return_internal_type=True``, it will
always return the original data, not a copy, but this can be a
GPU object.
.. method:: set_value(self, new_value, borrow=False)
:param new_value: The new value.
:type new_value: A compatible type for this shared variable.
:param borrow: True to use the new_value directly, potentially creating problems
related to aliased memory.
:type borrow: bool
The new value will be seen by all functions using this SharedVariable.
.. method:: __init__(self, name, type, value, strict, container=None)
......
......@@ -10,6 +10,9 @@
.. moduleauthor:: LISA
.. seealso:: cuDNN batch normalization: :class:`theano.gpuarray.dnn.dnn_batch_normalization_train`, :class:`theano.gpuarray.dnn.dnn_batch_normalization_test`. They must be added manually as they do not have the same user interface.
.. autofunction:: theano.tensor.nnet.bn.batch_normalization_train
.. autofunction:: theano.tensor.nnet.bn.batch_normalization_test
.. seealso:: cuDNN batch normalization: :class:`theano.gpuarray.dnn.dnn_batch_normalization_train`, :class:`theano.gpuarray.dnn.dnn_batch_normalization_test`.
.. autofunction:: theano.tensor.nnet.bn.batch_normalization
......@@ -59,11 +59,11 @@ class OpFromGraph(gof.Op):
.. code-block:: python
import numpy
import numpy as np
import theano
from theano import config, function, OpFromGraph, tensor
x, y, z = tensor.scalars('xyz')
s = theano.shared(numpy.random.rand(2, 2).astype(config.floatX))
s = theano.shared(np.random.rand(2, 2).astype(config.floatX))
e = x + y * z + s
op = OpFromGraph([x, y, z], [e])
# op behaves like a normal theano op
......
......@@ -14,7 +14,7 @@ import six.moves.copyreg as copyreg
from itertools import chain, product as itertools_product
from theano.compat import izip
import numpy
import numpy as np
import theano
from theano import gof, config
......@@ -270,15 +270,15 @@ class BadOptimization(DebugModeError):
print(" New Value: ", str(self.new_r_val), file=sio)
try:
ov = numpy.asarray(self.old_r_val)
nv = numpy.asarray(self.new_r_val)
ov = np.asarray(self.old_r_val)
nv = np.asarray(self.new_r_val)
ssio = StringIO()
abs_diff = numpy.absolute(nv - ov)
print(" Max Abs Diff: ", numpy.max(abs_diff), file=ssio)
print(" Mean Abs Diff: ", numpy.mean(abs_diff), file=ssio)
print(" Median Abs Diff: ", numpy.median(abs_diff), file=ssio)
print(" Std Abs Diff: ", numpy.std(abs_diff), file=ssio)
arg_max_val = numpy.argmax(abs_diff)
abs_diff = np.absolute(nv - ov)
print(" Max Abs Diff: ", np.max(abs_diff), file=ssio)
print(" Mean Abs Diff: ", np.mean(abs_diff), file=ssio)
print(" Median Abs Diff: ", np.median(abs_diff), file=ssio)
print(" Std Abs Diff: ", np.std(abs_diff), file=ssio)
arg_max_val = np.argmax(abs_diff)
values_at_max = (nv.flatten()[arg_max_val],
ov.flatten()[arg_max_val])
print(" Value at Max Diff: ", values_at_max, file=ssio)
......@@ -286,13 +286,13 @@ class BadOptimization(DebugModeError):
# N.B. the maximum(..., 1e-8) protects against div by 0 when
# nv == ov == 0
reldiff = (abs_diff /
numpy.maaximum(numpy.absolute(nv) + numpy.absolute(ov),
1e-8))
print(" Max Rel Diff: ", numpy.max(reldiff), file=ssio)
print(" Mean Rel Diff: ", numpy.mean(reldiff), file=ssio)
print(" Median Rel Diff: ", numpy.median(reldiff), file=ssio)
print(" Std Rel Diff: ", numpy.std(reldiff), file=ssio)
arg_max_val = numpy.argmax(reldiff)
np.maximum(np.absolute(nv) + np.absolute(ov),
1e-8))
print(" Max Rel Diff: ", np.max(reldiff), file=ssio)
print(" Mean Rel Diff: ", np.mean(reldiff), file=ssio)
print(" Median Rel Diff: ", np.median(reldiff), file=ssio)
print(" Std Rel Diff: ", np.std(reldiff), file=ssio)
arg_max_val = np.argmax(reldiff)
values_at_max = (nv.flatten()[arg_max_val],
ov.flatten()[arg_max_val])
print(" Value at Max Diff: ", values_at_max, file=ssio)
......@@ -342,8 +342,8 @@ class BadDestroyMap(DebugModeError):
print(" repr (old val):", repr(self.old_val), file=sio)
print(" repr (new val):", repr(self.new_val), file=sio)
try:
npy_old_val = numpy.asarray(self.old_val)
npy_new_val = numpy.asarray(self.new_val)
npy_old_val = np.asarray(self.old_val)
npy_new_val = np.asarray(self.new_val)
print(" value dtype (new <space> old):", npy_new_val.dtype,
npy_old_val.dtype, file=sio)
print(" value shape (new <space> old):", npy_new_val.shape,
......@@ -356,13 +356,13 @@ class BadDestroyMap(DebugModeError):
print(" value min (new-old):", delta.min(), file=sio)
print(" value max (new-old):", delta.max(), file=sio)
print(" value argmin (new-old):",
numpy.unravel_index(delta.argmin(), npy_new_val.shape),
np.unravel_index(delta.argmin(), npy_new_val.shape),
file=sio)
print(" value argmax (new-old):",
numpy.unravel_index(delta.argmax(), npy_new_val.shape),
np.unravel_index(delta.argmax(), npy_new_val.shape),
file=sio)
print(" location of first 10 mismatches:",
numpy.transpose(numpy.nonzero(delta))[:10], file=sio)
np.transpose(np.nonzero(delta))[:10], file=sio)
print("", file=sio)
except Exception as e:
print("(Numpy-hints failed with: %s)" % str(e), file=sio)
......@@ -453,7 +453,7 @@ class InvalidValueError(DebugModeError):
v_dtype = v.dtype
v_min = v.min()
v_max = v.max()
v_isfinite = numpy.all(numpy.isfinite(v))
v_isfinite = np.all(np.isfinite(v))
except Exception:
pass
client_node = self.client_node
......@@ -1025,7 +1025,7 @@ def _lessbroken_deepcopy(a):
# this exists because copy.deepcopy on numpy arrays is broken
# This logic is also in link.py
from theano.gof.type import _cdata_type
if type(a) in (numpy.ndarray, numpy.memmap):
if type(a) in (np.ndarray, np.memmap):
rval = a.copy()
elif type(a) is _cdata_type:
# This is not copyable (and should be used for constant data).
......@@ -1034,7 +1034,7 @@ def _lessbroken_deepcopy(a):
rval = copy.deepcopy(a)
assert type(rval) == type(a), (type(rval), type(a))
if isinstance(rval, numpy.ndarray):
if isinstance(rval, np.ndarray):
assert rval.dtype == a.dtype
return rval
......@@ -1241,7 +1241,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
# There is no risk to overwrite inputs, since r does not work
# inplace.
if isinstance(r.type, (TensorType, CudaNdarrayType)):
reuse_outputs[r][...] = numpy.asarray(
reuse_outputs[r][...] = np.asarray(
def_val).astype(r.type.dtype)
if reuse_outputs:
......@@ -1259,7 +1259,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
new_buf = r.type.value_zeros(r_vals[r].shape)
# CudaNdarray don't have flags field
# assert new_buf.flags["C_CONTIGUOUS"]
new_buf[...] = numpy.asarray(def_val).astype(r.type.dtype)
new_buf[...] = np.asarray(def_val).astype(r.type.dtype)
c_cont_outputs[r] = new_buf
......@@ -1273,7 +1273,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
f_cont_outputs = {}
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
new_buf = numpy.zeros(
new_buf = np.zeros(
shape=r_vals[r].shape,
dtype=r_vals[r].dtype,
order='F')
......@@ -1331,7 +1331,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
else:
buf_shape.append(s * 2)
new_buf = r.type.value_zeros(buf_shape)
new_buf[...] = numpy.asarray(def_val).astype(r.type.dtype)
new_buf[...] = np.asarray(def_val).astype(r.type.dtype)
init_strided[r] = new_buf
# The number of combinations is exponential in the number of
......@@ -1377,7 +1377,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
r_buf = r_buf[tuple(strides)][tuple(shapes)]
assert r_buf.shape == r_vals[r].shape
r_buf[...] = numpy.asarray(def_val).astype(r_buf.dtype)
r_buf[...] = np.asarray(def_val).astype(r_buf.dtype)
strided[r] = r_buf
if strided:
......@@ -1405,7 +1405,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
for s, sd in zip(r_vals[r].shape,
r_shape_diff)]
new_buf = r.type.value_zeros(out_shape)
new_buf[...] = numpy.asarray(
new_buf[...] = np.asarray(
def_val).astype(r.type.dtype)
wrong_size[r] = new_buf
......@@ -2261,7 +2261,7 @@ class _Linker(gof.link.LocalLinker):
# HACK TO LOOK LIKE A REAL DESTRUCTIVE ACTION
# TOOK PLACE
if ((type(dr_vals[r][0]) in
(numpy.ndarray, numpy.memmap)) and
(np.ndarray, np.memmap)) and
(dr_vals[r][0].dtype ==
storage_map[r][0].dtype) and
(dr_vals[r][0].shape ==
......
......@@ -13,7 +13,7 @@ from six import string_types
from theano.compile.io import In
from theano.compile.function_module import orig_function
from theano.compile.pfunc import pfunc
from numpy import any
import numpy as np
import warnings
from theano import compat
......@@ -286,7 +286,7 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
"input.")
# compute some features of the arguments:
uses_tuple = any([isinstance(i, (list, tuple)) for i in inputs])
uses_tuple = np.any([isinstance(i, (list, tuple)) for i in inputs])
uses_updates = bool(updates)
uses_givens = bool(givens)
......
......@@ -12,13 +12,14 @@ import six.moves.cPickle as pickle
from itertools import chain
import time
import warnings
import numpy
import numpy as np
import theano
from theano import config, gof
from theano.compat import izip
from theano.gof import graph
import theano.compile.mode
import theano.compile.profiling
from theano.compile.io import (
In, SymbolicInput, SymbolicOutput)
from theano.compile.ops import deep_copy_op, view_op
......@@ -663,7 +664,7 @@ class Function(object):
input_storage = [i.value for i in ins]
# reinitialize new maker and create new function
if profile is None:
profile = config.profile
profile = config.profile or config.print_global_stats
# profile -> True or False
if profile is True:
if name:
......@@ -749,6 +750,12 @@ class Function(object):
List of outputs on indices/keys from ``output_subset`` or all of them,
if ``output_subset`` is not passed.
"""
def restore_defaults():
for i, (required, refeed, value) in enumerate(self.defaults):
if refeed:
if isinstance(value, gof.Container):
value = value.storage[0]
self[i] = value
profile = self.profile
t0 = time.time()
......@@ -804,6 +811,7 @@ class Function(object):
e.args = ("Bad input " + argument_name + " to " +
function_name + " at index %d (0-based). %s"
% (i, where),) + e.args
restore_defaults()
raise
s.provided += 1
i += 1
......@@ -829,9 +837,9 @@ class Function(object):
in args_share_memory[j]],
[self.input_storage[k].storage[0] for k
in args_share_memory[j]])
if numpy.any([(var.type is i_var.type and
var.type.may_share_memory(val, i_val))
for (var, val) in group_j]):
if np.any([(var.type is i_var.type and
var.type.may_share_memory(val, i_val))
for (var, val) in group_j]):
is_aliased = True
args_share_memory[j].append(i)
......@@ -853,14 +861,17 @@ class Function(object):
if not self.trust_input:
for c in self.input_storage:
if c.required and not c.provided:
restore_defaults()
raise TypeError("Missing required input: %s" %
getattr(self.inv_finder[c], 'variable',
self.inv_finder[c]))
if c.provided > 1:
restore_defaults()
raise TypeError("Multiple values for input: %s" %
getattr(self.inv_finder[c], 'variable',
self.inv_finder[c]))
if c.implicit and c.provided > 0:
restore_defaults()
raise TypeError(
'Tried to provide value for implicit input: %s'
% getattr(self.inv_finder[c], 'variable',
......@@ -873,6 +884,7 @@ class Function(object):
self.fn() if output_subset is None else\
self.fn(output_subset=output_subset)
except Exception:
restore_defaults()
if hasattr(self.fn, 'position_of_error'):
# this is a new vm-provided function or c linker
# they need this because the exception manipulation
......@@ -925,11 +937,7 @@ class Function(object):
outputs = outputs[:self.n_returned_outputs]
# Put default values back in the storage
for i, (required, refeed, value) in enumerate(self.defaults):
if refeed:
if isinstance(value, gof.Container):
value = value.storage[0]
self[i] = value
restore_defaults()
#
# NOTE: This logic needs to be replicated in
# scan.
......@@ -937,6 +945,7 @@ class Function(object):
#
dt_call = time.time() - t0
theano.compile.profiling.total_fct_exec_time += dt_call
self.maker.mode.call_time += dt_call
if profile:
profile.fct_callcount += 1
......@@ -1019,9 +1028,9 @@ def _pickle_Function(f):
all_data = input_storage + inputs_data
for i, d_i in enumerate(all_data):
for j, d_j in enumerate(all_data):
if ((i < j) and isinstance(d_i, numpy.ndarray) and
isinstance(d_j, numpy.ndarray)):
if numpy.may_share_memory(d_i, d_j):
if ((i < j) and isinstance(d_i, np.ndarray) and
isinstance(d_j, np.ndarray)):
if np.may_share_memory(d_i, d_j):
if f.pickle_aliased_memory_strategy == 'warn':
_logger.warning('aliased relationship between '
'Function arguments %s, %s '
......@@ -1041,7 +1050,7 @@ def _constructor_Function(maker, input_storage, inputs_data):
assert len(f.input_storage) == len(inputs_data)
for container, x in zip(f.input_storage, inputs_data):
assert (container.data is x) or \
(isinstance(x, numpy.ndarray) and (container.data == x).all()) or \
(isinstance(x, np.ndarray) and (container.data == x).all()) or \
(container.data == x)
return f
......@@ -1466,6 +1475,7 @@ class FunctionMaker(object):
end_optimizer = time.time()
opt_time = end_optimizer - start_optimizer
theano.compile.profiling.total_graph_opt_time += opt_time
if profile:
profile.optimizer_time += opt_time
if theano.config.profile_optimizer:
......@@ -1655,6 +1665,7 @@ class FunctionMaker(object):
end_linker = time.time()
linker_time = end_linker - start_linker
theano.compile.profiling.total_time_linker += linker_time
_logger.debug('Linker took %f seconds', linker_time)
if self.profile:
self.profile.linker_time += linker_time
......
from __future__ import absolute_import, print_function, division
# Note: this code was initially copied from the 'pyutools' package by its
# original author, and re-licensed under Theano's license.
import numpy
import numpy as np
import theano
from theano.compile.mode import Mode
......@@ -93,8 +93,8 @@ class MonitorMode(Mode):
def detect_nan(i, node, fn):
for output in fn.outputs:
if (not isinstance(output[0], numpy.random.RandomState) and
numpy.isnan(output[0]).any()):
if (not isinstance(output[0], np.random.RandomState) and
np.isnan(output[0]).any()):
print('*** NaN detected ***')
theano.printing.debugprint(node)
print('Inputs : %s' % [input[0] for input in fn.inputs])
......
......@@ -17,7 +17,7 @@ from six import iteritems, integer_types
from six.moves import xrange
import numpy
import numpy as np
def register_view_op_c_code(type, code, version=()):
......@@ -338,7 +338,7 @@ class Shape_i(gof.Op):
def __init__(self, i):
# As i will be used in the hash and that ndarray are not hashable,
# we need to convert it to an int as it is hashable.
if isinstance(i, numpy.ndarray):
if isinstance(i, np.ndarray):
assert i.dtype in theano.tensor.integer_dtypes
assert i == int(i)
i = int(i)
......@@ -665,11 +665,11 @@ class Rebroadcast(gof.Op):
items = sorted(axis)
self.axis = OrderedDict(items)
for axis, broad in iteritems(self.axis):
if not isinstance(axis, (numpy.integer, integer_types)):
if not isinstance(axis, (np.integer, integer_types)):
raise TypeError("Rebroadcast needs integer axes. "
"Got {}".format(axis))
if not isinstance(broad, (numpy.bool_, bool)):
if not isinstance(broad, (np.bool_, bool)):
raise TypeError("Rebroadcast needs bool for new broadcast "
"pattern. Got {}".format(broad))
......@@ -835,8 +835,8 @@ class SpecifyShape(gof.Op):
x, shape = inp
out, = out_
assert x.ndim == shape.size
assert numpy.all(x.shape == shape), ("got shape", x.shape,
"expected", shape)
assert np.all(x.shape == shape), ("got shape", x.shape,
"expected", shape)
out[0] = x
def infer_shape(self, node, shapes):
......
......@@ -364,7 +364,7 @@ def pfunc(params, outputs=None, mode=None, updates=None, givens=None,
if givens is None:
givens = []
if profile is None:
profile = config.profile
profile = config.profile or config.print_global_stats
# profile -> True or False
if profile is False:
profile = None
......
......@@ -27,7 +27,7 @@ import sys
import time
from collections import defaultdict
import numpy
import numpy as np
import theano
from six import iteritems
......@@ -36,6 +36,9 @@ from theano.gof import graph
logger = logging.getLogger('theano.compile.profiling')
theano_imported_time = time.time()
total_fct_exec_time = 0.
total_graph_opt_time = 0.
total_time_linker = 0.
config = theano.config
_atexit_print_list = []
......@@ -47,7 +50,80 @@ def _atexit_print_fn():
Print ProfileStat objects in _atexit_print_list to _atexit_print_file.
"""
to_sum = []
if config.profile:
to_sum = []
if config.profiling.destination == 'stderr':
destination_file = sys.stderr
elif config.profiling.destination == 'stdout':
destination_file = sys.stdout
else:
destination_file = open(config.profiling.destination, 'w')
# Reverse sort in the order of compile+exec time
for ps in sorted(_atexit_print_list,
key=lambda a:a.compile_time + a.fct_call_time)[::-1]:
if ps.fct_callcount >= 1 or ps.compile_time > 1:
ps.summary(file=destination_file,
n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply)
if not isinstance(ps, ScanProfileStats):
to_sum.append(ps)
else:
# TODO print the name if there is one!
print('Skipping empty Profile')
if len(to_sum) > 1:
# Make a global profile
cum = copy.copy(to_sum[0])
msg = ("Sum of all(%d) printed profiles at exit excluding Scan op"
" profile." % len(to_sum))
cum.message = msg
for ps in to_sum[1:]:
for attr in ["compile_time", "fct_call_time", "fct_callcount",
"vm_call_time", "optimizer_time", "linker_time",
"validate_time", "import_time",
"linker_node_make_thunks"]:
setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr))
# merge dictonary
for attr in ["apply_time", "apply_callcount",
"apply_cimpl", "variable_shape", "variable_strides",
"linker_make_thunk_time"]:
cum_attr = getattr(cum, attr)
for key, val in iteritems(getattr(ps, attr)):
assert key not in cum_attr
cum_attr[key] = val
if cum.optimizer_profile and ps.optimizer_profile:
try:
merge = cum.optimizer_profile[0].merge_profile(
cum.optimizer_profile[1],
ps.optimizer_profile[1])
assert len(merge) == len(cum.optimizer_profile[1])
cum.optimizer_profile = (cum.optimizer_profile[0], merge)
except Exception as e:
print("Got an exception while merging profile")
print(e)
cum.optimizer_profile = None
else:
cum.optimizer_profile = None
cum.summary(file=destination_file,
n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply)
if config.print_global_stats:
print_global_stats()
def print_global_stats():
"""
Print the following stats:
-- Time elapsed since Theano was imported
-- Time spent inside Theano functions
-- Time spent in compiling Theano functions
-- on graph optimization
-- on linker
"""
if config.profiling.destination == 'stderr':
destination_file = sys.stderr
......@@ -56,57 +132,18 @@ def _atexit_print_fn():
else:
destination_file = open(config.profiling.destination, 'w')
# Reverse sort in the order of compile+exec time
for ps in sorted(_atexit_print_list,
key=lambda a:a.compile_time + a.fct_call_time)[::-1]:
if ps.fct_callcount >= 1 or ps.compile_time > 1:
ps.summary(file=destination_file,
n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply)
if not isinstance(ps, ScanProfileStats):
to_sum.append(ps)
else:
# TODO print the name if there is one!
print('Skipping empty Profile')
if len(to_sum) > 1:
# Make a global profile
cum = copy.copy(to_sum[0])
msg = ("Sum of all(%d) printed profiles at exit excluding Scan op"
" profile." % len(to_sum))
cum.message = msg
for ps in to_sum[1:]:
for attr in ["compile_time", "fct_call_time", "fct_callcount",
"vm_call_time", "optimizer_time", "linker_time",
"validate_time", "import_time",
"linker_node_make_thunks"]:
setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr))
# merge dictonary
for attr in ["apply_time", "apply_callcount",
"apply_cimpl", "variable_shape", "variable_strides",
"linker_make_thunk_time"]:
cum_attr = getattr(cum, attr)
for key, val in iteritems(getattr(ps, attr)):
assert key not in cum_attr
cum_attr[key] = val
if cum.optimizer_profile and ps.optimizer_profile:
try:
merge = cum.optimizer_profile[0].merge_profile(
cum.optimizer_profile[1],
ps.optimizer_profile[1])
assert len(merge) == len(cum.optimizer_profile[1])
cum.optimizer_profile = (cum.optimizer_profile[0], merge)
except Exception as e:
print("Got an exception while merging profile")
print(e)
cum.optimizer_profile = None
else:
cum.optimizer_profile = None
cum.summary(file=destination_file,
n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply)
print('='*50, file=destination_file)
print('Global stats: ',
'Time elasped since Theano import = %6.3fs, '
'Time spent in Theano functions = %6.3fs, '
'Time spent compiling Theano functions: '
' optimzation = %6.3fs, linker = %6.3fs ' %
(time.time() - theano_imported_time,
total_fct_exec_time,
total_graph_opt_time,
total_time_linker),
file=destination_file)
print('='*50, file=destination_file)
class ProfileStats(object):
......@@ -440,7 +477,7 @@ class ProfileStats(object):
hs += ['<#apply>']
es += [' %4d ']
upto_length = numpy.sum([len(x) for x in hs]) + len(hs)
upto_length = np.sum([len(x) for x in hs]) + len(hs)
maxlen = max(self.line_width - upto_length, 0)
hs += ['<Class name>']
es += ['%s']
......@@ -522,7 +559,7 @@ class ProfileStats(object):
hs += ['<#apply>']
es += [' %4d ']
upto_length = numpy.sum([len(x) for x in hs]) + len(hs)
upto_length = np.sum([len(x) for x in hs]) + len(hs)
maxlen = max(self.line_width - upto_length, 0)
hs += ['<Op name>']
es += ['%s']
......@@ -590,7 +627,7 @@ class ProfileStats(object):
if self.variable_shape:
hs += ['<Mflops>', '<Gflops/s>']
upto_length = numpy.sum([len(x) for x in hs]) + len(hs)
upto_length = np.sum([len(x) for x in hs]) + len(hs)
maxlen = max(self.line_width - upto_length, 0)
hs += ['<Apply name>']
es += ['%s']
......@@ -892,7 +929,7 @@ class ProfileStats(object):
node_list = list(node_list)
mem_count = 0
max_mem_count = 0
mem_bound = numpy.inf
mem_bound = np.inf
# This take only the inputs/outputs dependencies.
dependencies = fgraph.profile.dependencies
done_set = set([])
......
......@@ -9,7 +9,7 @@ import copy
import logging
# Third-party imports
import numpy
import numpy as np
# Theano imports
from theano.gof import Container, Variable, generic, utils
......@@ -120,6 +120,31 @@ class SharedVariable(Variable):
Changes to this value will be visible to all functions using
this SharedVariable.
Notes
-----
Set_value will work in-place on the GPU, if
the following conditions are met:
* The destination on the GPU must be c_contiguous.
* The source is on the CPU.
* The old value must have the same dtype as the new value
(which is a given for now, since only float32 is
supported).
* The old and new value must have the same shape.
* The old value is being completely replaced by the new
value (not partially modified, e.g. by replacing some
subtensor of it).
* You change the value of the shared variable via
set_value, not via the .value accessors. You should not
use the .value accessors anyway, since they will soon be
deprecated and removed.
It is also worth mentioning that, for efficient transfer to the GPU,
Theano will make the new data ``c_contiguous``. This can require an
extra copy of the data on the host.
The in-place update of GPU memory works when borrow is either True or False.
"""
if borrow:
self.container.value = new_value
......@@ -162,7 +187,7 @@ class SharedVariable(Variable):
# implemented at all, but with a more explicit error message to help
# Theano users figure out the root of the problem more easily.
value = self.get_value(borrow=True)
if isinstance(value, numpy.ndarray):
if isinstance(value, np.ndarray):
# Array probably had an unknown dtype.
msg = ("a Numpy array with dtype: '%s'. This data type is not "
"currently recognized by Theano tensors: please cast "
......
from __future__ import absolute_import, print_function, division
import numpy
import numpy as np
from theano import config, shared
......@@ -23,14 +23,14 @@ class T_OpFromGraph(unittest_tools.InferShapeTester):
f = op(x, y, z) - op(y, z, x)
fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX) * 3
zv = numpy.ones((2, 2), dtype=config.floatX) * 5
xv = np.ones((2, 2), dtype=config.floatX)
yv = np.ones((2, 2), dtype=config.floatX) * 3
zv = np.ones((2, 2), dtype=config.floatX) * 5
# print function, function.__module__
# print fn.maker.fgraph.toposort()
fn(xv, yv, zv)
assert numpy.all(8.0 == fn(xv, yv, zv))
assert numpy.all(8.0 == fn(xv, yv, zv))
assert np.all(8.0 == fn(xv, yv, zv))
assert np.all(8.0 == fn(xv, yv, zv))
def test_size_changes(self):
x, y, z = T.matrices('xyz')
......@@ -38,15 +38,15 @@ class T_OpFromGraph(unittest_tools.InferShapeTester):
op = OpFromGraph([x, y], [e])
f = op(x, op(y, z))
fn = function([x, y, z], f)
xv = numpy.ones((2, 3), dtype=config.floatX)
yv = numpy.ones((3, 4), dtype=config.floatX) * 3
zv = numpy.ones((4, 5), dtype=config.floatX) * 5
xv = np.ones((2, 3), dtype=config.floatX)
yv = np.ones((3, 4), dtype=config.floatX) * 3
zv = np.ones((4, 5), dtype=config.floatX) * 5
res = fn(xv, yv, zv)
assert res.shape == (2, 5)
assert numpy.all(180.0 == res)
assert np.all(180.0 == res)
res = fn(xv, yv, zv)
assert res.shape == (2, 5)
assert numpy.all(180.0 == res)
assert np.all(180.0 == res)
def test_grad(self):
x, y, z = T.matrices('xyz')
......@@ -55,10 +55,10 @@ class T_OpFromGraph(unittest_tools.InferShapeTester):
f = op(x, y, z)
f = f - T.grad(T.sum(f), y)
fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX) * 3
zv = numpy.ones((2, 2), dtype=config.floatX) * 5
assert numpy.all(11.0 == fn(xv, yv, zv))
xv = np.ones((2, 2), dtype=config.floatX)
yv = np.ones((2, 2), dtype=config.floatX) * 3
zv = np.ones((2, 2), dtype=config.floatX) * 5
assert np.all(11.0 == fn(xv, yv, zv))
def test_grad_grad(self):
x, y, z = T.matrices('xyz')
......@@ -68,47 +68,47 @@ class T_OpFromGraph(unittest_tools.InferShapeTester):
f = f - T.grad(T.sum(f), y)
f = f - T.grad(T.sum(f), y)
fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX) * 3
zv = numpy.ones((2, 2), dtype=config.floatX) * 5
assert numpy.allclose(6.0, fn(xv, yv, zv))
xv = np.ones((2, 2), dtype=config.floatX)
yv = np.ones((2, 2), dtype=config.floatX) * 3
zv = np.ones((2, 2), dtype=config.floatX) * 5
assert np.allclose(6.0, fn(xv, yv, zv))
def test_shared(self):
x, y, z = T.matrices('xyz')
s = shared(numpy.random.rand(2, 2).astype(config.floatX))
s = shared(np.random.rand(2, 2).astype(config.floatX))
e = x + y * z + s
op = OpFromGraph([x, y, z], [e])
# (1+3*5=array of 16) - (3+1*5=array of 8)
f = op(x, y, z) - op(y, z, x)
fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX) * 3
zv = numpy.ones((2, 2), dtype=config.floatX) * 5
xv = np.ones((2, 2), dtype=config.floatX)
yv = np.ones((2, 2), dtype=config.floatX) * 3
zv = np.ones((2, 2), dtype=config.floatX) * 5
# print function, function.__module__
# print fn.maker.fgraph.toposort()
assert numpy.allclose(8.0, fn(xv, yv, zv))
assert numpy.allclose(8.0, fn(xv, yv, zv))
assert np.allclose(8.0, fn(xv, yv, zv))
assert np.allclose(8.0, fn(xv, yv, zv))
def test_shared_grad(self):
x, y, z = T.matrices('xyz')
s = shared(numpy.random.rand(2, 2).astype(config.floatX))
s = shared(np.random.rand(2, 2).astype(config.floatX))
e = x + y * z + s
op = OpFromGraph([x, y, z], [e])
f = op(x, y, z)
f = f - T.grad(T.sum(f), y)
fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX) * 3
zv = numpy.ones((2, 2), dtype=config.floatX) * 5
assert numpy.allclose(11.0 + s.get_value(), fn(xv, yv, zv))
xv = np.ones((2, 2), dtype=config.floatX)
yv = np.ones((2, 2), dtype=config.floatX) * 3
zv = np.ones((2, 2), dtype=config.floatX) * 5
assert np.allclose(11.0 + s.get_value(), fn(xv, yv, zv))
# grad again the shared variable
f = op(x, y, z)
f = f - T.grad(T.sum(f), s)
fn = function([x, y, z], f)
assert numpy.allclose(15.0 + s.get_value(),
fn(xv, yv, zv))
assert np.allclose(15.0 + s.get_value(),
fn(xv, yv, zv))
def test_connection_pattern(self):
# Basic case
......@@ -163,6 +163,6 @@ class T_OpFromGraph(unittest_tools.InferShapeTester):
p = T.matrix('p')
self._compile_and_check([q, p],
op_graph(q, p),
[numpy.ones([3, 4], dtype=config.floatX),
numpy.ones([3, 4], dtype=config.floatX)],
[np.ones([3, 4], dtype=config.floatX),
np.ones([3, 4], dtype=config.floatX)],
OpFromGraph)
......@@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
import unittest
import numpy
import numpy as np
from theano import config
from theano import gof
......@@ -316,7 +316,7 @@ def test_just_c_code():
x = theano.tensor.dvector()
f = theano.function([x], wb2(x),
mode=debugmode.DebugMode(check_py_code=False))
assert numpy.all(f([1, 2]) == [2, 4])
assert np.all(f([1, 2]) == [2, 4])
def test_baddestroymap():
......@@ -349,7 +349,7 @@ def test_baddestroymap_c():
f = theano.function([x], wb2i(x),
mode=debugmode.DebugMode(check_py_code=False))
try:
assert numpy.all(f([1, 2]) == [2, 4])
assert np.all(f([1, 2]) == [2, 4])
assert False # failed to raise error
except debugmode.BadDestroyMap:
pass
......@@ -445,8 +445,8 @@ class Test_ViewMap(unittest.TestCase):
r0, r1 = f([1, 2, 3, 4], [5, 6, 7, 8])
assert numpy.all(r0 == [1, 2, 3, 4])
assert numpy.all(r1 == [2, 3, 4])
assert np.all(r0 == [1, 2, 3, 4])
assert np.all(r1 == [2, 3, 4])
def test_aliased_outputs_ok_output(self):
# here aliased outputs is ok because they are both outputs of the
......@@ -470,8 +470,8 @@ class Test_ViewMap(unittest.TestCase):
r0, r1 = f([1, 2, 3, 4], [5, 6, 7, 8])
assert numpy.all(r0 == [2, 4, 6, 8])
assert numpy.all(r1 == [4, 6, 8])
assert np.all(r0 == [2, 4, 6, 8])
assert np.all(r1 == [4, 6, 8])
def test_aliased_outputs_ok_shadow(self):
# here the alias between outputs is ok because one of them is not used
......@@ -496,7 +496,7 @@ class Test_ViewMap(unittest.TestCase):
r0 = f([1, 2, 3, 4], [5, 6, 7, 8])
assert numpy.all(r0 == [2, 4, 6, 8])
assert np.all(r0 == [2, 4, 6, 8])
def test_aliased_outputs_bad(self):
# here the alias between outputs is not ok because destroying one
......@@ -555,31 +555,31 @@ class Test_check_isfinite(unittest.TestCase):
g = theano.function([x], theano.tensor.log(x), mode='DEBUG_MODE')
# this should work
f(numpy.log([3, 4, 5]).astype(config.floatX))
f(np.log([3, 4, 5]).astype(config.floatX))
# if TensorType.filter_checks_isfinite were true, these would raise
# ValueError
# if not, DebugMode will check internally, and raise InvalidValueError
# passing an invalid value as an input should trigger ValueError
self.assertRaises(debugmode.InvalidValueError, f,
numpy.log([3, -4, 5]).astype(config.floatX))
np.log([3, -4, 5]).astype(config.floatX))
self.assertRaises(debugmode.InvalidValueError, f,
(numpy.asarray([0, 1.0, 0]) / 0).astype(config.floatX))
(np.asarray([0, 1.0, 0]) / 0).astype(config.floatX))
self.assertRaises(debugmode.InvalidValueError, f,
(numpy.asarray([1.0, 1.0, 1.0]) / 0).astype(config.floatX))
(np.asarray([1.0, 1.0, 1.0]) / 0).astype(config.floatX))
# generating an invalid value internally should trigger
# InvalidValueError
self.assertRaises(debugmode.InvalidValueError, g,
numpy.asarray([3, -4, 5], dtype=config.floatX))
np.asarray([3, -4, 5], dtype=config.floatX))
# this should disable the exception
theano.tensor.TensorType.filter_checks_isfinite = False
theano.compile.mode.predefined_modes[
'DEBUG_MODE'].check_isfinite = False
# insert several Inf
f(numpy.asarray(numpy.asarray([1.0, 1.0, 1.0]) / 0,
dtype=config.floatX))
f(np.asarray(np.asarray([1.0, 1.0, 1.0]) / 0,
dtype=config.floatX))
def test_check_isfinite_disabled(self):
x = theano.tensor.dvector()
......@@ -587,10 +587,10 @@ class Test_check_isfinite(unittest.TestCase):
mode=debugmode.DebugMode(check_isfinite=False))
# nan should go through
f(numpy.log([3, -4, 5]))
f(np.log([3, -4, 5]))
# inf should go through
infs = numpy.asarray([1.0, 1., 1.]) / 0
infs = np.asarray([1.0, 1., 1.]) / 0
# print infs
f(infs)
return
......@@ -721,14 +721,14 @@ class VecAsRowAndCol(gof.Op):
class Test_preallocated_output(unittest.TestCase):
def setUp(self):
self.rng = numpy.random.RandomState(seed=utt.fetch_seed())
self.rng = np.random.RandomState(seed=utt.fetch_seed())
def test_f_contiguous(self):
a = theano.tensor.fmatrix('a')
b = theano.tensor.fmatrix('b')
z = BrokenCImplementationAdd()(a, b)
# In this test, we do not want z to be an output of the graph.
out = theano.tensor.dot(z, numpy.eye(7))
out = theano.tensor.dot(z, np.eye(7))
a_val = self.rng.randn(7, 7).astype('float32')
b_val = self.rng.randn(7, 7).astype('float32')
......
......@@ -5,7 +5,7 @@ import shutil
import tempfile
import unittest
import numpy
import numpy as np
import theano
from theano.compile.io import In
......@@ -27,7 +27,7 @@ def test_function_dump():
fct2 = theano.function(**l)
x = [1, 2, 3]
assert numpy.allclose(fct1(x), fct2(x))
assert np.allclose(fct1(x), fct2(x))
class TestFunctionIn(unittest.TestCase):
......@@ -40,14 +40,14 @@ class TestFunctionIn(unittest.TestCase):
f = theano.function([In(a, strict=False)], out)
# works, rand generates float64 by default
f(numpy.random.rand(8))
f(np.random.rand(8))
# works, casting is allowed
f(numpy.array([1, 2, 3, 4], dtype='int32'))
f(np.array([1, 2, 3, 4], dtype='int32'))
f = theano.function([In(a, strict=True)], out)
try:
# fails, f expects float64
f(numpy.array([1, 2, 3, 4], dtype='int32'))
f(np.array([1, 2, 3, 4], dtype='int32'))
except TypeError:
pass
......@@ -70,17 +70,17 @@ class TestFunctionIn(unittest.TestCase):
# using mutable=True will let f change the value in aval
f = theano.function([In(a, mutable=True)], a_out, mode='FAST_RUN')
aval = numpy.random.rand(10)
aval = np.random.rand(10)
aval2 = aval.copy()
assert numpy.all(f(aval) == (aval2 * 2))
assert not numpy.all(aval == aval2)
assert np.all(f(aval) == (aval2 * 2))
assert not np.all(aval == aval2)
# using mutable=False should leave the input untouched
f = theano.function([In(a, mutable=False)], a_out, mode='FAST_RUN')
aval = numpy.random.rand(10)
aval = np.random.rand(10)
aval2 = aval.copy()
assert numpy.all(f(aval) == (aval2 * 2))
assert numpy.all(aval == aval2)
assert np.all(f(aval) == (aval2 * 2))
assert np.all(aval == aval2)
def test_in_update(self):
a = theano.tensor.dscalar('a')
......@@ -115,7 +115,7 @@ class TestFunctionIn(unittest.TestCase):
# changes occur at the same time and one doesn't overwrite the other.
for i in range(5):
f()
assert numpy.allclose(shared_var.get_value(), i % 2)
assert np.allclose(shared_var.get_value(), i % 2)
def test_in_allow_downcast_int(self):
a = theano.tensor.wvector('a') # int16
......@@ -128,16 +128,16 @@ class TestFunctionIn(unittest.TestCase):
# Both values are in range. Since they're not ndarrays (but lists),
# they will be converted, and their value checked.
assert numpy.all(f([3], [6], 1) == 10)
assert np.all(f([3], [6], 1) == 10)
# Values are in range, but a dtype too large has explicitly been given
# For performance reasons, no check of the data is explicitly performed
# (It might be OK to change this in the future.)
self.assertRaises(TypeError, f, [3], numpy.array([6], dtype='int16'),
self.assertRaises(TypeError, f, [3], np.array([6], dtype='int16'),
1)
# Value too big for a, silently ignored
assert numpy.all(f([2 ** 20], numpy.ones(1, dtype='int8'), 1) == 2)
assert np.all(f([2 ** 20], np.ones(1, dtype='int8'), 1) == 2)
# Value too big for b, raises TypeError
self.assertRaises(TypeError, f, [3], [312], 1)
......@@ -156,17 +156,17 @@ class TestFunctionIn(unittest.TestCase):
(a + b + c))
# If the values can be accurately represented, everything is OK
assert numpy.all(f(0, 0, 0) == 0)
assert np.all(f(0, 0, 0) == 0)
# If allow_downcast is True, idem
assert numpy.allclose(f(0.1, 0, 0), 0.1)
assert np.allclose(f(0.1, 0, 0), 0.1)
# If allow_downcast is False, nope
self.assertRaises(TypeError, f, 0, 0.1, 0)
# If allow_downcast is None, it should work iff floatX=float32
if theano.config.floatX == 'float32':
assert numpy.allclose(f(0, 0, 0.1), 0.1)
assert np.allclose(f(0, 0, 0.1), 0.1)
else:
self.assertRaises(TypeError, f, 0, 0, 0.1)
......@@ -182,10 +182,10 @@ class TestFunctionIn(unittest.TestCase):
# If the values can be accurately represented, everything is OK
z = [0]
assert numpy.all(f(z, z, z) == 0)
assert np.all(f(z, z, z) == 0)
# If allow_downcast is True, idem
assert numpy.allclose(f([0.1], z, z), 0.1)
assert np.allclose(f([0.1], z, z), 0.1)
# If allow_downcast is False, nope
self.assertRaises(TypeError, f, z, [0.1], z)
......
from __future__ import absolute_import, print_function, division
import copy
import six.moves.cPickle as pickle
import numpy
import numpy as np
import unittest
......@@ -18,8 +18,6 @@ from theano import tensor
from theano import tensor as T
import theano
import numpy as N
def PatternOptimizer(p1, p2, ign=True):
    """Return an OpKeyOptimizer that rewrites pattern ``p1`` into ``p2``.

    ``ign`` is forwarded as ``ignore_newtrees`` so nodes created by the
    substitution are not themselves re-matched.
    """
    substitution = gof.PatternSub(p1, p2)
    return gof.OpKeyOptimizer(substitution, ignore_newtrees=ign)
......@@ -281,7 +279,7 @@ class T_function(unittest.TestCase):
def test_swap_SharedVariable(self):
i = T.iscalar()
x_list = theano.shared(value=numpy.random.rand(10).astype(config.floatX))
x_list = theano.shared(value=np.random.rand(10).astype(config.floatX))
x = T.scalar('x')
# SharedVariable for tests, one of them has update
......@@ -343,11 +341,11 @@ class T_function(unittest.TestCase):
A special testcase for logistic_sgd.py in Deep Learning Tutorial
This test assert that SharedVariable in different function have same storage
"""
train_x = theano.shared(value=numpy.random.rand(10, 10).astype(config.floatX))
test_x = theano.shared(value=numpy.random.rand(10, 10).astype(config.floatX))
train_x = theano.shared(value=np.random.rand(10, 10).astype(config.floatX))
test_x = theano.shared(value=np.random.rand(10, 10).astype(config.floatX))
train_y = theano.shared(value=numpy.random.rand(10, 1).astype(config.floatX))
test_y = theano.shared(value=numpy.random.rand(10, 1).astype(config.floatX))
train_y = theano.shared(value=np.random.rand(10, 1).astype(config.floatX))
test_y = theano.shared(value=np.random.rand(10, 1).astype(config.floatX))
i = T.iscalar('index')
x = T.vector('x')
......@@ -500,42 +498,42 @@ class T_function(unittest.TestCase):
when borrow=True is implemented.
"""
a = T.dmatrix()
aval = numpy.random.rand(3, 3)
aval = np.random.rand(3, 3)
# when borrow=False, test that a destroy map cannot alias output to input
f = theano.function([In(a, borrow=False)], Out(a + 1, borrow=True))
assert numpy.all(f(aval) == aval + 1)
assert not numpy.may_share_memory(aval, f(aval))
assert np.all(f(aval) == aval + 1)
assert not np.may_share_memory(aval, f(aval))
# when borrow=False, test that a viewmap cannot alias output to input
f = theano.function([In(a, borrow=False)], Out(a[0, :], borrow=True))
assert numpy.all(f(aval) == aval[0, :])
assert not numpy.may_share_memory(aval, f(aval))
assert np.all(f(aval) == aval[0, :])
assert not np.may_share_memory(aval, f(aval))
def test_borrow_output(self):
a = T.dmatrix()
f = function([a], Out(a, borrow=False))
o = N.ones((3, 3))
o = np.ones((3, 3))
assert o is not f(o) # function no longer permits aliasing outputs to inputs
f = function([a], Out(a * 4, borrow=False))
o = N.ones((3, 3))
o = np.ones((3, 3))
four = f(o)
assert numpy.all(four == 4)
assert np.all(four == 4)
f(o + .1) # should not clobber the memory used to store four
assert numpy.all(four == 4)
assert np.all(four == 4)
f = function([a], Out(a * 4, borrow=True), mode=theano.Mode('c|py_nogc', 'fast_run'))
o = N.ones((3, 3))
o = np.ones((3, 3))
four = f(o)
assert numpy.all(four == 4)
assert np.all(four == 4)
f(o + .1) # should clobber the memory used to store four
if theano.config.cxx:
assert not numpy.all(four == 4)
assert not np.all(four == 4)
else:
# The Elemwise.perform method don't reuse memory
# as some numpy version don't support that correctly.
assert numpy.all(four == 4)
assert np.all(four == 4)
def test_disconnected_input(self):
a = T.scalar('a')
......@@ -579,6 +577,20 @@ class T_function(unittest.TestCase):
if not isinstance(key, theano.gof.Constant):
assert (val[0] is None)
def test_default_values(self):
"""
Check that default values are restored
when an exception occurs in interactive mode.
"""
a, b = T.dscalars('a', 'b')
c = a + b
func = theano.function([theano.In(a, name='first'), theano.In(b, value=1, name='second')], c)
x = func(first=1)
try:
func(second=2)
except TypeError:
assert(func(first=1) == x)
class T_picklefunction(unittest.TestCase):
......@@ -753,7 +765,7 @@ class T_picklefunction(unittest.TestCase):
assert f2.container[s].storage is f1.container[s].storage
# now put in a function with non-scalar
v_value = numpy.asarray([2, 3, 4.], dtype=config.floatX)
v_value = np.asarray([2, 3, 4.], dtype=config.floatX)
f3 = function([x, In(v, value=v_value)], x + v)
list_of_things.append(f3)
......@@ -800,13 +812,13 @@ class T_picklefunction(unittest.TestCase):
assert nl[5](3) == ol[5](3)
assert nl[4].value[nl[0]] == 6
assert numpy.all(nl[6][nl[2]] == numpy.asarray([2, 3., 4]))
assert np.all(nl[6][nl[2]] == np.asarray([2, 3., 4]))
def test_broken_pickle_with_shared(self):
saves = []
def pers_save(obj):
if isinstance(obj, numpy.ndarray):
if isinstance(obj, np.ndarray):
saves.append(obj)
return len(saves) - 1
else:
......@@ -815,7 +827,7 @@ class T_picklefunction(unittest.TestCase):
        def pers_load(id):
            # Persistent-load counterpart of pers_save: look up the
            # object stored under this index in the closed-over
            # ``saves`` list.  ``id`` shadows the builtin, but the
            # pickle persistent_load protocol calls it positionally,
            # so the name is cosmetic only.
            return saves[id]
b = numpy.random.rand(5, 4)
b = np.random.rand(5, 4)
x = theano.tensor.matrix()
y = theano.shared(b)
......
from __future__ import absolute_import, print_function, division
import numpy
import numpy as np
import unittest
from theano.compile.pfunc import pfunc
......@@ -20,8 +20,8 @@ class NNet(object):
self.input = input
self.target = target
self.lr = shared(lr, 'learning_rate')
self.w1 = shared(numpy.zeros((n_hidden, n_input)), 'w1')
self.w2 = shared(numpy.zeros((n_output, n_hidden)), 'w2')
self.w1 = shared(np.zeros((n_hidden, n_input)), 'w1')
self.w2 = shared(np.zeros((n_output, n_hidden)), 'w2')
# print self.lr.type
self.hidden = sigmoid(tensor.dot(self.w1, self.input))
......@@ -45,7 +45,7 @@ class NNet(object):
class TestNnet(unittest.TestCase):
def test_nnet(self):
rng = numpy.random.RandomState(1827)
rng = np.random.RandomState(1827)
data = rng.rand(10, 4)
nnet = NNet(n_input=3, n_hidden=10)
for epoch in range(3):
......@@ -60,4 +60,4 @@ class TestNnet(unittest.TestCase):
self.assertTrue(abs(mean_cost - 0.20588975452) < 1e-6)
# Just call functions to make sure they do not crash.
nnet.compute_output(input)
nnet.output_from_hidden(numpy.ones(10))
nnet.output_from_hidden(np.ones(10))
from __future__ import absolute_import, print_function, division
import numpy
import numpy as np
import theano
......@@ -12,7 +12,7 @@ def test_detect_nan():
def detect_nan(i, node, fn):
for output in fn.outputs:
if numpy.isnan(output[0]).any():
if np.isnan(output[0]).any():
print('*** NaN detected ***')
theano.printing.debugprint(node)
print('Inputs : %s' % [input[0] for input in fn.inputs])
......@@ -36,7 +36,7 @@ def test_optimizer():
def detect_nan(i, node, fn):
for output in fn.outputs:
if numpy.isnan(output[0]).any():
if np.isnan(output[0]).any():
print('*** NaN detected ***')
theano.printing.debugprint(node)
print('Inputs : %s' % [input[0] for input in fn.inputs])
......@@ -65,7 +65,7 @@ def test_not_inplace():
def detect_nan(i, node, fn):
for output in fn.outputs:
if numpy.isnan(output[0]).any():
if np.isnan(output[0]).any():
print('*** NaN detected ***')
theano.printing.debugprint(node)
print('Inputs : %s' % [input[0] for input in fn.inputs])
......
......@@ -6,7 +6,7 @@ from __future__ import absolute_import, print_function, division
import logging
from nose.tools import assert_raises
import numpy
import numpy as np
from theano.compile.nanguardmode import NanGuardMode
import theano
......@@ -18,20 +18,20 @@ def test_NanGuardMode():
# intentionally. A working implementation should be able to capture all
# the abnormalties.
x = T.matrix()
w = theano.shared(numpy.random.randn(5, 7).astype(theano.config.floatX))
w = theano.shared(np.random.randn(5, 7).astype(theano.config.floatX))
y = T.dot(x, w)
fun = theano.function(
[x], y,
mode=NanGuardMode(nan_is_error=True, inf_is_error=True)
)
a = numpy.random.randn(3, 5).astype(theano.config.floatX)
infa = numpy.tile(
(numpy.asarray(100.) ** 1000000).astype(theano.config.floatX), (3, 5))
nana = numpy.tile(
numpy.asarray(numpy.nan).astype(theano.config.floatX), (3, 5))
biga = numpy.tile(
numpy.asarray(1e20).astype(theano.config.floatX), (3, 5))
a = np.random.randn(3, 5).astype(theano.config.floatX)
infa = np.tile(
(np.asarray(100.) ** 1000000).astype(theano.config.floatX), (3, 5))
nana = np.tile(
np.asarray(np.nan).astype(theano.config.floatX), (3, 5))
biga = np.tile(
np.asarray(1e20).astype(theano.config.floatX), (3, 5))
fun(a) # normal values
......@@ -46,14 +46,14 @@ def test_NanGuardMode():
_logger.propagate = True
# slices
a = numpy.random.randn(3, 4, 5).astype(theano.config.floatX)
infa = numpy.tile(
(numpy.asarray(100.) ** 1000000).astype(theano.config.floatX),
a = np.random.randn(3, 4, 5).astype(theano.config.floatX)
infa = np.tile(
(np.asarray(100.) ** 1000000).astype(theano.config.floatX),
(3, 4, 5))
nana = numpy.tile(
numpy.asarray(numpy.nan).astype(theano.config.floatX), (3, 4, 5))
biga = numpy.tile(
numpy.asarray(1e20).astype(theano.config.floatX), (3, 4, 5))
nana = np.tile(
np.asarray(np.nan).astype(theano.config.floatX), (3, 4, 5))
biga = np.tile(
np.asarray(1e20).astype(theano.config.floatX), (3, 4, 5))
x = T.tensor3()
y = x[:, T.arange(2), T.arange(2)]
......
......@@ -9,7 +9,6 @@ from theano.tests import unittest_tools as utt
from theano import function
import theano
from theano.tensor import dmatrix, dvector
from numpy import allclose
from theano.compile import as_op
import pickle
......@@ -34,7 +33,7 @@ class OpDecoratorTests(utt.InferShapeTester):
r = fn([[1.5, 5], [2, 2]])
r0 = np.array([1.5, 7.5, 15., 30.])
assert allclose(r, r0), (r, r0)
assert np.allclose(r, r0), (r, r0)
def test_2arg(self):
x = dmatrix('x')
......@@ -50,7 +49,7 @@ class OpDecoratorTests(utt.InferShapeTester):
r = fn([[1.5, 5], [2, 2]], [1, 100, 2, 200])
r0 = np.array([2.5, 107.5, 17., 230.])
assert allclose(r, r0), (r, r0)
assert np.allclose(r, r0), (r, r0)
def test_infer_shape(self):
x = dmatrix('x')
......
......@@ -6,7 +6,7 @@ from __future__ import absolute_import, print_function, division
import unittest
import numpy
import numpy as np
import theano
from six.moves import StringIO
......@@ -45,7 +45,7 @@ class Test_profiling(unittest.TestCase):
f = theano.function(x, z, profile=p, name="test_profiling",
mode=m)
inp = [numpy.arange(1024, dtype='float32') + 1 for i in range(len(x))]
inp = [np.arange(1024, dtype='float32') + 1 for i in range(len(x))]
f(*inp)
buf = StringIO()
......
......@@ -126,6 +126,12 @@ AddConfigVar(
BoolParam(False, allow_override=False),
in_c_key=False)
AddConfigVar(
'print_global_stats',
"Print some global statistics (time spent) at the end",
BoolParam(False),
in_c_key=False)
class ContextsParam(ConfigParam):
def __init__(self):
......@@ -1111,7 +1117,7 @@ AddConfigVar('optdb.position_cutoff',
AddConfigVar('optdb.max_use_ratio',
'A ratio that prevent infinite loop in EquilibriumOptimizer.',
FloatParam(5),
FloatParam(8),
in_c_key=False)
AddConfigVar('gcc.cxxflags',
......
......@@ -2510,10 +2510,14 @@ class EquilibriumOptimizer(NavigatorOptimizer):
end_nb_nodes = len(fgraph.apply_nodes)
if max_use_abort:
_logger.error("EquilibriumOptimizer max'ed out by '%s'" % opt_name +
". You can safely raise the current threshold of " +
"%f with the theano flag 'optdb.max_use_ratio'." %
config.optdb.max_use_ratio)
msg = ("EquilibriumOptimizer max'ed out by '%s'" % opt_name +
". You can safely raise the current threshold of " +
"%f with the theano flag 'optdb.max_use_ratio'." %
config.optdb.max_use_ratio)
if theano.config.on_opt_error == 'raise':
raise AssertionError(msg)
else:
_logger.error(msg)
fgraph.remove_feature(change_tracker)
assert len(loop_process_count) == len(loop_timing)
assert len(loop_process_count) == len(global_opt_timing)
......
......@@ -571,6 +571,7 @@ class TestEquilibrium(object):
opt.optimize(g)
assert str(g) == '[Op2(x, y)]'
@theano.configparser.change_flags(on_opt_error='ignore')
def test_low_use_ratio(self):
x, y, z = map(MyVariable, 'xyz')
e = op3(op4(x, y))
......
......@@ -503,6 +503,8 @@ def hist(coll):
return counts
@deprecated("theano.gof.utils",
msg="Use a_theano_variable.auto_name instead")
def give_variables_names(variables):
"""
Gives unique names to an iterable of variables. Modifies input.
......
......@@ -482,7 +482,7 @@ class Stack(VM):
try:
_, dt = self.run_thunk_of_node(current_apply)
del _
if config.profile:
if config.profile or config.print_global_stats:
current_idx = self.node_idx[current_apply]
self.call_counts[current_idx] += 1
self.call_times[current_idx] += dt
......@@ -596,7 +596,7 @@ class Stack(VM):
if current_apply.inputs[r].owner:
apply_stack.append(current_apply.inputs[r].owner)
else:
if config.profile:
if config.profile or config.print_global_stats:
for (idx, o) in enumerate(thunks[
self.node_idx[current_apply]].outputs):
var = self.nodes[
......@@ -757,7 +757,7 @@ class VM_Linker(link.LocalLinker):
associated to self, else, a new VM_Linker associated to fgraph.
"""
if (config.profile and
if ((config.profile or config.print_global_stats) and
((hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled) or
......@@ -856,7 +856,7 @@ class VM_Linker(link.LocalLinker):
pre_call_clear = [storage_map[v] for v in self.no_recycling]
if (self.callback is not None or self.callback_input is not None or
(config.profile and config.profile_memory) or
((config.profile or config.print_global_stats) and config.profile_memory) or
(self.allow_partial_eval and not self.use_cloop)):
if self.use_cloop and (self.callback is not None or
......@@ -1086,7 +1086,7 @@ class VM_Linker(link.LocalLinker):
lazy = config.vm.lazy
if lazy is None:
lazy = not all([(not th.lazy) for th in thunks])
if not (lazy or (config.profile and config.profile_memory) or
if not (lazy or ((config.profile or config.print_global_stats) and config.profile_memory) or
self.use_cloop or self.callback or self.callback_input):
for pair in itervalues(reallocated_info):
storage_map[pair[1]] = storage_map[pair[0]]
......
The diff is collapsed.
......@@ -2,8 +2,19 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuArrayObject **x_mean,
PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) {
npy_float64 running_average_factor,
#ifdef RUNNING_AVERAGES
PyGpuArrayObject *in_running_mean,
PyGpuArrayObject *in_running_var,
#endif
PyGpuArrayObject **outp,
PyGpuArrayObject **x_mean,
PyGpuArrayObject **x_invstd,
#ifdef RUNNING_AVERAGES
PyGpuArrayObject **out_running_mean,
PyGpuArrayObject **out_running_var,
#endif
cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
......@@ -11,11 +22,19 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
if (c_set_tensorNd(scale, bn_params) != 0)
return 1;
if (epsilon < 1e-5)
if (epsilon < 1e-5) {
PyErr_Format(PyExc_ValueError, "epsilon must be at least 1e-5, got %f", epsilon);
return 1;
}
#ifdef INPLACE_OUTPUT
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
#else
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
#endif
if (theano_prep_output(x_mean, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (theano_prep_output(x_invstd, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
......@@ -24,6 +43,31 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
if (c_set_tensorNd(*outp, bn_output) != 0)
return 1;
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(out_running_mean);
PyGpuArrayObject *running_mean = in_running_mean;
Py_INCREF(running_mean);
#else
PyGpuArrayObject *running_mean = *out_running_mean;
running_mean = theano_try_copy(running_mean, in_running_mean);
if (running_mean == NULL) {
return 1;
}
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(out_running_var);
PyGpuArrayObject *running_var = in_running_var;
Py_INCREF(running_var);
#else
PyGpuArrayObject *running_var = *out_running_var;
running_var = theano_try_copy(running_var, in_running_var);
if (running_var == NULL) {
return 1;
}
#endif
#endif
{
const float falpha = 1.;
const float fbeta = 0.;
......@@ -50,9 +94,15 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
bn_params,
PyGpuArray_DEV_DATA(scale),
PyGpuArray_DEV_DATA(bias),
#ifdef RUNNING_AVERAGES
running_average_factor,
PyGpuArray_DEV_DATA(running_mean),
PyGpuArray_DEV_DATA(running_var),
#else
0,
NULL, // running mean, deliberately unused
NULL, // running var, deliberately unused
#endif
epsilon,
PyGpuArray_DEV_DATA(*x_mean),
PyGpuArray_DEV_DATA(*x_invstd)
......@@ -62,6 +112,10 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
cudnnGetErrorString(err));
return 1;
}
#ifdef RUNNING_AVERAGES
*out_running_mean = running_mean;
*out_running_var = running_var;
#endif
}
return 0;
}
......@@ -34,8 +34,10 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
if (c_set_tensorNd(scale, bn_params) != 0)
return 1;
if (epsilon < 1e-5)
if (epsilon < 1e-5) {
PyErr_Format(PyExc_ValueError, "epsilon must be at least 1e-5, got %f", epsilon);
return 1;
}
if (theano_prep_output(dinp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
......
......@@ -11,11 +11,19 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
if (c_set_tensorNd(scale, bn_params) != 0)
return 1;
if (epsilon < 1e-5)
if (epsilon < 1e-5) {
PyErr_Format(PyExc_ValueError, "epsilon must be at least 1e-5, got %f", epsilon);
return 1;
}
#ifdef INPLACE_OUTPUT
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
#else
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
#endif
if (c_set_tensorNd(*outp, bn_output) != 0)
return 1;
......
......@@ -252,3 +252,7 @@ class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
self.constant_tensor = gpuarray.array(
np.zeros((3, 5, 7, 11), dtype='float32'),
context=get_context(test_ctx_name))
class TestConv2dTranspose(test_abstract_conv.TestConv2dTranspose):
    # Re-run the abstract conv2d_transpose test suite, but compiled
    # with the GPU mode so the GPU implementation is exercised.
    mode = mode_with_gpu
......@@ -13,7 +13,7 @@ import time
from optparse import OptionParser
import subprocess
import numpy
import numpy as np
import theano
import theano.tensor as T
......@@ -47,10 +47,10 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
print()
print('Numpy config: (used when the Theano flag'
' "blas.ldflags" is empty)')
numpy.show_config()
print('Numpy dot module:', numpy.dot.__module__)
print('Numpy location:', numpy.__file__)
print('Numpy version:', numpy.__version__)
np.show_config()
print('Numpy dot module:', np.dot.__module__)
print('Numpy location:', np.__file__)
print('Numpy version:', np.__version__)
if (theano.config.device.startswith("gpu") or
theano.config.init_gpu_device.startswith("gpu")):
print('nvcc version:')
......@@ -58,12 +58,12 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
"--version"))
print()
a = theano.shared(numpy.ones((M, N), dtype=theano.config.floatX,
order=order))
b = theano.shared(numpy.ones((N, K), dtype=theano.config.floatX,
order=order))
c = theano.shared(numpy.ones((M, K), dtype=theano.config.floatX,
order=order))
a = theano.shared(np.ones((M, N), dtype=theano.config.floatX,
order=order))
b = theano.shared(np.ones((N, K), dtype=theano.config.floatX,
order=order))
c = theano.shared(np.ones((M, K), dtype=theano.config.floatX,
order=order))
f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])
if any([x.op.__class__.__name__ == 'Gemm' for x in
......
......@@ -9,7 +9,7 @@ from __future__ import absolute_import, print_function, division
import threading
import time
import numpy
import numpy as np
import theano
from theano.gpuarray import init_dev
......@@ -21,7 +21,7 @@ def main(dev1, dev2):
init_dev(dev2, 'ctx2')
size = 1024 * 16
data = numpy.random.randn(size, size).astype('float32')
data = np.random.randn(size, size).astype('float32')
val1a = theano.shared(data, target='ctx1')
val1b = theano.shared(data, target='ctx1')
val1c = theano.shared(data, target='ctx1')
......
......@@ -2,18 +2,18 @@ from __future__ import absolute_import, print_function, division
import time
import numpy
import numpy as np
import theano
y = theano.tensor.fvector()
x = theano.shared(numpy.zeros(1, dtype='float32'))
x = theano.shared(np.zeros(1, dtype='float32'))
f1 = theano.function([y], updates={x: y})
f2 = theano.function([], theano.sandbox.cuda.host_from_gpu(x))
print(f1.maker.fgraph.toposort())
print(f2.maker.fgraph.toposort())
for i in [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]:
o = numpy.zeros(i, dtype='float32')
o = np.zeros(i, dtype='float32')
t0 = time.time()
f1(o)
t1 = time.time()
......
......@@ -4,7 +4,7 @@ numpy version support only ndarray.
"""
from __future__ import absolute_import, print_function, division
import numpy
import numpy as np
from theano.tensor.basic import TensorType
try:
......@@ -42,8 +42,8 @@ else:
def may_share_memory(a, b, raise_other_type=True):
a_ndarray = isinstance(a, numpy.ndarray)
b_ndarray = isinstance(b, numpy.ndarray)
a_ndarray = isinstance(a, np.ndarray)
b_ndarray = isinstance(b, np.ndarray)
if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b)
a_cuda = _is_cuda(a)
......
......@@ -5,7 +5,7 @@ These pickled graphs can be used, for instance, as cases for
unit tests or regression tests.
"""
from __future__ import absolute_import, print_function, division
import numpy
import numpy as np
import os
import pickle
import sys
......@@ -188,10 +188,10 @@ class PersistentNdarrayID(object):
return name
def __call__(self, obj):
if type(obj) is numpy.ndarray:
if type(obj) is np.ndarray:
if id(obj) not in self.seen:
def write_array(f):
numpy.lib.format.write_array(f, obj)
np.lib.format.write_array(f, obj)
name = self._resolve_name(obj)
zipadd(write_array, self.zip_file, name)
self.seen[id(obj)] = 'ndarray.{0}'.format(name)
......@@ -204,7 +204,7 @@ class PersistentCudaNdarrayID(PersistentNdarrayID):
type(obj) is cuda_ndarray.cuda_ndarray.CudaNdarray):
if id(obj) not in self.seen:
def write_array(f):
numpy.lib.format.write_array(f, numpy.asarray(obj))
np.lib.format.write_array(f, np.asarray(obj))
name = self._resolve_name(obj)
zipadd(write_array, self.zip_file, name)
self.seen[id(obj)] = 'cuda_ndarray.{0}'.format(name)
......@@ -283,7 +283,7 @@ class PersistentNdarrayLoad(object):
if name in self.cache:
return self.cache[name]
ret = None
array = numpy.lib.format.read_array(self.zip_file.open(name))
array = np.lib.format.read_array(self.zip_file.open(name))
if array_type == 'cuda_ndarray':
if config.experimental.unpickle_gpu_on_cpu:
# directly return numpy array
......@@ -335,10 +335,10 @@ def dump(obj, file_handler, protocol=DEFAULT_PROTOCOL,
>>> foo_1 = theano.shared(0, name='foo')
>>> foo_2 = theano.shared(1, name='foo')
>>> with open('model.zip', 'wb') as f:
... dump((foo_1, foo_2, numpy.array(2)), f)
>>> numpy.load('model.zip').keys()
... dump((foo_1, foo_2, np.array(2)), f)
>>> np.load('model.zip').keys()
['foo', 'foo_2', 'array_0', 'pkl']
>>> numpy.load('model.zip')['foo']
>>> np.load('model.zip')['foo']
array(0)
>>> with open('model.zip', 'rb') as f:
... foo_1, foo_2, array = load(f)
......
......@@ -22,7 +22,7 @@ TheanoElementwiseKernel.
from __future__ import absolute_import, print_function, division
from itertools import chain
import numpy
import numpy as np
import theano
from six.moves import xrange
......@@ -257,13 +257,13 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
" inputs don't have the same shape!")
if inputs[0].size > 512:
grid = (int(numpy.ceil(inputs[0].size / 512.)), 1)
grid = (int(np.ceil(inputs[0].size / 512.)), 1)
block = (512, 1, 1)
else:
grid = (1, 1)
block = (inputs[0].shape[0], inputs[0].shape[1], 1)
self.pycuda_fct(inputs[0], inputs[1], z[0],
numpy.intc(inputs[1].size), block=block, grid=grid)
np.intc(inputs[1].size), block=block, grid=grid)
class PycudaElemwiseSourceModuleMakeThunkOp(Op):
......@@ -349,13 +349,13 @@ class PycudaElemwiseSourceModuleMakeThunkOp(Op):
" inputs don't have the same shape!")
if inputs[0][0].size > 512:
grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
grid = (int(np.ceil(inputs[0][0].size / 512.)), 1)
block = (512, 1, 1)
else:
grid = (1, 1)
block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
pycuda_fct(inputs[0][0], inputs[1][0], z[0],
numpy.intc(inputs[1][0].size), block=block,
np.intc(inputs[1][0].size), block=block,
grid=grid)
thunk.inputs = inputs
thunk.outputs = outputs
......
......@@ -3,7 +3,7 @@ Helper function to safely convert an array to a new data type.
"""
from __future__ import absolute_import, print_function, division
import numpy
import numpy as np
import theano
......@@ -30,8 +30,8 @@ def _asarray(a, dtype, order=None):
"""
if str(dtype) == 'floatX':
dtype = theano.config.floatX
dtype = numpy.dtype(dtype) # Convert into dtype object.
rval = numpy.asarray(a, dtype=dtype, order=order)
dtype = np.dtype(dtype) # Convert into dtype object.
rval = np.asarray(a, dtype=dtype, order=order)
# Note that dtype comparison must be done by comparing their `num`
# attribute. One cannot assume that two identical data types are pointers
# towards the same object (e.g. under Windows this appears not to be the
......
from __future__ import absolute_import, print_function, division
import numpy
import numpy as np
import theano
from theano.misc.cudamat_utils import cudamat_available
......@@ -20,7 +20,7 @@ def test(shape=(3, 4)):
U = gpu(theano.tensor.fmatrix('U'))
ii = theano.function([U], gpu(U + 1))
A_cpu = numpy.asarray(numpy.random.rand(*shape), dtype="float32")
A_cpu = np.asarray(np.random.rand(*shape), dtype="float32")
A_cnd = theano.sandbox.cuda.CudaNdarray(A_cpu)
A_cmat = cudandarray_to_cudamat(A_cnd)
......@@ -28,9 +28,9 @@ def test(shape=(3, 4)):
B_cnd = ii(A_cnd)
u = A_cnd.copy()
u += theano.sandbox.cuda.CudaNdarray(numpy.asarray([[1]], dtype='float32'))
u = numpy.asarray(u)
v = numpy.asarray(B_cnd)
u += theano.sandbox.cuda.CudaNdarray(np.asarray([[1]], dtype='float32'))
u = np.asarray(u)
v = np.asarray(B_cnd)
w = A_cmat.add(1).asarray()
assert abs(u - v).max() == 0
......
from __future__ import absolute_import, print_function, division
import numpy
import numpy as np
import theano
from theano.misc.gnumpy_utils import gnumpy_available
......@@ -31,11 +31,10 @@ def test(shape=(3, 4, 5)):
B_cnd = ii(A_cnd)
B = cudandarray_to_garray(B_cnd)
assert A_cnd.shape == A.shape
from numpy import array
u = (A + 1).asarray()
v = B.asarray()
w = array(B_cnd)
w = np.array(B_cnd)
assert (u == v).all()
assert (u == w).all()
......@@ -49,7 +48,7 @@ def test2(shape=(3, 4, 5)):
U = gpu(theano.tensor.ftensor3('U'))
theano.function([U], gpu(U + 1))
A = numpy.random.rand(*shape).astype('float32')
A = np.random.rand(*shape).astype('float32')
A_cnd = theano.sandbox.cuda.CudaNdarray(A)
A_gar = cudandarray_to_garray(A_cnd)
assert A_cnd.shape == A_gar.shape
......@@ -62,7 +61,7 @@ def test2(shape=(3, 4, 5)):
# dtype always float32
assert A_cnd._strides == B._strides
assert A_cnd.gpudata == B.gpudata
v = numpy.asarray(B)
v = np.asarray(B)
assert (v == A).all()
......
......@@ -3,7 +3,7 @@ test the tensor and sparse type. The CudaNdarray type is tested in
sandbox/cuda/tests/test_tensor_op.py.test_may_share_memory_cuda
"""
from __future__ import absolute_import, print_function, division
import numpy
import numpy as np
import theano
try:
......@@ -16,8 +16,8 @@ from theano.misc.may_share_memory import may_share_memory
def test_may_share_memory():
a = numpy.random.rand(5, 4)
b = numpy.random.rand(5, 4)
a = np.random.rand(5, 4)
b = np.random.rand(5, 4)
va = a.view()
vb = b.view()
ra = a.reshape((4, 5))
......
......@@ -4,8 +4,7 @@ import shutil
import unittest
from tempfile import mkdtemp
import numpy
from numpy.testing import assert_allclose
import numpy as np
from nose.plugins.skip import SkipTest
import theano
......@@ -44,7 +43,7 @@ class T_dump_load(unittest.TestCase):
x = load(f)
assert x.name == 'x'
assert_allclose(x.get_value(), [[1]])
np.testing.assert_allclose(x.get_value(), [[1]])
def test_dump_load_mrg(self):
rng = MRG_RandomStreams(use_cuda=cuda_ndarray.cuda_enabled)
......@@ -62,14 +61,14 @@ class T_dump_load(unittest.TestCase):
foo_2 = theano.shared(1, name='foo')
foo_3 = theano.shared(2, name='foo')
with open('model.zip', 'wb') as f:
dump((foo_1, foo_2, foo_3, numpy.array(3)), f)
keys = list(numpy.load('model.zip').keys())
dump((foo_1, foo_2, foo_3, np.array(3)), f)
keys = list(np.load('model.zip').keys())
assert keys == ['foo', 'foo_2', 'foo_3', 'array_0', 'pkl']
foo_3 = numpy.load('model.zip')['foo_3']
assert foo_3 == numpy.array(2)
foo_3 = np.load('model.zip')['foo_3']
assert foo_3 == np.array(2)
with open('model.zip', 'rb') as f:
foo_1, foo_2, foo_3, array = load(f)
assert array == numpy.array(3)
assert array == np.array(3)
class TestStripPickler(unittest.TestCase):
......
from __future__ import absolute_import, print_function, division
import numpy
import numpy as np
import theano
import theano.misc.pycuda_init
......@@ -58,11 +58,11 @@ def test_pycuda_elemwise_source_module():
PycudaElemwiseSourceModuleMakeThunkOp)
for node in f4.maker.fgraph.toposort()])
val1 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
val2 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
assert numpy.allclose(f(val1, val2), f2(val1, val2))
assert numpy.allclose(f(val1, val2), f3(val1, val2))
assert numpy.allclose(f(val1, val2), f4(val1, val2))
val1 = np.asarray(np.random.rand(*shape), dtype='float32')
val2 = np.asarray(np.random.rand(*shape), dtype='float32')
assert np.allclose(f(val1, val2), f2(val1, val2))
assert np.allclose(f(val1, val2), f3(val1, val2))
assert np.allclose(f(val1, val2), f4(val1, val2))
# print f(val1,val2)
# print f2(val1,val2)
......@@ -82,10 +82,10 @@ def test_pycuda_elemwise_kernel():
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f2.maker.fgraph.toposort()])
val1 = numpy.asarray(numpy.random.rand(5, 5), dtype='float32')
val2 = numpy.asarray(numpy.random.rand(5, 5), dtype='float32')
#val1 = numpy.ones((5,5))
#val2 = numpy.arange(25).reshape(5,5)
val1 = np.asarray(np.random.rand(5, 5), dtype='float32')
val2 = np.asarray(np.random.rand(5, 5), dtype='float32')
#val1 = np.ones((5,5))
#val2 = np.arange(25).reshape(5,5)
assert (f(val1, val2) == f2(val1, val2)).all()
print(f(val1, val2))
print(f2(val1, val2))
......@@ -99,8 +99,8 @@ def test_pycuda_elemwise_kernel():
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f4.maker.fgraph.toposort()])
val1 = numpy.random.rand(2, 2, 2)
val1 = np.random.rand(2, 2, 2)
print(val1)
print(f4(val1, val1, val1))
assert numpy.allclose(f4(val1, val1, val1), val1 * val1 + val1)
assert np.allclose(f4(val1, val1, val1), val1 * val1 + val1)
"""
......@@ -8,7 +8,7 @@ from __future__ import absolute_import, print_function, division
import sys
import numpy
import numpy as np
import theano
import theano.sandbox.cuda as cuda_ndarray
......@@ -42,9 +42,9 @@ __global__ void multiply_them(float *dest, float *a, float *b)
multiply_them = mod.get_function("multiply_them")
# Test with pycuda in/out of numpy.ndarray
a = numpy.random.randn(100).astype(numpy.float32)
b = numpy.random.randn(100).astype(numpy.float32)
dest = numpy.zeros_like(a)
a = np.random.randn(100).astype(np.float32)
b = np.random.randn(100).astype(np.float32)
dest = np.zeros_like(a)
multiply_them(
drv.Out(dest), drv.In(a), drv.In(b),
block=(400, 1, 1), grid=(1, 1))
......@@ -64,8 +64,8 @@ __global__ void multiply_them(float *dest, float *a, float *b)
multiply_them = mod.get_function("multiply_them")
a = numpy.random.randn(100).astype(numpy.float32)
b = numpy.random.randn(100).astype(numpy.float32)
a = np.random.randn(100).astype(np.float32)
b = np.random.randn(100).astype(np.float32)
# Test with Theano object
ga = cuda_ndarray.CudaNdarray(a)
......@@ -73,7 +73,7 @@ __global__ void multiply_them(float *dest, float *a, float *b)
dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
multiply_them(dest, ga, gb,
block=(400, 1, 1), grid=(1, 1))
assert (numpy.asarray(dest) == a * b).all()
assert (np.asarray(dest) == a * b).all()
def test_pycuda_memory_to_theano():
......@@ -87,7 +87,7 @@ def test_pycuda_memory_to_theano():
print("gpuarray ref count before creating a CudaNdarray", end=' ')
print(sys.getrefcount(y))
assert sys.getrefcount(y) == initial_refcount
rand = numpy.random.randn(*y.shape).astype(numpy.float32)
rand = np.random.randn(*y.shape).astype(np.float32)
cuda_rand = cuda_ndarray.CudaNdarray(rand)
strides = [1]
......@@ -102,7 +102,7 @@ def test_pycuda_memory_to_theano():
z = cuda_ndarray.from_gpu_pointer(y_ptr, y.shape, strides, y)
print("gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y))
assert sys.getrefcount(y) == initial_refcount + 1
assert (numpy.asarray(z) == 0).all()
assert (np.asarray(z) == 0).all()
assert z.base is y
# Test that we can take a view from this cuda view on pycuda memory
......@@ -112,17 +112,17 @@ def test_pycuda_memory_to_theano():
del zz
assert sys.getrefcount(y) == initial_refcount + 1
cuda_ones = cuda_ndarray.CudaNdarray(numpy.asarray([[[1]]],
dtype='float32'))
cuda_ones = cuda_ndarray.CudaNdarray(np.asarray([[[1]]],
dtype='float32'))
z += cuda_ones
assert (numpy.asarray(z) == numpy.ones(y.shape)).all()
assert (numpy.asarray(z) == 1).all()
assert (np.asarray(z) == np.ones(y.shape)).all()
assert (np.asarray(z) == 1).all()
assert cuda_rand.shape == z.shape
assert cuda_rand._strides == z._strides, (cuda_rand._strides, z._strides)
assert (numpy.asarray(cuda_rand) == rand).all()
assert (np.asarray(cuda_rand) == rand).all()
z += cuda_rand
assert (numpy.asarray(z) == (rand + 1)).all()
assert (np.asarray(z) == (rand + 1)).all()
# Check that the ref count to the gpuarray is right.
del z
......
from __future__ import absolute_import, print_function, division
import numpy
import numpy as np
import theano.sandbox.cuda as cuda
import theano.misc.pycuda_init
......@@ -22,30 +22,30 @@ def test_to_gpuarray():
px = to_gpuarray(cx)
assert isinstance(px, pycuda.gpuarray.GPUArray)
cx[0, 0] = numpy.asarray(1, dtype="float32")
cx[0, 0] = np.asarray(1, dtype="float32")
# Check that they share the same memory space
assert px.gpudata == cx.gpudata
assert numpy.asarray(cx[0, 0]) == 1
assert np.asarray(cx[0, 0]) == 1
assert numpy.allclose(numpy.asarray(cx), px.get())
assert np.allclose(np.asarray(cx), px.get())
assert px.dtype == cx.dtype
assert px.shape == cx.shape
assert all(numpy.asarray(cx._strides) * 4 == px.strides)
assert all(np.asarray(cx._strides) * 4 == px.strides)
# Test when the CudaNdarray is strided
cx = cx[::2, ::]
px = to_gpuarray(cx, copyif=True)
assert isinstance(px, pycuda.gpuarray.GPUArray)
cx[0, 0] = numpy.asarray(2, dtype="float32")
cx[0, 0] = np.asarray(2, dtype="float32")
# Check that they do not share the same memory space
assert px.gpudata != cx.gpudata
assert numpy.asarray(cx[0, 0]) == 2
assert not numpy.allclose(numpy.asarray(cx), px.get())
assert np.asarray(cx[0, 0]) == 2
assert not np.allclose(np.asarray(cx), px.get())
assert px.dtype == cx.dtype
assert px.shape == cx.shape
assert not all(numpy.asarray(cx._strides) * 4 == px.strides)
assert not all(np.asarray(cx._strides) * 4 == px.strides)
# Test that we return an error
try:
......@@ -59,11 +59,11 @@ def test_to_cudandarray():
px = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
cx = to_cudandarray(px)
assert isinstance(cx, cuda.CudaNdarray)
assert numpy.allclose(px.get(),
numpy.asarray(cx))
assert np.allclose(px.get(),
np.asarray(cx))
assert px.dtype == cx.dtype
assert px.shape == cx.shape
assert all(numpy.asarray(cx._strides) * 4 == px.strides)
assert all(np.asarray(cx._strides) * 4 == px.strides)
try:
px = pycuda.gpuarray.zeros((3, 4, 5), 'float64')
......@@ -73,7 +73,7 @@ def test_to_cudandarray():
pass
try:
to_cudandarray(numpy.zeros(4))
to_cudandarray(np.zeros(4))
assert False
except ValueError:
pass
......@@ -12,7 +12,7 @@ import warnings
import theano
from theano.compat import get_unbound_function
from theano.compile import optdb
from theano.gof import EquilibriumDB, SequenceDB
from theano.gof import EquilibriumDB, SequenceDB, TopoOptimizer
from theano.gof.cmodule import get_lib_extension
from theano.gof.compilelock import get_lock, release_lock
from theano import config
......@@ -40,6 +40,17 @@ def register_opt(*tags, **kwargs):
return f
def register_inplace(*tags, **kwargs):
def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__
optdb.register(
name, TopoOptimizer(
local_opt, failure_callback=TopoOptimizer.warn_inplace),
60, 'fast_run', 'inplace', 'gpu', *tags)
return local_opt
return f
_logger_name = 'theano.sandbox.cuda'
_logger = logging.getLogger(_logger_name)
......
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论