Merge pull request #5054 from nouiz/profile_mode

Remove ProfileMode

Merge pull request #5054 from nouiz/profile_mode
5bdb4a0b · abergeron · GitHub · 195517c4 · 0a7b12af · 5bdb4a0b
--- a/Theano.pyproj
+++ b/Theano.pyproj
@@ -34,7 +34,6 @@
    <Compile Include="theano\compile\mode.py" />
    <Compile Include="theano\compile\module.py" />
    <Compile Include="theano\compile\pfunc.py" />
-    <Compile Include="theano\compile\profilemode.py" />
    <Compile Include="theano\compile\profiling.py" />
    <Compile Include="theano\compile\sandbox\__init__.py" />
    <Compile Include="theano\compile\sharedvalue.py" />

--- a/benchmark/autoencoder/aa.py
+++ b/benchmark/autoencoder/aa.py
@@ -8,7 +8,7 @@ import theano
 import theano.tensor as T
 import theano.sandbox
 from six.moves import xrange
-from theano.compile import module, Mode, ProfileMode
+from theano.compile import module, Mode
 from theano import gof, Op, Apply
 from theano.tensor import blas, opt
@@ -191,7 +191,6 @@ class M(module.Module):
 mod = M()
 mode = 'FAST_RUN'
-#mode = ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
 mode = Mode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker(nice_errors=True))
 mode = Mode(optimizer='fast_run', linker='c')
 mode = Mode(optimizer='fast_run', linker='c|py')

--- a/benchmark/regression/regression.py
+++ b/benchmark/regression/regression.py
@@ -91,7 +91,6 @@ class PrintEverythingMode(theano.Mode):
 def test_module_advanced_example():
-    profmode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
    profmode = PrintEverythingMode(theano.gof.OpWiseCLinker(), 'fast_run')
    data_x = N.random.randn(4, 10)

--- a/doc/library/compile/index.txt
+++ b/doc/library/compile/index.txt
@@ -19,7 +19,6 @@
    ops
    mode
    debugmode
-    profilemode
    nanguardmode

--- a/doc/library/compile/mode.txt
+++ b/doc/library/compile/mode.txt
@@ -21,10 +21,8 @@ Theano defines the following modes by name:
 - ``'FAST_COMPILE'``: Apply just a few graph optimizations and only use Python implementations.
 - ``'FAST_RUN'``: Apply all optimizations, and use C implementations where possible.
 - ``'DebugMode'``: A mode for debugging. See :ref:`DebugMode <debugmode>` for details.
- ``'ProfileMode'``: Deprecated, use the Theano flag :attr:`config.profile`.
 - ``'NanGuardMode``: :ref:`Nan detector <nanguardmode>`
 - ``'DEBUG_MODE'``: Deprecated. Use the string DebugMode.
- ``'PROFILE_MODE'``: Deprecated, use the Theano flag :attr:`config.profile`.
 The default mode is typically ``FAST_RUN``, but it can be controlled via the
 configuration variable :attr:`config.mode`, which can be

--- a/doc/library/compile/profilemode.txt
+++ b/doc/library/compile/profilemode.txt
+:orphan:
 .. _profilemode:
@@ -16,203 +17,4 @@ Guide
 .. note::
-    ProfileMode is deprecated. Use :attr:`config.profile` instead.
+    ProfileMode has been removed. Use :attr:`config.profile` instead.
-To profile a Theano graph, a special mode called ProfileMode, must be passed as
-an argument when compiling your graph. Using ProfileMode is a three-step
-process.
-Creating a ProfileMode Instance
-------------------------------
-First create a ProfileMode instance.
->>> import theano
->>> from theano import ProfileMode
->>> profmode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
-The ProfileMode constructor takes as input an optimizer and a
-linker. Which optimizer and linker to use will depend on the
-application. For example, a user wanting to profile the Python
-implementation only, should use the gof.PerformLinker (or "py" for
-short). On the other hand, a user wanting to profile his graph using C
-implementations wherever possible should use the ``gof.OpWiseCLinker``
-(or "c|py").
-In the same manner, modifying which optimizer is passed to ProfileMode
-will decide which optimizations are applied to the graph, prior to
-profiling. Changing the optimizer should be especially useful when
-developing new graph optimizations, in order to evaluate their impact
-on performance. Also keep in mind that optimizations might change the
-computation graph a lot, meaning that you might not recognize some of
-the operations that are profiled (you did not use them explicitly but
-an optimizer decided to use it to improve performance or numerical
-stability). If you cannot easily relate the output of ProfileMode with
-the computations you defined, you might want to try setting optimizer
-to None (but keep in mind the computations will be slower than if they
-were optimized).
-Note that most users will want to use ProfileMode to optimize their
-graph and find where most of the computation time is being spent. In
-this context, 'fast_run' optimizer and ``gof.OpWiseCLinker`` are the
-most appropriate choices.
-Compiling your Graph with ProfileMode
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Once the ProfileMode instance is created, simply compile your graph as you
-would normally, by specifying the mode parameter.
-.. testsetup::
-   import theano
-   input1, input2 = theano.tensor.scalars(2)
-   output1 = input1+input2
->>> # with functions
->>> f = theano.function([input1,input2],[output1], mode=profmode)
-Retrieving Timing Information
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Once your graph is compiled, simply run the program or operation you wish to
-profile, then call ``profmode.print_summary()``. This will provide you with
-the desired timing information, indicating where your graph is spending most
-of its time.
-This is best shown through an example.
-Lets use the example of logistic
-regression.  (Code for this example is in the file
-``benchmark/regression/regression.py``.)
-Compiling the module with ProfileMode and calling ``profmode.print_summary()``
-generates the following output:
-.. code-block:: python
-    """
-    ProfileMode.print_summary()
-    ---------------------------
-    local_time 0.0749197006226 (Time spent running thunks)
-    Apply-wise summary: <fraction of local_time spent at this position> (<Apply position>, <Apply Op name>)
-            0.069   15      _dot22
-            0.064   1       _dot22
-            0.053   0       InplaceDimShuffle{x,0}
-            0.049   2       InplaceDimShuffle{1,0}
-            0.049   10      mul
-            0.049   6       Elemwise{ScalarSigmoid{output_types_preference=<theano.scalar.basic.transfer_type object at 0x171e650>}}[(0, 0)]
-            0.049   3       InplaceDimShuffle{x}
-            0.049   4       InplaceDimShuffle{x,x}
-            0.048   14      Sum{0}
-            0.047   7       sub
-            0.046   17      mul
-            0.045   9       sqr
-            0.045   8       Elemwise{sub}
-            0.045   16      Sum
-            0.044   18      mul
-       ... (remaining 6 Apply instances account for 0.25 of the runtime)
-    Op-wise summary: <fraction of local_time spent on this kind of Op> <Op name>
-            0.139   * mul
-            0.134   * _dot22
-            0.092   * sub
-            0.085   * Elemwise{Sub{output_types_preference=<theano.scalar.basic.transfer_type object at 0x1779f10>}}[(0, 0)]
-            0.053   * InplaceDimShuffle{x,0}
-            0.049   * InplaceDimShuffle{1,0}
-            0.049   * Elemwise{ScalarSigmoid{output_types_preference=<theano.scalar.basic.transfer_type object at 0x171e650>}}[(0, 0)]
-            0.049   * InplaceDimShuffle{x}
-            0.049   * InplaceDimShuffle{x,x}
-            0.048   * Sum{0}
-            0.045   * sqr
-            0.045   * Sum
-            0.043   * Sum{1}
-            0.042   * Elemwise{Mul{output_types_preference=<theano.scalar.basic.transfer_type object at 0x17a0f50>}}[(0, 1)]
-            0.041   * Elemwise{Add{output_types_preference=<theano.scalar.basic.transfer_type object at 0x1736a50>}}[(0, 0)]
-            0.039   * Elemwise{Second{output_types_preference=<theano.scalar.basic.transfer_type object at 0x1736d90>}}[(0, 1)]
-       ... (remaining 0 Ops account for 0.00 of the runtime)
-    (*) Op is running a c implementation
-    """
-.. note::
-    ***TODO***
-    The following text was recovered from a recent version of the source
-    file... hopefully things haven't gotten too out-of-sync!
-    The first show an Apply-wise summary, the second show an Op-wise summary, the third show an type-Op-wise summary.
-    The Apply-wise summary print the timing information for the worst
-    offending Apply nodes. This corresponds to individual Op applications
-    within your graph which take the longest to execute (so if you use dot
-    twice, you will see two entries there).
-    The Op-wise summary print the execution time of all Apply nodes
-    executing the same Op are grouped together and the total execution
-    time per Op is shown (so if you use dot twice, you will see only one
-    entry there corresponding to the sum of the time spent in each of
-    them). If two Op have different hash value, they will be separate.
-    The type-Op-wise summary group the result by type of op. So event if
-    two Op have different hash value, they will be merged.
-    Their is an hack with the Op-wise summary. Go see it if you want to know more.
-The summary has two components to it. In the first section called the
-Apply-wise summary, timing information is provided for the worst
-offending Apply nodes. This corresponds to individual Op applications
-within your graph which take the longest to execute (so if you use
-``dot`` twice, you will see two entries there). In the second portion,
-the Op-wise summary, the execution time of all Apply nodes executing
-the same Op are grouped together and the total execution time per Op
-is shown (so if you use ``dot`` twice, you will see only one entry
-there corresponding to the sum of the time spent in each of them).
-Note that the ProfileMode also shows which Ops were running a c
-implementation.
-Developers wishing to optimize the performance of their graph should
-focus on the worst offending Ops and Apply nodes -- either by optimizing an
-implementation, providing a missing C implementation, or by writing a graph
-optimization that eliminates the offending Op altogether.
-You should strongly consider emailing one of our lists about your issue before
-spending too much time on this.
-Reference
-=========
-.. class:: ProfileMode(Mode)
-    .. method:: print_summary(n_apply_to_print=None, n_ops_to_print=None)
-        Print three summaries to stdout that show where cpu time is spent during theano function executions (for all functions using this object instance).
-        :param n_apply_to_print: the number of apply nodes to print.
-           The default 15, but can be configured via ``ProfileMode.n_ops_to_print`` in :envvar:`THEANO_FLAGS`.
-        :param n_ops_to_print: the number of ops to print.
-           Default 20, or but can be configured via ``ProfileMode.n_apply_to_print`` in :envvar:`THEANO_FLAGS`.
-        :returns: None
-    .. method:: print_diff_summary(self, other, n_apply_to_print=None, n_ops_to_print=None):
-        """ As print_summary, but print the difference on two different profile mode.
-        TODO: Also we don't print the Apply-wise summary as it don't work for now.
-        TODO: make comparaison with gpu code.
-        :param other: the other instance of ProfileMode that we want to be compared to.
-        :param n_apply_to_print: the number of apply nodes to print.
-           The default 15, but can be configured via ``ProfileMode.n_ops_to_print`` in :envvar:`THEANO_FLAGS`.
-        :param n_ops_to_print: the number of ops to print.
-           Default 20, or but can be configured via ``ProfileMode.n_apply_to_print`` in :envvar:`THEANO_FLAGS`.
-        :returns: None
--- a/doc/library/config.txt
+++ b/doc/library/config.txt
@@ -315,7 +315,7 @@ import theano and print the config variable, as in:
 .. attribute:: mode
-    String value: ``'Mode'``, ``'ProfileMode'`` (deprecated), ``'DebugMode'``, ``'FAST_RUN'``,
+    String value: ``'Mode'``, ``'DebugMode'``, ``'FAST_RUN'``,
    ``'FAST_COMPILE'``
    Default: ``'Mode'``

--- a/doc/tutorial/debug_faq.txt
+++ b/doc/tutorial/debug_faq.txt
@@ -284,7 +284,7 @@ First, make sure you're running in ``FAST_RUN`` mode. Even though
 to ``theano.function`` (or ``theano.make``) or by setting :attr:`config.mode`
 to ``FAST_RUN``.
-Second, try the Theano :ref:`using_profilemode`.  This will tell you which
+Second, try Theano's :ref:`profiling <tut_profiling>`.  This will tell you which
 ``Apply`` nodes, and which ops are eating up your CPU cycles.
 Tips:

--- a/doc/tutorial/modes.txt
+++ b/doc/tutorial/modes.txt
@@ -248,13 +248,3 @@ constructor arguments. The keyword version of DebugMode (which you get by using
 is quite strict.
 For more detail, see :ref:`DebugMode<debugmode>` in the library.
-.. _using_profilemode:
-ProfileMode
-===========
-.. note::
-    ProfileMode is deprecated. Use :attr:`config.profile` instead.
--- a/doc/tutorial/profiling.txt
+++ b/doc/tutorial/profiling.txt
@@ -27,7 +27,7 @@ functions using either of the following two options:
      :attr:`profiling.n_ops` and :attr:`profiling.min_memory_size`
      to modify the quantity of information printed.
-2. Pass the argument :attr:`profile=True` to the function :func:`theano.function <function.function>`. And then call :attr:`f.profile.print_summary()` for a single function.
+2. Pass the argument :attr:`profile=True` to the function :func:`theano.function <function.function>`. Then call :attr:`f.profile.summary()` for a single function.
    - Use this option when you want to profile not all the
      functions but one or more specific function(s).
    - You can also combine the profile of many functions: 
@@ -39,7 +39,7 @@ functions using either of the following two options:
          f = theano.function(..., profile=profile)  # doctest: +SKIP
          g = theano.function(..., profile=profile)  # doctest: +SKIP
          ...  # doctest: +SKIP
-          profile.print_summary()
+          profile.summary()

--- a/theano/__init__.py
+++ b/theano/__init__.py
@@ -73,7 +73,7 @@ from theano.compile import (
    Mode,
    predefined_modes, predefined_linkers, predefined_optimizers,
    FunctionMaker, function, function_dump, OpFromGraph,
-    ProfileMode, ProfileStats,
+    ProfileStats,
    Param, shared, as_op)
 from theano.misc.safe_asarray import _asarray

--- a/theano/compile/__init__.py
+++ b/theano/compile/__init__.py
@@ -19,8 +19,6 @@ from theano.compile.monitormode import MonitorMode
 from theano.compile.profiling import ProfileStats, ScanProfileStats
-from theano.compile.profilemode import ProfileMode
 from theano.compile.sharedvalue import (shared, shared_constructor,
                                        SharedVariable)
 from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared

--- a/theano/compile/function_module.py
+++ b/theano/compile/function_module.py
@@ -1378,17 +1378,11 @@ class FunctionMaker(object):
                 output_keys=None):
        mode = theano.compile.mode.get_mode(mode)
-        # figure out which profile object to use (if any)
+        # Assert old way of working isn't used
-        # to help with forward-porting ProfileMode,
+        if getattr(mode, 'profile', None):
-        # we allow ProfileMode to provide a ProfileStats object
-        # using this somewhat awkward mechanism.
-        mode_profile = getattr(mode, 'profile', None)
-        if (profile is not None and
-                profile is not False and
-                mode_profile is not None):
            raise TypeError(
-                'profile passed via both "mode" and "profile" arguments')
+                "profile passed via 'mode'. This isn't supported anymore")
-        self.profile = profile = profile or mode_profile
+        self.profile = profile
        if profile:
            # This is very important:
            # 1) We preload the cache here to don't have its timming
@@ -1745,9 +1739,6 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False,
    - FAST_COMPILE (minimal optimization)
-    - ProfileMode(deprecated): allow to print a profile mode with
-      mode.print_summary
    - DebugMode: verify many internal conditions that are normally assumed
      (slow)

--- a/theano/compile/mode.py
+++ b/theano/compile/mode.py
@@ -391,7 +391,7 @@ def get_mode(orig_string):
                default_mode_class):
            return instantiated_default_mode
-    if string in ['Mode', 'ProfileMode', 'DebugMode', 'NanGuardMode']:
+    if string in ['Mode', 'DebugMode', 'NanGuardMode']:
        if string == 'DebugMode':
            # need to import later to break circular dependency.
            from .debugmode import DebugMode
@@ -403,9 +403,6 @@ def get_mode(orig_string):
            # NanGuardMode use its own linker.
            ret = NanGuardMode(True, True, True, optimizer=config.optimizer)
        else:
-            # This might be required if the string is 'ProfileMode'
-            from .profilemode import ProfileMode  # noqa
-            from .profilemode import prof_mode_instance_to_print
            # TODO: Can't we look up the name and invoke it rather than using eval here?
            ret = eval(string +
                       '(linker=config.linker, optimizer=config.optimizer)')
@@ -424,11 +421,6 @@ def get_mode(orig_string):
            ret = ret.requiring(*theano.config.optimizer_requiring.split(':'))
        instantiated_default_mode = ret
-    # must tell python to print the summary at the end.
-    if string == 'ProfileMode':
-        # need to import later to break circular dependency.
-        prof_mode_instance_to_print.append(ret)
    return ret

--- a/theano/compile/monitormode.py
+++ b/theano/compile/monitormode.py
@@ -79,7 +79,7 @@ class MonitorMode(Mode):
        Create a new instance of this Mode.
        Keyword arguments can be provided for the linker, but they will be
-        ignored, because ProfileMode needs to use its own linker.
+        ignored, because MonitorMode needs to use its own linker.
        """
        if optimizer == "":

--- a/theano/compile/pfunc.py
+++ b/theano/compile/pfunc.py
@@ -366,6 +366,8 @@ def pfunc(params, outputs=None, mode=None, updates=None, givens=None,
    if profile is None:
        profile = config.profile
        # profile -> True or False
+        if profile is False:
+            profile = None
    if profile is True:
        profile = ProfileStats(message=name)
        # profile -> object

--- a/theano/compile/profilemode.py
+++ b/theano/compile/profilemode.py
-from __future__ import absolute_import, print_function, division
-import atexit
-import copy
-import os
-import time
-import warnings
-import theano
-from theano.gof.link import WrapLinker
-from six import string_types, iteritems, itervalues
-from theano.compile.mode import (Mode, register_mode,
-                                 predefined_modes, predefined_linkers,
-                                 predefined_optimizers)
-from theano.configparser import config
-from theano.compile.function_module import FunctionMaker
-from .profiling import ProfileStats
-run_cthunk = None  # Will be imported only when needed.
-import_time = time.time()
-class Profile_Maker(FunctionMaker):
-    def create(self, input_storage=None, trustme=False, storage_map=None):
-        ret = super(Profile_Maker, self).create(input_storage, trustme,
-                                                storage_map)
-        if (hasattr(theano, 'sandbox') and
-                hasattr(theano.sandbox, 'cuda') and
-                theano.sandbox.cuda.cuda_enabled):
-            if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
-                raise Exception(
-                    "You are running the Theano profiler with CUDA enabled."
-                    " Theano GPU ops execution is asynchronous by default."
-                    " So by default, the profile is useless."
-                    " You must set the environment variable"
-                    " CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA driver to"
-                    " synchronize the execution to get a meaningful profile.")
-        # create a function-specific storage container for profiling info
-        profile = ProfileStats(atexit_print=False)
-        self.mode.profile_stats[ret] = profile
-        ret.profile = profile
-        # initialize the timers
-        for i, node in enumerate(ret.maker.fgraph.toposort()):
-            profile.apply_time[node] = 0.0
-            # a thunk_group is a list of the thunks from each linker
-            # corresponding to the i'th position in the toposort.
-            assert len(ret.fn.thunk_groups[i]) == 1
-            profile.apply_cimpl[node] = hasattr(
-                ret.fn.thunk_groups[i][0],
-                'cthunk')
-        # Here we replace the linker function.
-        # This ugliness makes WrapLinker (an object that *generates*
-        # functions and is not function-specific)  work with ProfileStats
-        # objects which are function-specific.
-        # capture old fn in closure. This is important since new_fn is about to
-        # take its place as ret.fn.
-        ret_fn = ret.fn
-        def new_fn():
-            self.mode.apply_time = self.mode.profile_stats[ret].apply_time
-            self.mode.variable_shape = \
-                self.mode.profile_stats[ret].variable_shape
-            ret_fn()
-            # delete the old apply_time variable
-            # because it doesn't mean the same thing anymore.
-            # This prevents old code from looking like it still works.
-            del self.mode.apply_time
-            del self.mode.variable_shape
-        ret.fn = new_fn
-        global run_cthunk
-        if run_cthunk is None and any(profile.apply_cimpl.values()):
-            # Lazy import to avoid compilation when importing theano.
-            from theano.gof.cutils import run_cthunk  # noqa
-        warnings.warn(
-            "DEPRECATION WARNING: The ProfileMode is deprecated. "
-            "Use the Theano flags/parameter to theano.function "
-            "'profile=True' instead of 'mode=ProfileMode'")
-        return ret
-class ProfileMode(Mode):
-    def __init__(self, linker=None, optimizer='default'):
-        if linker is None:
-            linker = config.linker
-        if optimizer is 'default':
-            optimizer = config.optimizer
-        message = ""
-        profile_stats = {}
-        self.__setstate__((linker,
-                           optimizer,
-                           message,
-                           profile_stats))
-    def function_maker(self, i, o, m, *args, **kwargs):
-        """
-        Return an instance of `Profiler_Maker` which init the count.
-        """
-        assert m is self
-        return Profile_Maker(i, o, self, *args, **kwargs)
-    def __get_local_time(self):
-        rval = 0
-        for ps in itervalues(self.profile_stats):
-            rval += sum(ps.apply_time.values())
-        return rval
-    local_time = property(__get_local_time)
-    def __getstate__(self):
-        # print "__getstate__",self.provided_linker,self.provided_optimizer
-        return (self.provided_linker,
-                self.provided_optimizer,
-                self.message,
-                self.profile_stats)
-    def __setstate__(self, state):
-        linker, optimizer, message, profile_stats = state
-        self.message = message
-        self.profile_stats = profile_stats
-        def profile_thunk(i, node, th):
-            """
-            Profile only the execution time.
-            """
-            global run_cthunk
-            if hasattr(th, 'cthunk'):
-                t0 = time.time()
-                failure = run_cthunk(th.cthunk)
-                dt = time.time() - t0
-                if failure:
-                    raise RuntimeError(
-                        ('A C Op raised an exception.  ProfileMode cannot'
-                         ' tell you what it was though.  Use a standard mode'
-                         ' such as FAST_RUN to correct the problem.'))
-            else:
-                t0 = time.time()
-                th()
-                dt = time.time() - t0
-            # Some Op are so fast that the time.time() resolution is
-            # insufficient to measure it.  So we add an epsilon.
-            self.apply_time[node] += max(dt, 1e-14)
-        def profile_thunk2(i, node, th):
-            """
-            Profile the execution time and the memory size.
-            """
-            global run_cthunk
-            if hasattr(th, 'cthunk'):
-                t0 = time.time()
-                failure = run_cthunk(th.cthunk)
-                dt = time.time() - t0
-                if failure:
-                    raise RuntimeError(
-                        ('A C Op raised an exception.  ProfileMode cannot'
-                         ' tell you what it was though.  Use a standard mode'
-                         ' such as FAST_RUN to correct the problem.'))
-            else:
-                t0 = time.time()
-                th()
-                dt = time.time() - t0
-            for var, data in zip(node.outputs, th.outputs):
-                sh = getattr(data[0], 'shape', 'input no shape')
-                self.variable_shape[var] = sh
-            self.apply_time[node] += max(dt, 1e-14)
-        self.provided_linker = linker
-        self.provided_optimizer = optimizer
-        if isinstance(linker, string_types) or linker is None:
-            linker = predefined_linkers[linker]
-        if not config.ProfileMode.profile_memory:
-            p_thunk = profile_thunk
-        else:
-            p_thunk = profile_thunk2
-        linker = WrapLinker([linker], p_thunk)
-        self.linker = linker
-        if isinstance(optimizer, string_types) or optimizer is None:
-            optimizer = predefined_optimizers[optimizer]
-        self._optimizer = optimizer
-        self.call_time = 0
-        self.fn_time = 0
-    def print_summary(self, **kwargs):
-        """
-        Print 3 summaries that show where time is spent. The first shows
-        an Apply-wise summary, the second an Op-wise summary and the
-        third a type-Op-wise summary.
-        The Apply-wise summary prints the timing information for the
-        worst offending Apply nodes. This corresponds to individual Op
-        applications within your graph which take the longest to
-        execute (so if you use dot twice, you will see two entries
-        there).
-        The Op-wise summary prints the execution time of all Apply
-        nodes executing the same Op grouped together and the total
-        execution time per Op is shown (so if you use dot twice, you
-        will see only one entry there corresponding to the sum of the
-        time spent in each of them). If two Ops have different hash
-        value, they will be separate.
-        The type-Op-wise summary group the result by type of op. So
-        event if two Op have different hash value, they will be
-        merged.
-        There is an hack with the Op-wise summary. Go see it if you
-        want to know more.
-        Parameters
-        ----------
-        kwargs
-            They are passed to print_summary_ expanded. Currently there is
-            n_apply_to_print, n_ops_to_print and min_memory_size that are
-            accepted.
-        """
-        compile_time = sum([ps.compile_time for ps
-                            in self.profile_stats.values()])
-        fct_call = dict([(fn, ps.fct_callcount)
-                         for (fn, ps) in iteritems(self.profile_stats)])
-        fct_call_time = dict([(fn, ps.fct_call_time)
-                              for (fn, ps) in iteritems(self.profile_stats)])
-        apply_time = {}
-        for fn, ps in iteritems(self.profile_stats):
-            for (i, node) in enumerate(fn.maker.fgraph.toposort()):
-                apply_time[(i, node)] = ps.apply_time[node]
-        for (i, n), t in iteritems(apply_time):
-            if t == 0:
-                print(i, n)
-        apply_cimpl = {}
-        for ps in itervalues(self.profile_stats):
-            apply_cimpl.update(ps.apply_cimpl)
-        message = self.message
-        variable_shape = {}
-        for ps in itervalues(self.profile_stats):
-            variable_shape.update(ps.variable_shape)
-        other_time = dict(
-            linker_time=sum(
-                [ps.linker_time for ps in self.profile_stats.values()]),
-            optimizer_time=sum(
-                [ps.optimizer_time for ps in self.profile_stats.values()]))
-        self.print_summary_("print_summary",
-                            compile_time, fct_call_time, fct_call,
-                            apply_time, apply_cimpl, message, variable_shape,
-                            self.local_time, other_time,
-                            **kwargs)
-    def print_diff_summary(self, other, **kwargs):
-        """
-        As print_summary, but print the difference on two different
-        profile mode.
-        TODO: Also we don't print the Apply-wise summary as it don't
-        work for now.
-        TODO: make comparaison with gpu code.
-        Parameters
-        ----------
-        other
-            The other instance of ProfileMode that we want to be compared to.
-        kwargs
-            They are passed to print_summary_ expanded.
-            Currently there is n_apply_to_print, n_ops_to_print and
-            min_memory_size that are accepted.
-        """
-        def diff_dict(a_time, b_time_):
-            r = {}
-            b_time = copy.copy(b_time_)
-            for a, ta in iteritems(a_time):
-                r.setdefault(a, 0)
-                tb = b_time.pop(a, 0)
-                r[a] += ta - tb
-            # they are missing in a
-            for a, t in iteritems(b_time):
-                r.setdefault(a, 0)
-                r[a] += t
-            return r
-        compile_time = self.compile_time - other.compile_time
-        fct_call_time = diff_dict(self.fct_call_time, other.fct_call_time)
-        fct_call = diff_dict(self.fct_call, other.fct_call)
-        apply_time = diff_dict(self.apply_time, other.apply_time)
-        apply_cimpl = self.apply_cimpl and other.apply_cimpl
-        message = self.message
-        variable_shape = diff_dict(self.variable_shape, other.variable_shape)
-        self_linker_time = sum([ps.linker_time for ps
-                                in self.profile_stats.values()])
-        other_linker_time = sum([ps.linker_time for ps
-                                 in other.profile_stats.values()])
-        self_optimizer_time = sum([ps.optimizer_time for ps
-                                   in self.profile_stats.values()])
-        other_optimizer_time = sum([ps.optimizer_time for ps
-                                    in other.profile_stats.values()])
-        other_time = {'linker_time': self_linker_time - other_linker_time,
-                      'optimizer_time': self_optimizer_time -
-                      other_optimizer_time}
-        self.print_summary_("print_diff_summary", compile_time,
-                            fct_call_time, fct_call,
-                            apply_time, apply_cimpl, message, variable_shape,
-                            print_apply=False, other_time=other_time,
-                            **kwargs)
-    @staticmethod
-    def print_summary_(fct_name, compile_time, fct_call_time, fct_call,
-                       apply_time, apply_cimpl, message, variable_shape,
-                       local_time, other_time,
-                       n_apply_to_print=config.ProfileMode.n_apply_to_print,
-                       n_ops_to_print=config.ProfileMode.n_ops_to_print,
-                       print_apply=True,
-                       min_memory_size=config.ProfileMode.min_memory_size,
-                       ):
-        """
-        Do the actual printing of print_summary and print_diff_summary.
-        Parameters
-        ----------
-        n_apply_to_print
-            The number of apply to print. Default 15.
-        n_ops_to_print
-            The number of ops to print. Default 20.
-        min_memory_size
-            Don't print memory profile of apply whose outputs memory size is
-            lower than that.
-        """
-        print("ProfileMode is deprecated! Use the new profiler.")
-        print(" The Theano flags to enable it ise: profile=True")
-        print(" The Theano flags for the memory profile to it is: "
-              "profile_memory=True")
-        total_time = time.time() - import_time
-        total_fct_time = sum(fct_call_time.values())
-        total_fct_call = sum(fct_call.values())
-        unknown_time = total_time - total_fct_time - compile_time
-        overhead_time = total_fct_time - local_time
-        if total_fct_time > 0:
-            time_pr_in_fct = local_time / total_fct_time * 100
-            overhead_time_pourcent_fct_time = (overhead_time / total_fct_time *
-                                               100)
-            time_per_call = total_fct_time / total_fct_call
-        else:
-            time_pr_in_fct = 0
-            overhead_time_pourcent_fct_time = 0
-            time_per_call = 0
-        print()
-        print('ProfileMode.%s(%s)' % (fct_name, message))
-        print('---------------------------')
-        print()
-        print('Time since import %.3fs' % (total_time))
-        print('Theano compile time: %.3fs (%.1f%% since import)' %
-              (compile_time, compile_time / total_time * 100))
-        print('    Optimization time: %.3fs' % (other_time['optimizer_time']))
-        print('    Linker time: %.3fs' % (other_time['linker_time']))
-        print('Theano fct call %.3fs (%.1f%% since import)' %
-              (total_fct_time, total_fct_time / total_time * 100))
-        print('   Theano Op time %.3fs %.1f%%(since import) %.1f%%'
-              '(of fct call)' % (local_time, local_time / total_time * 100,
-                                 time_pr_in_fct))
-        print('   Theano function overhead in ProfileMode %.3fs %.1f%%'
-              '(since import) %.1f%%(of fct call)' % (
-                  overhead_time, overhead_time / total_time * 100,
-                  overhead_time_pourcent_fct_time))
-        print('%i Theano fct call, %.3fs per call' %
-              (total_fct_call, time_per_call))
-        print('Rest of the time since import %.3fs %.1f%%' %
-              (unknown_time, unknown_time / total_time * 100))
-        print()
-        print('Theano fct summary:')
-        print('<% total fct time> <total time> <time per call> <nb call> '
-              '<fct name>')
-        for key in fct_call:
-            if fct_call[key] > 0:
-                print('   %4.1f%% %.3fs %.2es %d %s' %
-                      (fct_call_time[key] / total_fct_time * 100,
-                       fct_call_time[key],
-                       fct_call_time[key] / fct_call[key],
-                       fct_call[key],
-                       key.name))
-            else:
-                print('   NOT CALLED', key.name)
-        # Compute stats per op.
-        op_time = {}
-        op_call = {}
-        op_apply = {}
-        op_cimpl = {}
-        sop_apply = {}
-        for (i, a), t in iteritems(apply_time):
-            op = a.op
-            op_time.setdefault(op, 0)
-            op_call.setdefault(op, 0)
-            op_apply.setdefault(op, 0)
-            sop_apply.setdefault(type(a.op), 0)
-            op_time[op] += t
-            nb_call = [v for k, v in iteritems(fct_call)
-                       if k.maker.fgraph is a.fgraph][0]
-            op_cimpl.setdefault(a.op, True)
-            op_cimpl[a.op] = op_cimpl[a.op] and apply_cimpl.get(a, False)
-            if t == 0:
-                assert nb_call == 0, nb_call
-            else:
-                op_call[op] += nb_call
-                op_apply[op] += 1
-                sop_apply[type(a.op)] += 1
-        # Compute stats per op class
-        sop_time = {}
-        sop_call = {}
-        sop_op = {}
-        # map each op class to Bool. True iff all applies were done in c.
-        sop_cimpl = {}
-        for a, t in iteritems(op_time):
-            typ = type(a)
-            sop_time.setdefault(typ, 0)
-            sop_time[typ] += t
-            sop_op.setdefault(typ, 0)
-            sop_op[typ] += 1
-            sop_cimpl.setdefault(typ, True)
-            sop_cimpl[typ] = sop_cimpl[typ] and op_cimpl.get(a, False)
-            sop_call[typ] = sop_call.get(typ, 0) + op_call[a]
-        # Print the summary per op class.
-        print()
-        print('Single Op-wise summary:')
-        print('<% of local_time spent on this kind of Op> <cumulative %> '
-              '<self seconds> <cumulative seconds> <time per call> [*] '
-              '<nb_call> <nb_op> <nb_apply> <Op name>')
-        sotimes = [(t * 100 / local_time, t, a, sop_cimpl[a], sop_call[a],
-                    sop_op[a], sop_apply[a]) for a, t in iteritems(sop_time)]
-        sotimes.sort()
-        sotimes.reverse()
-        tot = 0
-        for f, t, a, ci, nb_call, nb_op, nb_apply in sotimes[:n_ops_to_print]:
-            if nb_call == 0:
-                assert t == 0
-                continue
-            tot += t
-            ftot = tot * 100 / local_time
-            if ci:
-                msg = '*'
-            else:
-                msg = ' '
-            print('   %4.1f%%  %5.1f%%  %5.3fs  %5.3fs  %.2es %s %5d %2d '
-                  '%2d %s' % (f, ftot, t, tot, t / nb_call, msg, nb_call,
-                              nb_op, nb_apply, a))
-        print('   ... (remaining %i single Op account for %.2f%%(%.2fs) of '
-              'the runtime)' %
-              (max(0, len(sotimes) - n_ops_to_print),
-               sum(soinfo[0] for soinfo in sotimes[n_ops_to_print:]),
-               sum(soinfo[1] for soinfo in sotimes[n_ops_to_print:])))
-        print('(*) Op is running a c implementation')
-        # The summary per op
-        op_flops = {}
-        for a, t in iteritems(op_time):
-            if hasattr(a, 'flops'):
-                op_flops[a] = a.flops * op_call[a] / t / 1e6
-        flops_msg = ''
-        if op_flops:
-            flops_msg = ' <MFlops/s>'
-            print("\nHACK WARNING: we print the flops for some OP, but the "
-                  "logic doesn't always work. You need to know the "
-                  "internals of Theano to make it work correctly. "
-                  "Otherwise don't use it!")
-        print()
-        print('Op-wise summary:')
-        print('<%% of local_time spent on this kind of Op> <cumulative %%> '
-              '<self seconds> <cumulative seconds> <time per call> [*] %s '
-              '<nb_call> <nb apply> <Op name>' % (flops_msg))
-        otimes = [(t * 100 / local_time, t, a, op_cimpl.get(a, 0),
-                   op_call.get(a, 0), op_apply.get(a, 0))
-                  for a, t in iteritems(op_time)]
-        otimes.sort()
-        otimes.reverse()
-        tot = 0
-        for f, t, a, ci, nb_call, nb_apply in otimes[:n_ops_to_print]:
-            if nb_call == 0:
-                assert t == 0
-                continue
-            tot += t
-            ftot = tot * 100 / local_time
-            if ci:
-                msg = '*'
-            else:
-                msg = ' '
-            if op_flops:
-                print('   %4.1f%%  %5.1f%%  %5.3fs  %5.3fs  %.2es %s %7.1f '
-                      '%5d %2d %s' % (f, ftot, t, tot, t / nb_call, msg,
-                                      op_flops.get(a, -1), nb_call, nb_apply,
-                                      a))
-            else:
-                print('   %4.1f%%  %5.1f%%  %5.3fs  %5.3fs  %.2es %s %5d %2d '
-                      '%s' % (f, ftot, t, tot, t / nb_call, msg, nb_call,
-                              nb_apply, a))
-        print('   ... (remaining %i Op account for %6.2f%%(%.2fs) of the '
-              'runtime)' %
-              (max(0, len(otimes) - n_ops_to_print),
-               sum(f for f, t, a, ci, nb_call, nb_op in
-                   otimes[n_ops_to_print:]),
-               sum(t for f, t, a, ci, nb_call, nb_op in
-                   otimes[n_ops_to_print:])))
-        print('(*) Op is running a c implementation')
-        if print_apply:
-            print()
-            print('Apply-wise summary:')
-            print('<% of local_time spent at this position> <cumulative %%> '
-                  '<apply time> <cumulative seconds> <time per call> [*] '
-                  '<nb_call> <Apply position> <Apply Op name>')
-            atimes = [(t * 100 / local_time, t, a,
-                       [v for k, v in iteritems(fct_call)
-                        if k.maker.fgraph is a[1].fgraph][0])
-                      for a, t in iteritems(apply_time)]
-            atimes.sort()
-            atimes.reverse()
-            tot = 0
-            for f, t, a, nb_call in atimes[:n_apply_to_print]:
-                tot += t
-                ftot = tot * 100 / local_time
-                if nb_call == 0:
-                    continue
-                if apply_cimpl.get(a[1], False):
-                    msg = '*'
-                else:
-                    msg = ' '
-                print('   %4.1f%%  %5.1f%%  %5.3fs  %5.3fs %.2es  %s %i  '
-                      '%2i %s' %
-                      (f, ftot, t, tot, t / nb_call, msg, nb_call, a[0],
-                       str(a[1])))
-            print('   ... (remaining %i Apply instances account for '
-                  '%.2f%%(%.2fs) of the runtime)' %
-                  (max(0, len(atimes) - n_apply_to_print),
-                   sum(f for f, t, a, nb_call in atimes[n_apply_to_print:]),
-                   sum(t for f, t, a, nb_call in atimes[n_apply_to_print:])))
-            print('(*) Op is running a c implementation')
-        for printer in profiler_printers:
-            printer(fct_name, compile_time, fct_call_time, fct_call,
-                    apply_time, apply_cimpl, message, variable_shape,
-                    other_time)
-        if not variable_shape:
-            print("\nProfile of Theano intermediate memory disabled. "
-                  "To enable, set the Theano flag ProfileMode.profile_memory "
-                  "to True.")
-        else:
-            print("""
-            The memory profile in ProfileMode is removed!
-            Use the new profiler. Use the Theano flags
-            profile=True,profile_memory=True to enable it.""")
-        print()
-        print("""Here are tips to potentially make your code run faster
-(if you think of new ones, suggest them on the mailing list).
-Test them first, as they are not guaranteed to always provide a speedup.""")
-        from theano import tensor as T
-        from theano.tensor.raw_random import RandomFunction
-        import theano
-        import theano.scalar as scal
-        scalar_op_amdlibm_no_speed_up = [scal.LT, scal.GT, scal.LE, scal.GE,
-                                         scal.EQ, scal.NEQ, scal.InRange,
-                                         scal.Switch, scal.OR, scal.XOR,
-                                         scal.AND, scal.Invert, scal.Maximum,
-                                         scal.Minimum, scal.Add, scal.Mul,
-                                         scal.Sub, scal.TrueDiv, scal.IntDiv,
-                                         scal.Clip, scal.Second, scal.Identity,
-                                         scal.Cast, scal.Sgn, scal.Neg,
-                                         scal.Inv, scal.Sqr]
-        scalar_op_amdlibm_speed_up = [scal.Mod, scal.Pow, scal.Ceil,
-                                      scal.Floor, scal.RoundHalfToEven,
-                                      scal.RoundHalfAwayFromZero, scal.Log,
-                                      scal.Log2, scal.Log10, scal.Log1p,
-                                      scal.Exp, scal.Sqrt, scal.Abs, scal.Cos,
-                                      scal.Sin, scal.Tan, scal.Tanh,
-                                      scal.Cosh, scal.Sinh,
-                                      T.nnet.sigm.ScalarSigmoid,
-                                      T.nnet.sigm.ScalarSoftplus]
-        def get_scalar_ops(s):
-            if isinstance(s, theano.scalar.Composite):
-                l = []
-                for node in s.fgraph.toposort():
-                    l += get_scalar_ops(node.op)
-                return l
-            else:
-                return [s]
-        def list_scalar_op(op):
-            if isinstance(op.scalar_op, theano.scalar.Composite):
-                return get_scalar_ops(op.scalar_op)
-            else:
-                return [op.scalar_op]
-        def amdlibm_speed_up(op):
-            if not isinstance(op, T.Elemwise):
-                return False
-            else:
-                l = list_scalar_op(op)
-                for s_op in l:
-                    if s_op.__class__ in scalar_op_amdlibm_speed_up:
-                        return True
-                    elif s_op.__class__ not in scalar_op_amdlibm_no_speed_up:
-                        print("We don't know if amdlibm will accelerate "
-                              "this scalar op.", s_op)
-                return False
-        def exp_float32_op(op):
-            if not isinstance(op, T.Elemwise):
-                return False
-            else:
-                l = list_scalar_op(op)
-                return any([s_op.__class__ in [scal.Exp] for s_op in l])
-        printed_tip = False
-        # tip 1
-        if config.floatX == 'float64':
-            print("  - Try the Theano flag floatX=float32")
-            printed_tip = True
-        # tip 2
-        if not config.lib.amdlibm and any([amdlibm_speed_up(a.op) for i, a
-                                           in apply_time]):
-            print("  - Try installing amdlibm and set the Theano flag "
-                  "lib.amdlibm=True. This speeds up only some Elemwise "
-                  "operation.")
-            printed_tip = True
-        # tip 3
-        if not config.lib.amdlibm and any([exp_float32_op(a.op) and
-                                           a.inputs[0].dtype == 'float32'
-                                           for i, a in apply_time]):
-            print("  - With the default gcc libm, exp in float32 is slower "
-                  "than in float64! Try Theano flag floatX=float64, or "
-                  "install amdlibm and set the theano flags lib.amdlibm=True")
-            printed_tip = True
-        # tip 4
-        for a, t in iteritems(apply_time):
-            node = a[1]
-            if (isinstance(node.op, T.Dot) and
-                    all([len(i.type.broadcastable) == 2
-                         for i in node.inputs])):
-                print("  - You have a dot operation that was not optimized to"
-                      " dot22 (which is faster). Make sure the inputs are "
-                      "float32 or float64, and are the same for both inputs. "
-                      "Currently they are: %s" %
-                      [i.type for i in node.inputs])
-                printed_tip = True
-        # tip 5
-        for a, t in iteritems(apply_time):
-            node = a[1]
-            if isinstance(node.op, RandomFunction):
-                printed_tip = True
-                print("  - Replace the default random number generator by "
-                      "'from theano.sandbox.rng_mrg import MRG_RandomStreams "
-                      "as RandomStreams', as this is is faster. It is still "
-                      "experimental, but seems to work correctly.")
-                if config.device.startswith("gpu"):
-                    print("     - MRG_RandomStreams is the only random number"
-                          " generator supported on the GPU.")
-                break
-        # tip 6
-        import theano.sandbox.cuda as cuda
-        from theano.tensor.nnet import LogSoftmax
-        import theano.tensor.signal.pool as pool
-        import theano.gpuarray
-        for a, t in iteritems(apply_time):
-            node = a[1]
-            if (isinstance(node.op, pool.Pool)):
-                if (not cuda.dnn.dnn_available() and not theano.gpuarray.dnn.dnn_present()):
-                    print("Install CuDNN to do pooling faster"
-                          "this allows the operation to run on GPU")
-            if (isinstance(node.op, LogSoftmax)):
-                if (not cuda.dnn.dnn_available() and not theano.gpuarray.dnn.dnn_present()):
-                    print("Install CuDNN to do LogSoftmax faster"
-                          "this allows the operation to run on GPU")
-        if not printed_tip:
-            print("  Sorry, no tip for today.")
-    def clone(self, link_kwargs=None, optimizer="", message=None):
-        """
-        Create a new instance of this Mode.
-        Keyword arguments can be provided for the linker, in which case its
-        `clone` method will be called with these arguments.
-        """
-        new_linker = self.linker.clone(**link_kwargs)
-        new_optimizer = optimizer
-        if optimizer == "":
-            new_optimizer = self.provided_optimizer
-        new_mode = type(self)(linker=new_linker,
-                              optimizer=new_optimizer)
-        # If self is in the list or profiles to print, then add the
-        # new one as well
-        if self in prof_mode_instance_to_print:
-            prof_mode_instance_to_print.append(new_mode)
-        if message:
-            new_mode.message = message
-        return new_mode
-register_mode('PROFILE_MODE', ProfileMode())
-# needed to print the profile at the end automatically
-prof_mode_instance_to_print = [predefined_modes["PROFILE_MODE"]]
-def atexit_print_default_profile_mode():
-    """
-    Print the summary of the predefined mode ProfileMode if used.
-    This all to have the summary printed at exit when config.mode=ProfileMode.
-    """
-    for prof_mode in prof_mode_instance_to_print:
-        if prof_mode.local_time > 0:
-            prof_mode.print_summary()
-# Register atexit_print_default_profile_mode to have the summary of the
-# predefined mode ProfileMode if it is used printed when the program terminate.
-atexit.register(atexit_print_default_profile_mode)
-# Here we define an hook that allow to print extra profiling information
-profiler_printers = []
-def register_profiler_printer(fct):
-    profiler_printers.append(fct)
-    return fct
--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -3,8 +3,6 @@ ProfileStats object for runtime and memory profiling.
 """
 #
-# TODO: measure memory usage like ProfileMode did
-# TODO: put the optimization tips into a tips section??
 # TODO: add tip to use specify_shape (is specify_shape even in library doc?)
 # TODO: ensure field width for string fields makes columns line up
 # TODO: what to do about 'diff summary'? (ask Fred?)
@@ -378,7 +376,7 @@ class ProfileStats(object):
        else:
            local_time = 0
        if local_time == 0:
-            print(('ProfileMode.summary_class: total time 0'
+            print(('ProfileStats.summary_class: total time 0'
                   ' (did you forget to enable counters?)'), file=file)
            return
        class_time = self.class_time()
@@ -462,7 +460,7 @@ class ProfileStats(object):
        else:
            local_time = 0
        if local_time == 0:
-            print(('ProfileMode.summary_ops: total time 0'
+            print(('ProfileStats.summary_ops: total time 0'
                   ' (did you forget to enable counters?)'), file=file)
            return
        op_time = self.op_time()
@@ -540,7 +538,7 @@ class ProfileStats(object):
        else:
            local_time = 0
        if local_time == 0:
-            print(('ProfileMode.summary_nodes: total time 0'
+            print(('ProfileStats.summary_nodes: total time 0'
                   ' (did you forget to enable counters?)'), file=file)
            return

--- a/theano/compile/tests/test_modes.py
+++ b/theano/compile/tests/test_modes.py
@@ -7,7 +7,7 @@ import unittest
 import theano
 import theano.tensor as T
-from theano.compile import Mode, ProfileMode
+from theano.compile import Mode
 class T_bunch_of_modes(unittest.TestCase):
@@ -18,9 +18,6 @@ class T_bunch_of_modes(unittest.TestCase):
        linker_classes_involved = []
        predef_modes = ['FAST_COMPILE', 'FAST_RUN', 'DEBUG_MODE']
-        # Use a new instance of ProfileMode instead of 'ProfileMode' to
-        # avoid printing a profile mode summary in nose output
-        predef_modes.append(ProfileMode())
        # Linkers to use with regular Mode
        if theano.config.cxx:
@@ -43,20 +40,13 @@ class T_bunch_of_modes(unittest.TestCase):
        # there should be
        # - VM_Linker
        # - OpWiseCLinker (FAST_RUN)
-        # - WrapLinker ("ProfileMode")
        # - PerformLinker (FAST_COMPILE)
        # - DebugMode's Linker  (DEBUG_MODE)
-        assert 5 == len(set(linker_classes_involved))
+        assert 4 == len(set(linker_classes_involved))
-class T_ProfileMode_WrapLinker(unittest.TestCase):
+class T_old_problem(unittest.TestCase):
    def test_1(self):
-        # First, compile a function with a new ProfileMode() object
-        # No need to call that function
-        x = T.matrix()
-        mode = ProfileMode()
-        theano.function([x], x * 2, mode=mode)
        # Then, build a mode with the same linker, and a modified optimizer
        default_mode = theano.compile.mode.get_default_mode()
        modified_mode = default_mode.including('specialize')

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -405,9 +405,9 @@ AddConfigVar(
 AddConfigVar(
    'mode',
    "Default compilation mode",
-    EnumStr('Mode', 'ProfileMode', 'DebugMode', 'FAST_RUN',
+    EnumStr('Mode', 'DebugMode', 'FAST_RUN',
            'NanGuardMode',
-            'FAST_COMPILE', 'PROFILE_MODE', 'DEBUG_MODE'),
+            'FAST_COMPILE', 'DEBUG_MODE'),
    in_c_key=False)
 param = "g++"
@@ -463,8 +463,7 @@ del param
 if rc == 0 and config.cxx != "":
    # Keep the default linker the same as the one for the mode FAST_RUN
    AddConfigVar('linker',
-                 ("Default linker used if the theano flags mode is Mode "
+                 "Default linker used if the theano flags mode is Mode",
-                  "or ProfileMode(deprecated)"),
                 EnumStr('cvm', 'c|py', 'py', 'c', 'c|py_nogc',
                         'vm', 'vm_nogc', 'cvm_nogc'),
                 in_c_key=False)
@@ -472,8 +471,7 @@ else:
    # g++ is not present or the user disabled it,
    # linker should default to python only.
    AddConfigVar('linker',
-                 ("Default linker used if the theano flags mode is Mode "
+                 "Default linker used if the theano flags mode is Mode",
-                  "or ProfileMode(deprecated)"),
                 EnumStr('vm', 'py', 'vm_nogc'),
                 in_c_key=False)
    try:
@@ -501,8 +499,7 @@ AddConfigVar('allow_gc',
 # Keep the default optimizer the same as the one for the mode FAST_RUN
 AddConfigVar(
    'optimizer',
-    ("Default optimizer. If not None, will use this linker with the Mode "
+    "Default optimizer. If not None, will use this optimizer with the Mode",
-     "object (not ProfileMode(deprecated) or DebugMode)"),
    EnumStr('fast_run', 'merge', 'fast_compile', 'None'),
    in_c_key=False)
@@ -951,27 +948,6 @@ AddConfigVar('NanGuardMode.action',
             EnumStr('raise', 'warn', 'pdb'),
             in_c_key=False)
-AddConfigVar('ProfileMode.n_apply_to_print',
-             "Number of apply instances to print by default",
-             IntParam(15, lambda i: i > 0),
-             in_c_key=False)
-AddConfigVar('ProfileMode.n_ops_to_print',
-             "Number of ops to print by default",
-             IntParam(20, lambda i: i > 0),
-             in_c_key=False)
-AddConfigVar('ProfileMode.min_memory_size',
-             "For the memory profile, do not print apply nodes if the size "
-             "of their outputs (in bytes) is lower then this threshold",
-             IntParam(1024, lambda i: i >= 0),
-             in_c_key=False)
-AddConfigVar('ProfileMode.profile_memory',
-             """Enable profiling of memory used by Theano functions""",
-             BoolParam(False),
-             in_c_key=False)
 AddConfigVar('optimizer_excluding',
             ("When using the default mode, we will remove optimizer with "
              "these tags. Separate tags with ':'."),

--- a/theano/d3viz/formatting.py
+++ b/theano/d3viz/formatting.py
@@ -11,7 +11,6 @@ from six import iteritems, itervalues
 import theano
 from theano import gof
-from theano.compile.profilemode import ProfileMode
 from theano.compile import Function
 from theano.compile import builders
 from theano.printing import pydot_imported, pydot_imported_msg
@@ -123,13 +122,6 @@ class PyDotFormatter(object):
        profile = None
        if isinstance(fct, Function):
-            mode = fct.maker.mode
-            if (not isinstance(mode, ProfileMode) or
-                    fct not in mode.profile_stats):
-                mode = None
-            if mode:
-                profile = mode.profile_stats[fct]
-            else:
            profile = getattr(fct, "profile", None)
            outputs = fct.maker.fgraph.outputs
            topo = fct.maker.fgraph.toposort()

--- a/theano/printing.py
+++ b/theano/printing.py
@@ -20,7 +20,6 @@ from theano import gof
 from theano import config
 from theano.gof import Op, Apply
 from theano.compile import Function, debugmode, SharedVariable
-from theano.compile.profilemode import ProfileMode
 pydot_imported = False
 pydot_imported_msg = ""
@@ -759,15 +758,10 @@ def pydotprint(fct, outfile=None,
                               config.device + '.' + format)
    if isinstance(fct, Function):
-        mode = fct.maker.mode
        profile = getattr(fct, "profile", None)
-        if (not isinstance(mode, ProfileMode) or
-                fct not in mode.profile_stats):
-                mode = None
        outputs = fct.maker.fgraph.outputs
        topo = fct.maker.fgraph.toposort()
    elif isinstance(fct, gof.FunctionGraph):
-        mode = None
        profile = None
        outputs = fct.outputs
        topo = fct.toposort()
@@ -780,7 +774,6 @@ def pydotprint(fct, outfile=None,
        assert all(isinstance(v, gof.Variable) for v in fct)
        fct = gof.FunctionGraph(inputs=gof.graph.inputs(fct),
                                outputs=fct)
-        mode = None
        profile = None
        outputs = fct.outputs
        topo = fct.toposort()
@@ -868,19 +861,7 @@ def pydotprint(fct, outfile=None,
        if node in apply_name_cache:
            return apply_name_cache[node], apply_name_id[node]
        prof_str = ''
-        if mode:
+        if profile:
-            time = mode.profile_stats[fct].apply_time.get(node, 0)
-            # second, % total time in profiler, %fct time in profiler
-            if mode.local_time == 0:
-                pt = 0
-            else:
-                pt = time * 100 / mode.local_time
-            if mode.profile_stats[fct].fct_callcount == 0:
-                pf = 0
-            else:
-                pf = time * 100 / mode.profile_stats[fct].fct_call_time
-            prof_str = '   (%.3fs,%.3f%%,%.3f%%)' % (time, pt, pf)
-        elif profile:
            time = profile.apply_time.get(node, 0)
            # second, %fct time in profiler
            if profile.fct_callcount == 0:

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -4092,7 +4092,8 @@ def tensor4(name=None, dtype=None):
 ftensor4 = CudaNdarrayType(dtype='float32', broadcastable=(False,) * 4)
-@theano.compile.profilemode.register_profiler_printer
+# TODO: move that to the new back-end and new profiling.py print_tips
+# @theano.compile.profilemode.register_profiler_printer
 def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
                    apply_time, apply_cimpl, message, outputs_size,
                    other_time):

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -907,7 +907,7 @@ class BaseGpuCorrMM(GpuOp):
    def flops(self, inp, outp):
        """
-        Useful with the hack in profilemode to print the MFlops.
+        Useful with the hack in profiling to print the MFlops.
        """
        # if the output shape is correct, then this gives the correct
@@ -1421,7 +1421,7 @@ class BaseGpuCorr3dMM(GpuOp):
            self.pad)
    def flops(self, inp, outp):
-        """ Useful with the hack in profilemode to print the MFlops"""
+        """ Useful with the hack in profiling to print the MFlops"""
        # if the output shape is correct, then this gives the correct
        # flops for any direction, sampling, padding, and border mode
        inputs, filters = inp
@@ -2101,7 +2101,7 @@ class GpuConv(GpuOp):
        return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
    def flops(self, inputs, outputs):
-        """ Useful with the hack in profilemode to print the MFlops"""
+        """ Useful with the hack in profiling to print the MFlops"""
        images, kerns = inputs
        out, = outputs
        assert images[1] == kerns[1]

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -1367,12 +1367,12 @@ def speed_adv_sub1():
    vec = tensor.lvector()
    for batch_size in [100, 1000, 10000, 100000]:
        idx = numpy.random.randint(0, 50000, batch_size)
-        mode_with_gpu = theano.compile.ProfileMode().including('gpu')
+        mode_with_gpu = theano.compile.get_default_mode().including('gpu')
-        f = theano.function([vec], var[vec], mode=mode_with_gpu)
+        f = theano.function([vec], var[vec], mode=mode_with_gpu, profile=True)
        for i in range(100):
            f(idx)
-        print("ProfileMode with batch size", batch_size)
+        print("profile with batch size", batch_size)
-        mode_with_gpu.print_summary()
+        f.profile.summary()
 def speed_reduce10():

--- a/theano/sandbox/cuda/tests/test_mlp.py
+++ b/theano/sandbox/cuda/tests/test_mlp.py
@@ -19,7 +19,7 @@ import theano.sandbox.cuda as tcn
 import theano.tests.unittest_tools as utt
-if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
+if theano.config.mode not in ['FAST_RUN', 'Mode']:
    raise SkipTest('Skip test_mlp when not in normal optimization mode as '
                   'otherwise it is too slow!')
@@ -48,8 +48,6 @@ def get_mode(use_gpu, check_isfinite=True):
        ret = theano.compile.get_default_mode()
    else:
        ret = theano.compile.mode.get_mode('FAST_RUN')
-    if isinstance(ret, theano.compile.ProfileMode):
-        ret = copy.copy(ret)
    if isinstance(ret, theano.compile.DebugMode):
        ret = copy.copy(ret)
        ret.check_isfinite = check_isfinite
@@ -60,19 +58,6 @@ def get_mode(use_gpu, check_isfinite=True):
    return ret
-def print_mode(mode):
-    if mode is not None and isinstance(mode, (theano.compile.ProfileMode,)):
-        mode.print_summary()
-def print_diff_mode(a, b):
-    if (a is not None and
-        isinstance(a, (theano.compile.ProfileMode,)) and
-       isinstance(b, (theano.compile.ProfileMode,))):
-        a.print_diff_summary(b)
 def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10,
             n_train=100):
@@ -123,7 +108,6 @@ def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10,
        rval.append(train(xval, yval, lr))
    dt = time.time() - t0
-    print_mode(mode)
    return numpy.asarray(rval), dt
@@ -220,7 +204,6 @@ def run_conv_nnet1(use_gpu):
    for i in xrange(n_train):
        rval = train(xval, yval, lr)
    # print 'training done'
-    print_mode(mode)
    return rval
@@ -316,7 +299,6 @@ def run_conv_nnet2(use_gpu):  # pretend we are training LeNet for MNIST
    for i in xrange(n_train):
        rval = train(xval, yval, lr)
-    print_mode(mode)
    return rval
@@ -428,7 +410,6 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
 def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
                           n_train=10,
                           check_isfinite=True,
-                           pickle=False,
                           verbose=0,
                           version=-1):
    """Run the train function returned by build_conv_nnet2_classif on one device.
@@ -444,11 +425,6 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
        version=version,
        check_isfinite=check_isfinite)
-    if use_gpu:
-        device = 'GPU'
-    else:
-        device = 'CPU'
    xval = my_rand(*x_shape)
    yval = my_rand(*y_shape)
    lr = theano._asarray(0.01, dtype='float32')
@@ -456,17 +432,6 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
    rvals = my_zeros(n_train)
    for i in xrange(n_train):
        rvals[i] = train(xval, yval, lr)[0]
-    print_mode(mode)
-    if pickle and isinstance(mode, theano.compile.ProfileMode):
-        import pickle
-        print("BEGIN %s profile mode dump" % device)
-        print(pickle.dumps(mode))
-        print("END %s profile mode dump" % device)
-    # print "%s time: %.3f" % (device, t1-t0)
-    # print "estimated time for one pass through MNIST with %s: %f" % (
-    #        device, (t1-t0) * (60000.0 / (n_train*bsize)))
 def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
@@ -476,7 +441,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
                               cpu_only=False,
                               float_atol=1e-06,
                               check_isfinite=True,
-                               pickle=False,
                               verbose=0,
                               version=-1):
    """Run the nnet2 function on 1 or 2 devices, and compares the results.
@@ -512,7 +476,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
                seed=seed, isize=isize, ksize=ksize, bsize=bsize,
                n_train=n_train,
                check_isfinite=check_isfinite,
-                pickle=pickle,
                verbose=verbose,
                version=version)

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -175,7 +175,7 @@ class Scan(PureOp):
        mode_instance = compile.mode.get_mode(self.mode)
        # Clone mode_instance, altering "allow_gc" for the linker,
-        # and adding a message if the mode is a ProfileMode.
+        # and adding a message when profiling is enabled.
        if self.name:
            message = self.name + " sub profile"
        else:
@@ -1564,14 +1564,6 @@ class Scan(PureOp):
            if hasattr(self.fn.fn, 'update_profile'):
                self.fn.fn.update_profile(profile)
-        #/* Old ProfileMode
-        # if hasattr(self.fn.maker.mode,'fct_call_time'):
-        #    self.fn.maker.mode.fct_call_time[self.fn] += t_fn
-        #    self.fn.maker.mode.fct_call[self.fn] += n_steps
-        #self.fn.maker.mode.call_time += t_fn
-        #self.fn.maker.mode.fn_time += t_fn
-        # Old Profile Mode */
        self.t_call = t_call
        self.t_fn = t_fn
@@ -2839,7 +2831,8 @@ class Scan(PureOp):
 gof.ops_with_inner_function[Scan] = 'fn'
-@theano.compile.profilemode.register_profiler_printer
+# TODO: move this printer to the new back-end and profiling.py's print_tips.
+#@theano.compile.profilemode.register_profiler_printer
 def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
                    apply_time, apply_cimpl, message, outputs_size,
                    other_time):

--- a/theano/sparse/sandbox/test_sp.py
+++ b/theano/sparse/sandbox/test_sp.py
@@ -47,7 +47,7 @@ class TestSP(unittest.TestCase):
        filters = rng.randn(nkern, numpy.prod(kshp))
        biasvals = rng.randn(nkern)
-        for mode in ('FAST_COMPILE', 'FAST_RUN'):  # , profmode):
+        for mode in ('FAST_COMPILE', 'FAST_RUN'):
            ttot, ntot = 0, 0
            for conv_mode in convmodes:
                for ss in ssizes:
@@ -128,7 +128,6 @@ class TestSP(unittest.TestCase):
 #            print 'Numpy processing time: ', ntot
 #            print 'Theano processing time: ', ttot
-        # profmode.print_summary()
    # this doesn't compare the output of anything... but I manually verified that the patches
    # are properly generated

--- a/theano/tensor/nnet/abstract_conv.py
+++ b/theano/tensor/nnet/abstract_conv.py
@@ -719,7 +719,7 @@ class BaseAbstractConv2d(Op):
        self.filter_dilation = tuple(filter_dilation)
    def flops(self, inp, outp):
-        """ Useful with the hack in profilemode to print the MFlops"""
+        """ Useful with the hack in profiling to print the MFlops"""
        # if the output shape is correct, then this gives the correct
        # flops for any direction, sampling, padding, and border mode
        inputs, filters = inp

--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -609,7 +609,7 @@ class ConvOp(OpenMPOp):
    def flops(self, inputs, outputs):
        """
-        Useful with the hack in profilemode to print the MFlops.
+        Useful with the hack in profiling to print the MFlops.
        """
        images, kerns = inputs

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -1394,11 +1394,7 @@ class test_fusion(unittest.TestCase):
    def speed_log_exp(self):
        s = slice(31, 36)
-#        linker=gof.CLinker
+        print("time", self.do(None, shared, shp=(1000, 1000), gpu=False,
-        linker = gof.OpWiseCLinker
-        mode = compile.Mode(linker(), copy.copy(compile.mode.OPT_FAST_RUN))
-        mode = compile.ProfileMode()
-        print("time", self.do(mode, shared, shp=(1000, 1000), gpu=False,
                              assert_len_topo=False, slice=s, nb_repeat=100))
    def tes_memory_leak(self, mode=compile.mode.Mode('c', 'merge'),

--- a/theano/tests/test_printing.py
+++ b/theano/tests/test_printing.py
@@ -115,14 +115,17 @@ def test_pydotprint_long_name():
 def test_pydotprint_profile():
-    """Just check that pydotprint does not crash with ProfileMode."""
+    """Just check that pydotprint does not crash when profiling is enabled."""
    # Skip test if pydot is not available.
    if not theano.printing.pydot_imported:
        raise SkipTest('pydot not available')
    A = tensor.matrix()
-    f = theano.function([A], A + 1, mode='ProfileMode')
+    prof = theano.compile.ProfileStats(atexit_print=False)
+    f = theano.function([A], A + 1, profile=prof)
+    theano.printing.pydotprint(f, print_output_file=False)
+    f([[1]])
    theano.printing.pydotprint(f, print_output_file=False)