Commit 958efe87, authored by Chinnadhurai Sankar

merged file

@@ -122,8 +122,6 @@ An op has to implement some methods defined in the interface of
 the method :func:`make_node` or :attr:`itypes`, :attr:`otypes` and one of the
 implementation methods, either :func:`perform`, :meth:`Op.c_code`
 or :func:`make_thunk`.
-method :func:`make_node` and one of the implementation methods, either
-:func:`perform`, :meth:`Op.c_code` or :func:`make_thunk`.
 
 The :func:`make_node` method creates an Apply node representing the application
 of the op on the inputs provided. This method is responsible for three things:
...
@@ -117,21 +117,24 @@ A contributor made an rpm package for Mandriva_ 2010.2 of Theano 0.3.1.
 .. _linux_basic:
 
-Docker images
-~~~~~~~~~~~~~
+Docker
+~~~~~~
 
-Builds of Theano are available as `Docker <https://www.docker.com/whatisdocker>`_ images:
-`Theano Docker (CPU) <https://hub.docker.com/r/kaixhin/theano/>`_ or `Theano Docker (CUDA) <https://hub.docker.com/r/kaixhin/cuda-theano/>`_.
-These are updated on a weekly basis with bleeding-edge builds of Theano. Examples of running bash in a Docker container
-are as follows:
+Builds of Theano are available as `Docker <https://www.docker.com>`_
+images: `Theano Docker (CPU) <https://hub.docker.com/r/kaixhin/theano/>`_ or
+`Theano Docker (CUDA) <https://hub.docker.com/r/kaixhin/cuda-theano/>`_. These
+are updated on a weekly basis with bleeding-edge builds of Theano.
+Examples of running bash in a Docker container are as follows:
 
 .. code-block:: bash
 
     sudo docker run -it kaixhin/theano
-    sudo docker run -it --device /dev/nvidiactl --device /dev/nvidia-uvm --device /dev/nvidia0 kaixhin/cuda-theano:7.0
+    sudo nvidia-docker run -it kaixhin/cuda-theano:7.0
 
-For a guide to Docker, see the `official docs <https://docs.docker.com/userguide/>`_. For more details on how to use the
-Theano Docker images, including requirements for CUDA support, consult the `source project <https://github.com/Kaixhin/dockerfiles>`_.
+For a guide to Docker, see the `official docs <https://docs.docker.com>`_.
+CUDA support requires `NVIDIA Docker <https://github.com/NVIDIA/nvidia-docker>`_.
+For more details on how to use the Theano Docker images,
+consult the `source project <https://github.com/Kaixhin/dockerfiles>`_.
 
 Basic user install instructions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
...
@@ -87,7 +87,9 @@ Environment Variables
 with later (right-most) files taking priority over earlier files in the
 case that multiple files specify values for a common configuration option.
 For example, to override system-wide settings with personal ones,
-set ``THEANORC=/etc/theanorc:~/.theanorc``.
+set ``THEANORC=/etc/theanorc:~/.theanorc``. To load configuration files in
+the current working directory, append ``.theanorc`` to the list of
+configuration files, e.g. ``THEANORC=~/.theanorc:.theanorc``.
 
 Config Attributes
 =================
...
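The right-most-wins precedence described above can be sketched with Python's standard ``configparser``, which applies the same "later files override earlier ones" rule when given a list of files. This is a standalone illustration with hypothetical file contents, not Theano's actual config loader:

```python
import configparser
import os
import tempfile

# Two hypothetical config files: a system-wide one and a personal one.
system_rc = tempfile.NamedTemporaryFile("w", suffix=".theanorc", delete=False)
system_rc.write("[global]\nfloatX = float64\ndevice = cpu\n")
system_rc.close()

user_rc = tempfile.NamedTemporaryFile("w", suffix=".theanorc", delete=False)
user_rc.write("[global]\nfloatX = float32\n")
user_rc.close()

# read() processes files left to right, so later (right-most) files take
# priority for options they redefine -- the same precedence THEANORC uses.
cfg = configparser.ConfigParser()
cfg.read([system_rc.name, user_rc.name])

print(cfg["global"]["floatX"])  # personal value wins: float32
print(cfg["global"]["device"])  # system-wide value survives: cpu

os.unlink(system_rc.name)
os.unlink(user_rc.name)
```

Options set only in the earlier file survive; options set in both take the later file's value.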
@@ -40,7 +40,9 @@ If adjusting hyperparameters doesn't work for you, you can still get help from
 Theano's NanGuardMode. Change the mode of your Theano function to NanGuardMode
 and run it again. NanGuardMode will monitor all input/output variables in
 each node, and raise an error if NaNs are detected. For how to use
-``NanGuardMode``, please refer to :ref:`nanguardmode`.
+``NanGuardMode``, please refer to :ref:`nanguardmode`. Using
+``optimizer_including=alloc_empty_to_zeros`` with ``NanGuardMode`` can help
+to detect NaN; for more information, please refer to :ref:`AllocEmpty`.
 
 DebugMode can also help. Run your code in DebugMode with the flag
 ``mode=DebugMode,DebugMode.check_py=False``. This will give you a clue about which
...
@@ -76,3 +78,13 @@ Cuda Specific Option
 
 The Theano flag ``nvcc.fastmath=True`` can generate NaN. Don't set
 this flag while debugging NaN.
+
+.. _AllocEmpty:
+
+NaN Introduced by AllocEmpty
+----------------------------
+
+AllocEmpty is used by many operations, such as scan, to allocate memory without properly clearing it, because the allocated memory will subsequently be overwritten. However, this can sometimes introduce NaN, depending on the operation and on what was previously stored in the memory it is working on. For instance, trying to zero out memory using a multiplication before applying an operation can produce NaN if NaN is already present in the memory, since ``0 * NaN => NaN``.
+
+Using ``optimizer_including=alloc_empty_to_zeros`` replaces ``AllocEmpty`` by ``Alloc{0}``, which is helpful to diagnose where NaNs come from. Note that when running in ``NanGuardMode``, this optimizer is not included by default, so it can be helpful to use the two together.
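The ``0 * NaN => NaN`` pitfall is easy to demonstrate with plain NumPy; this standalone sketch simulates an uninitialized buffer rather than using Theano's actual AllocEmpty op:

```python
import numpy as np

# Simulate memory handed back without clearing: in the worst case it
# already contains NaN left over from a previous computation.
buf = np.empty(4)
buf[:] = [1.0, 2.0, np.nan, 4.0]

# Trying to "clear" the buffer with a multiplication does not work:
# 0 * NaN is NaN, so the garbage survives.
cleared_by_mul = buf * 0.0
print(np.isnan(cleared_by_mul).any())  # True -- NaN leaked through

# Overwriting the memory (what Alloc{0} does) actually clears it.
buf[:] = 0.0
print(np.isnan(buf).any())  # False
```

This is why replacing ``AllocEmpty`` by ``Alloc{0}`` makes spurious NaNs disappear, and why their disappearance under that flag points at uninitialized memory as the culprit.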
@@ -25,7 +25,7 @@ from theano.gof import (graph, utils, link, ops_with_inner_function)
 from theano.gof.link import raise_with_op
 from theano.compile.function_module import (
     FunctionMaker, Function, infer_reuse_pattern,
-    SymbolicInputKit, SymbolicOutput, Supervisor, std_fgraph)
+    SymbolicOutput, Supervisor, std_fgraph)
 from theano.compile.mode import Mode, register_mode
 from theano.compile.ops import OutputGuard
...
@@ -2517,27 +2517,9 @@ class _Maker(FunctionMaker):  # inheritance buys a few helper functions
                     # default.storage to input_storage.
                     if indices is not None:
                         raise TypeError("Cannot take a Container instance as "
-                                        "default for a SymbolicInputKit.")
+                                        "default for a SymbolicInput.")
                     input_storage.append(default.storage)
                     default = None
-                elif isinstance(input, SymbolicInputKit):
-                    # If the input is a SymbolicInputKit, it represents more than
-                    # one storage unit. The indices and subinputs lists represent
-                    # which of the kit's inputs are active in this graph, so we
-                    # make as many storage units as needed
-                    if isinstance(default, (list, tuple)) \
-                            and all(isinstance(x, gof.Container) for x in default):
-                        if len(default) == len(indices):
-                            input_storage += [x.storage for x in default]
-                        elif len(default) > len(indices):
-                            input_storage += [default[i].storage for i in indices]
-                        else:
-                            raise ValueError(
-                                'Not enough storage for SymbolicInputKit',
-                                input, indices, default)
-                        default = _NODEFAULT
-                    else:
-                        input_storage += [[None] for i in indices]
                 else:
                     # Normal case: one new, independent storage unit
                     input_storage.append([None])
...
@@ -2550,16 +2532,7 @@ class _Maker(FunctionMaker):  # inheritance buys a few helper functions
             #   storage after each function call
             # - value is the value that will be put in the storage initially
-            # Even though a SymbolicInputKit represents more than one input,
-            # we still only have one entry for the defaults list.
-            if isinstance(input, SymbolicInputKit):
-                if default is _NODEFAULT:
-                    _defaults.append((False, False, None))
-                elif default is None:
-                    _defaults.append((True, True, None))
-                else:
-                    _defaults.append((False, False, default))
-            elif input.update is not None:
+            if input.update is not None:
                 # If the input has an update, then (logically) it is
                 # not required since it is just a parameter and of
                 # course we don't want to refeed the default back into
...
@@ -16,12 +16,11 @@ import numpy
 
 import theano
 from theano import config, gof
-from functools import partial
 from theano.compat import izip
 from theano.gof import graph
 import theano.compile.mode
 from theano.compile.io import (
-    In, SymbolicInput, SymbolicInputKit, SymbolicOutput)
+    In, SymbolicInput, SymbolicOutput)
 from theano.compile.ops import deep_copy_op, view_op
 from theano.gof.graph import is_same_graph
 from theano.gof.op import ops_with_inner_function
...
@@ -286,7 +285,7 @@ class Function(object):
     indices = None
     """
-    List of (SymbolicInput|SymbolicInputKit, indices, [SymbolicInput,...]),
+    List of (SymbolicInput, indices, [SymbolicInput,...]),
     one tuple for each input.
 
     The first tuple element is the SymbolicInput object for the corresponding
...
@@ -396,7 +395,6 @@ class Function(object):
         # self.input_storage inplace.
         for i, ((input, indices, sinputs), (required, refeed, value)) in \
                 enumerate(zip(self.indices, defaults)):
-            # this is true iff input is not a SymbolicInputKit
             if indices is None:
                 # containers is being used as a stack. Here we pop off
                 # the next one.
...
@@ -432,41 +430,6 @@ class Function(object):
                     named_inputs.append(input.name)
                 inv_finder[c] = input
                 containers[:1] = []
-            else:
-                # TODO The following code may need to do something to handle
-                # implicit inputs.
-
-                # The input is a SymbolicInputKit, so we take as many
-                # containers as the Kit provides inputs
-                cs = containers[:len(indices)]
-                # distribute does the initialization of the containers
-                input.distribute(value, indices, cs)
-                f = partial(distribute, indices, cs)
-                # Like before, we set a finder entry for the kit. Note that
-                # we are not mapping to a container but to a function which
-                # can reinitialize all the containers
-                finder[i] = f
-                finder[input] = f
-                if input.name not in finder:
-                    finder[input.name] = f
-                else:
-                    finder[input.name] = DUPLICATE
-                # For each input in the kit and its corresponding
-                # container, we put an entry in finder. This allows
-                # the user to micro-manage elements of the kit if need
-                # be. All containers inherit the required field and
-                # have their own "provided" counter
-                for c, sin in zip(cs, sinputs):
-                    finder[sin.variable] = c
-                    if sin.name not in finder:
-                        finder[sin.name] = c
-                    else:
-                        finder[sin.name] = DUPLICATE
-                    inv_finder[c] = input
-                    c.required = required
-                    c.provided = 0
-                containers[:len(indices)] = []
         self.finder = finder
         self.inv_finder = inv_finder
...
@@ -1033,14 +996,6 @@ def _pickle_Function(f):
     for (input, indices, inputs), (required, refeed, default) in \
             zip(f.indices, f.defaults):
-        if isinstance(input, SymbolicInputKit):
-            li = len(indices)
-            if not default:
-                input_storage.append(ins[:li])
-            else:
-                input_storage.append(default)
-            ins[:li] = []
-        else:
-            input_storage.append(ins[0])
+        input_storage.append(ins[0])
         del ins[0]
...
@@ -1210,7 +1165,7 @@ class FunctionMaker(object):
     @staticmethod
     def wrap_in(input):
-        if isinstance(input, (SymbolicInput, SymbolicInputKit)):
+        if isinstance(input, SymbolicInput):
             return input
         elif isinstance(input, gof.Variable):
             # r -> SymbolicInput(variable=r)
...
@@ -1234,9 +1189,10 @@ class FunctionMaker(object):
         # instances in inputs. For SymbolicInput, this returns None
         # as the list of indices and a list with just the
         # SymbolicInput.
-        if isinstance(sinput, SymbolicInputKit):
-            return sinput.complete(rinputs)
-        elif isinstance(sinput, SymbolicInput):
+        # if isinstance(sinput, SymbolicInputKit):
+        #     return sinput.complete(rinputs)
+        # elif isinstance(sinput, SymbolicInput):
+        if isinstance(sinput, SymbolicInput):
             return [None, [sinput]]
 
     @staticmethod
...
@@ -1858,7 +1814,7 @@ def convert_function_input(input):
         `In`(r, name=name, value=val, update=up, autoname=True)
 
     """
-    if isinstance(input, (SymbolicInput, SymbolicInputKit)):
+    if isinstance(input, SymbolicInput):
         return input
     elif isinstance(input, gof.Constant):
         raise TypeError('A Constant instance is not a legal function input',
...
@@ -1887,7 +1843,7 @@ def convert_function_input(input):
         else:
             raise TypeError("Invalid input syntax: %s (check "
                             "documentation or use an In instance)" % orig)
-    elif isinstance(input[0], (SymbolicInput, SymbolicInputKit)):
+    elif isinstance(input[0], SymbolicInput):
         if len(input) == 1:
             return input[0]
         elif len(input) == 2:
...
@@ -97,69 +97,6 @@ class SymbolicInput(object):
         return str(self)
 
 
-# TODO: FB: I think this isn't used, confirm this and remove.
-class SymbolicInputKit(object):
-    """
-    Represents a group ("kit") of SymbolicInputs. If fed into function or
-    FunctionMaker, only the inputs which are needed to compile the function
-    properly will be taken.
-
-    A SymbolicInputKit provides the distribute function in order to set or
-    initialize several inputs from a single value. Specialized Kits should
-    override it.
-    """
-
-    def __init__(self, name):
-        if not isinstance(name, string_types):
-            raise TypeError('name must be a string (got: %s)' % name)
-        self.name = name
-        self.sinputs = []
-        self.variables = []
-
-    def add_input(self, sinput):
-        """
-        Add a SymbolicInput to this SymbolicInputKit.
-
-        It will be given the next available index.
-        """
-        self.sinputs.append(sinput)
-        self.variables.append(sinput.variable)
-
-    def distribute(self, value, indices, containers):
-        """
-        Given a list of indices corresponding to SymbolicInputs in this kit
-        as well as a corresponding list of containers, initialize all the
-        containers using the provided value.
-        """
-        raise NotImplementedError
-
-    def complete(self, inputs):
-        """
-        Given inputs (a list of Variable instances), checks through all the
-        SymbolicInputs in the kit and return a sorted list of indices and a list
-        of their corresponding SymbolicInputs such that each of them represents
-        some variable in the inputs list.
-
-        Not all the provided inputs will have a corresponding SymbolicInput in
-        the kit.
-        """
-        ret = []
-        for input in inputs:
-            try:
-                i = self.variables.index(input)
-                ret.append((i, self.sinputs[i]))
-            except ValueError:
-                pass
-        ret.sort()
-        if not ret:
-            return [[], []]
-        return list(zip(*ret))
-
-
 class In(SymbolicInput):
     """
     Represents a symbolic input for use with function or FunctionMaker.
...
@@ -88,6 +88,7 @@ def _atexit_print_fn():
                     merge = cum.optimizer_profile[0].merge_profile(
                         cum.optimizer_profile[1],
                         ps.optimizer_profile[1])
+                    assert len(merge) == len(cum.optimizer_profile[1])
                     cum.optimizer_profile = (cum.optimizer_profile[0], merge)
                 except Exception as e:
                     print("Got an exception while merging profile")
...
@@ -5,6 +5,7 @@ Generate and compile C modules for Python.
 from __future__ import absolute_import, print_function, division
 
 import atexit
+import textwrap
 import six.moves.cPickle as pickle
 import logging
 import os
...
@@ -24,6 +25,7 @@ import numpy.distutils  # TODO: TensorType should handle this
 import theano
 from theano.compat import PY3, decode, decode_iter
 from six import b, BytesIO, StringIO, string_types, iteritems
+from six.moves import xrange
 from theano.gof.utils import flatten
 from theano.configparser import config
 from theano.gof.utils import hash_from_code
...
@@ -125,9 +127,7 @@ class ExtFunction(object):
 
 class DynamicModule(object):
-    """
-    WRITEME
-    """
+
     def __init__(self, name=None):
         assert name is None, (
             "The 'name' parameter of DynamicModule"
...
@@ -1807,6 +1807,34 @@ class Compiler(object):
                 output=output, compiler=compiler)
 
 
+def try_march_flag(flags):
+    """
+    Try to compile and run a simple C snippet using the current flags.
+
+    Return: compilation success (True/False), execution success (True/False).
+    """
+    test_code = textwrap.dedent("""\
+        #include <cmath>
+        using namespace std;
+        int main(int argc, char** argv)
+        {
+            float Nx = -1.3787706641;
+            float Sx = 25.0;
+            double r = Nx + sqrt(Sx);
+            if (abs(r - 3.621229) > 0.01)
+            {
+                return -1;
+            }
+            return 0;
+        }
+        """)
+
+    cflags = flags + ['-L' + d for d in theano.gof.cmodule.std_lib_dirs()]
+    compilation_result, execution_result = GCC_compiler.try_compile_tmp(
+        test_code, tmp_prefix='try_march_',
+        flags=cflags, try_run=True)
+    return compilation_result, execution_result
+
+
 class GCC_compiler(Compiler):
     # The equivalent flags of --march=native used by g++.
     march_flags = None
...
@@ -2027,6 +2055,54 @@ class GCC_compiler(Compiler):
             _logger.info("g++ -march=native equivalent flags: %s",
                          GCC_compiler.march_flags)
 
+            # Find a working march flag:
+            #   -- if the current GCC_compiler.march_flags works, we're done.
+            #   -- else replace -march and -mtune with each of
+            #      ['corei7-avx', 'corei7', 'core2'] and retry with all other
+            #      flags and arguments intact.
+            #   -- else drop all other flags and try each -march flag in
+            #      isolation, detected default first.
+            #   -- if none of that worked, set GCC_compiler.march_flags = []
+            #      (for x86).
+            default_compilation_result, default_execution_result = try_march_flag(GCC_compiler.march_flags)
+            if not default_compilation_result or not default_execution_result:
+                march_success = False
+                march_ind = None
+                mtune_ind = None
+                default_detected_flag = []
+                march_flags_to_try = ['corei7-avx', 'corei7', 'core2']
+
+                for m_ in xrange(len(GCC_compiler.march_flags)):
+                    march_flag = GCC_compiler.march_flags[m_]
+                    if 'march' in march_flag:
+                        march_ind = m_
+                        default_detected_flag = [march_flag]
+                    elif 'mtune' in march_flag:
+                        mtune_ind = m_
+
+                for march_flag in march_flags_to_try:
+                    if march_ind is not None:
+                        GCC_compiler.march_flags[march_ind] = '-march=' + march_flag
+                    if mtune_ind is not None:
+                        GCC_compiler.march_flags[mtune_ind] = '-mtune=' + march_flag
+                    compilation_result, execution_result = try_march_flag(GCC_compiler.march_flags)
+                    if compilation_result and execution_result:
+                        march_success = True
+                        break
+
+                if not march_success:
+                    # Perhaps one of the other flags was problematic; try each
+                    # -march flag in isolation, detected default first:
+                    march_flags_to_try = default_detected_flag + march_flags_to_try
+                    for march_flag in march_flags_to_try:
+                        compilation_result, execution_result = try_march_flag(['-march=' + march_flag])
+                        if compilation_result and execution_result:
+                            march_success = True
+                            GCC_compiler.march_flags = ['-march=' + march_flag]
+                            break
+
+                if not march_success:
+                    GCC_compiler.march_flags = []
+
             # Add the detected -march=native equivalent flags
             if march_flags and GCC_compiler.march_flags:
                 cxxflags.extend(GCC_compiler.march_flags)
...
@@ -2099,7 +2175,6 @@ class GCC_compiler(Compiler):
                     include_dirs=None, lib_dirs=None, libs=None,
                     preargs=None, py_module=True, hide_symbols=True):
         """
-
         Parameters
         ----------
         module_name : str
...
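The march-flag fallback strategy above can be sketched in isolation. The ``try_march_flag`` below is a fake stand-in (the real one compiles and runs a C snippet), and the flag names are illustrative; only the retry ordering is the point:

```python
# Fake probe: pretend only '-march=core2' works on this machine.
# The real try_march_flag compiles and runs a C test program.
def try_march_flag(flags):
    ok = '-march=core2' in flags
    return ok, ok  # (compilation success, execution success)

detected = ['-march=native-equivalent', '-mtune=native-equivalent', '-O3']
flags_to_try = ['corei7-avx', 'corei7', 'core2']

chosen = []
# 1) Try the detected flags as-is.
if all(try_march_flag(detected)):
    chosen = detected
else:
    # 2) Substitute fallback -march values, keeping the other flags intact.
    for m in flags_to_try:
        trial = ['-march=' + m if f.startswith('-march') else f
                 for f in detected]
        if all(try_march_flag(trial)):
            chosen = trial
            break
    else:
        # 3) Last resort: each fallback -march flag in isolation.
        for m in flags_to_try:
            if all(try_march_flag(['-march=' + m])):
                chosen = ['-march=' + m]
                break

print(chosen)  # ['-march=core2', '-mtune=native-equivalent', '-O3']
```

If every step fails, the real code gives up and sets ``GCC_compiler.march_flags = []`` so compilation proceeds without architecture-specific tuning.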
@@ -315,17 +315,17 @@ class SeqOptimizer(Optimizer, list):
               " time - (name, class, index, nodes before, nodes after) - validate time",
               file=stream)
         ll = []
-        for opt in opts:
+        for (opt, nb_n) in zip(opts, nb_nodes):
             if hasattr(opt, "__name__"):
                 name = opt.__name__
             else:
                 name = opt.name
             idx = opts.index(opt)
             ll.append((name, opt.__class__.__name__,
-                       idx))
-        lll = sorted(zip(prof, ll, nb_nodes), key=lambda a: a[0])
-        for (t, opt, nb_n) in lll[::-1]:
+                       idx) + nb_n)
+        lll = sorted(zip(prof, ll), key=lambda a: a[0])
+        for (t, opt) in lll[::-1]:
             i = opt[2]
             if sub_validate_time:
                 val_time = sub_validate_time[i + 1] - sub_validate_time[i]
...
@@ -345,8 +345,8 @@ class SeqOptimizer(Optimizer, list):
         Merge 2 profiles returned by this class's apply() fct.
         """
-        new_t = []
-        new_l = []
+        new_t = []  # the time for the optimization
+        new_l = []  # the optimization
         new_sub_profile = []
         # merge common (same object) opt
         for l in set(prof1[0]).intersection(set(prof2[0])):
...
@@ -399,6 +399,12 @@ class SeqOptimizer(Optimizer, list):
                 new_sub_profile.append(p[6][idx])
 
         new_opt = SeqOptimizer(*new_l)
+        new_nb_nodes = []
+        for p1, p2 in zip(prof1[8], prof2[8]):
+            new_nb_nodes.append((p1[0] + p2[0], p1[1] + p2[1]))
+        new_nb_nodes.extend(prof1[8][len(new_nb_nodes):])
+        new_nb_nodes.extend(prof2[8][len(new_nb_nodes):])
+
         new_callbacks_times = merge_dict(prof1[9], prof2[9])
         # We need to assert based on the name as we merge also based on
         # the name.
...
@@ -410,6 +416,7 @@ class SeqOptimizer(Optimizer, list):
         return (new_opt, new_t, prof1[2] + prof2[2],
                 prof1[3] + prof2[3],
                 -1, -1, new_sub_profile, [],
+                new_nb_nodes,
                 new_callbacks_times)
...
@@ -2313,10 +2320,18 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                         "%f with the theano flag 'optdb.max_use_ratio'." %
                         config.optdb.max_use_ratio)
         fgraph.remove_feature(change_tracker)
+        assert len(loop_process_count) == len(loop_timing)
+        assert len(loop_process_count) == len(global_opt_timing)
+        assert len(loop_process_count) == len(nb_nodes)
+        assert len(loop_process_count) == len(io_toposort_timing)
+        assert len(loop_process_count) == len(global_sub_profs)
+        assert len(loop_process_count) == len(final_sub_profs)
+        assert len(loop_process_count) == len(cleanup_sub_profs)
         return (self, loop_timing, loop_process_count,
                 (start_nb_nodes, end_nb_nodes, max_nb_nodes),
                 global_opt_timing, nb_nodes, time_opts, io_toposort_timing,
-                node_created, global_sub_profs, final_sub_profs, cleanup_sub_profs)
+                node_created, global_sub_profs, final_sub_profs,
+                cleanup_sub_profs)
 
     def print_summary(self, stream=sys.stdout, level=0, depth=-1):
         name = getattr(self, 'name', None)
...
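The ``new_nb_nodes`` merge above sums per-optimizer ``(nodes before, nodes after)`` counts pairwise, then keeps the tail of whichever profile is longer. A standalone sketch with made-up counts (not Theano's actual profile tuples):

```python
# Hypothetical per-optimizer (nodes_before, nodes_after) counts
# from two profiles being merged.
prof1_nb_nodes = [(10, 8), (8, 8), (8, 5)]
prof2_nb_nodes = [(12, 9), (9, 7)]

new_nb_nodes = []
# Sum counts pairwise for optimizers present in both profiles...
for p1, p2 in zip(prof1_nb_nodes, prof2_nb_nodes):
    new_nb_nodes.append((p1[0] + p2[0], p1[1] + p2[1]))
# ...then keep the tail entries that only one profile has.
new_nb_nodes.extend(prof1_nb_nodes[len(new_nb_nodes):])
new_nb_nodes.extend(prof2_nb_nodes[len(new_nb_nodes):])

print(new_nb_nodes)  # [(22, 17), (17, 15), (8, 5)]
```

At most one of the two ``extend`` calls adds anything, since after the first ``extend`` the merged list is at least as long as the longer input.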
@@ -75,10 +75,11 @@ class GpuElemwise(HideC, Elemwise):
             pass
 
         try:
             support_code = self.scalar_op.c_support_code()
-            if (support_code.strip() != "#define THEANO_MACRO_MOD(x,y) (x % y)" and
-                    support_code.strip() != ""):
+            if "struct" in support_code:
                 # The macro is fine, the C++ struct is not.
-                raise SupportCodeError(support_code)
+                raise SupportCodeError(
+                    "struct aren't supported in GpuElemwise support_code" +
+                    support_code)
         except MethodNotDefined:
             pass
...
@@ -673,6 +673,15 @@ def local_gpua_advanced_incsubtensor(node, context_name):
                                           set_instead_of_inc=set_instead_of_inc)
 
 
+@register_inplace()
+@local_optimizer([GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20])
+def local_advincsub1_gpua_inplace(node):
+    if isinstance(node.op, (GpuAdvancedIncSubtensor1,
+                            GpuAdvancedIncSubtensor1_dev20)):
+        if not node.op.inplace:
+            return [node.op.clone_inplace()(*node.inputs)]
+
+
 @register_opt('fast_compile')
 @op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
 def local_gpua_careduce(node, context_name):
...
@@ -881,6 +890,10 @@ def local_gpua_softmaxwithbias(node, context_name):
 @register_opt('fast_compile')
 @op_lifter([theano.tensor.opt.Assert])
 def local_assert(node, context_name):
+    # Check if the input node is already on the GPU
+    if isinstance(node.inputs[0].type, GpuArrayType):
+        return
+
     return [host_from_gpu(node.op(as_gpuarray_variable(node.inputs[0],
                                                        context_name),
                           *node.inputs[1:]))]
...
@@ -946,7 +959,7 @@ def local_lift_abstractconv2d(node, context_name):
     return [node.op(*inps)]
 
 # Register this here so that it goes after the abstract lifting
-register_opt()(conv_groupopt)
+register_opt('fast_compile')(conv_groupopt)
 
 
 @register_opt("low_memory")
...
@@ -608,11 +608,6 @@ class GpuAdvancedIncSubtensor1(Op):
         }
         step[0] = 0;
         num_indices = PyArray_SIZE(%(ind)s);
-        if ((num_indices - 1) > LONG_MAX) {
-            PyErr_Format(PyExc_AssertionError,
-                         "num_indices %%lld exceeds LONG_MAX + 1", (long long)num_indices);
-            %(fail)s
-        }
         if (!%(inplace)s) {
             %(out)s = theano_try_copy(%(out)s, %(x)s);
             if (%(out)s == NULL)
...
@@ -622,6 +617,12 @@ class GpuAdvancedIncSubtensor1(Op):
             %(out)s = %(x)s;
             Py_INCREF(%(out)s);
         }
+        if (num_indices != 0) {
+        if ((num_indices - 1) > LONG_MAX) {
+            PyErr_Format(PyExc_AssertionError,
+                         "num_indices %%lld exceeds LONG_MAX + 1", (long long)num_indices);
+            %(fail)s
+        }
         broadcast_y = PyGpuArray_DIM(%(y)s, 0) == 1;
         for (j = 0; j < num_indices; j++) {
             start[0] = *(dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
...
@@ -659,13 +660,14 @@ class GpuAdvancedIncSubtensor1(Op):
               if (ret != GA_NO_ERROR)
                   PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
         }
+        }
         """ % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0],
                    fail=sub['fail'], inplace=int(self.inplace),
                    nd=node.inputs[0].ndim,
                    set_instead_of_inc=int(self.set_instead_of_inc))
 
     def c_code_cache_version(self):
-        return (0,)
+        return (1,)
 
 
 class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
...
@@ -590,6 +590,8 @@ def get_scalar_constant_value(orig_v, elemwise=True,
     ----------
     elemwise : bool
         If False, we won't try to go into elemwise. So this call is faster.
+        But we still investigate in Second Elemwise (as this is a substitute
+        for Alloc)
     only_process_constants : bool
         If True, we only attempt to obtain the value of `orig_v` if it's
         directly constant and don't try to dig through dimshuffles, fills,
@@ -650,13 +652,17 @@ def get_scalar_constant_value(orig_v, elemwise=True,
                 ret = [[None]]
                 v.owner.op.perform(v.owner, const, ret)
                 return ret[0][0].copy()
-            elif elemwise and isinstance(v.owner.op, Elemwise):
+            # In fast_compile, we don't enable local_fill_to_alloc, so
+            # we need to investigate Second as Alloc. So elemwise
+            # don't disable the check for Second.
+            elif isinstance(v.owner.op, Elemwise):
                 if isinstance(v.owner.op.scalar_op, scal.Second):
                     # We don't need both input to be constant for second
                     shp, val = v.owner.inputs
                     v = val
                     continue
-                elif isinstance(v.owner.op.scalar_op,
+                elif elemwise and isinstance(
+                        v.owner.op.scalar_op,
                         get_scalar_constant_value_elemwises):
                     const = [get_scalar_constant_value(i)
                              for i in v.owner.inputs]
...
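The `get_scalar_constant_value` change above can be illustrated with a small self-contained sketch (this is not Theano's real API; `Const`, `Second`, and `get_scalar_constant` here are hypothetical stand-ins): `Second(shp, val)` merely broadcasts `val` to a shape, so its scalar value is simply the value of `val`. Following that edge is cheap, which is why the diff keeps doing it even when the expensive general Elemwise folding is disabled with `elemwise=False`.

```python
class Const:
    """A directly known constant."""
    def __init__(self, value):
        self.value = value


class Second:
    """second(shp, val): broadcast val to shp; its value is val's value."""
    def __init__(self, shp, val):
        self.inputs = (shp, val)


def get_scalar_constant(v, elemwise=True):
    while True:
        if isinstance(v, Const):
            return v.value
        if isinstance(v, Second):
            # Cheap: always follow the value input, regardless of `elemwise`,
            # mirroring the behavior the diff introduces.
            _, v = v.inputs
            continue
        if not elemwise:
            raise ValueError("not a constant (elemwise folding disabled)")
        raise ValueError("unhandled op")


# A constant hidden behind two nested Second (fill-like) applications.
x = Second(Const((3, 4)), Second(Const((4,)), Const(7)))
```

Even with `elemwise=False`, the sketch digs through both `Second` layers and recovers the constant.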
File mode changed from 100644 to 100755
@@ -905,9 +905,11 @@ def softmax_simplifier(numerators, denominators):
                 matching_denom = denominator
                 break
         if matching_denom:
+            softmax = softmax_op(x)
+            copy_stack_trace(numerator, softmax)
             numerators.remove(numerator)
             denominators.remove(matching_denom)
-            numerators.append(softmax_op(x))
+            numerators.append(softmax)
     return numerators, denominators

 opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
...
@@ -612,6 +612,7 @@ def local_exp_over_1_plus_exp(node):
         else:
             # case: 1/(1+exp(x))
             sigmoids.append(sigmoid(-t))
+            copy_stack_trace(node.outputs[0], sigmoids[-1])

     if not sigmoids:  # we didn't find any. abort
         return
@@ -625,12 +626,17 @@ def local_exp_over_1_plus_exp(node):
         if num_neg ^ denom_neg:
             new_num = -new_num

+        copy_stack_trace(num, new_num)
+
         if len(denom_rest) == 0:
             return [new_num]
         elif len(denom_rest) == 1:
-            return [new_num / denom_rest[0]]
+            out = new_num / denom_rest[0]
         else:
-            return [new_num / tensor.mul(*denom_rest)]
+            out = new_num / tensor.mul(*denom_rest)
+
+        copy_stack_trace(node.outputs[0], out)
+        return [out]

 def parse_mul_tree(root):
@@ -923,6 +929,7 @@ def local_sigm_times_exp(node):
         exp(x) * sigm(-x) -> sigm(x)
         exp(-x) * sigm(x) -> sigm(-x)
+        todo: add stack traces to the intermediate variables
     """
     # Bail early if it is not a multiplication.
     if node.op != tensor.mul:
...
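The `copy_stack_trace` calls threaded through these optimizations all serve one purpose: when a rewrite replaces a variable, the replacement should inherit the creation stack trace of the variable it replaces, so later error messages still point at user code. A minimal sketch of that behavior (the real helper lives in `theano.gof.opt`; the `Var` class and this `copy_stack_trace` body are illustrative stand-ins):

```python
class Var:
    """Toy graph variable carrying a tag, like Theano variables do."""
    def __init__(self, trace=None):
        self.tag = type("Tag", (), {})()
        if trace is not None:
            self.tag.trace = trace


def copy_stack_trace(from_var, to_var):
    # Hypothetical stand-in for theano.gof.opt's helper: propagate the
    # originating stack trace, if any, to the replacement variable.
    if hasattr(from_var.tag, "trace"):
        to_var.tag.trace = from_var.tag.trace
    return to_var


old = Var(trace=["user_script.py:42"])
new = copy_stack_trace(old, Var())
```

After the copy, the freshly created replacement reports the same origin as the variable it replaced.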
@@ -7,6 +7,7 @@ from nose.tools import assert_raises

 import theano
 from theano import tensor
+from theano.gof.opt import check_stack_trace
 from theano.tests import unittest_tools as utt
 from theano.tensor.nnet import corr, abstract_conv as conv
 from theano.tensor.nnet.abstract_conv import get_conv_output_shape
@@ -98,7 +99,7 @@ class BaseTestConv2d(unittest.TestCase):
     def run_fwd(self, inputs_shape, filters_shape, ref=conv_corr,
                 subsample=(1, 1), verify_grad=True, mode=None,
                 border_mode='valid', filter_flip=True, provide_shape=False,
-                target_op=None):
+                target_op=None, check_trace=False):
         inputs_val = numpy.random.random(inputs_shape).astype('float32')
         filters_val = numpy.random.random(filters_shape).astype('float32')
@@ -133,8 +134,9 @@ class BaseTestConv2d(unittest.TestCase):
         if target_op is not None:
             assert any([isinstance(n.op, target_op) for n
                         in f.maker.fgraph.toposort()])
+            if check_trace:
+                self.assertTrue(check_stack_trace(f, ops_to_check=target_op))
-        self.assertTrue(hasattr(f.maker.fgraph.outputs[0].tag, 'trace'))
         res_ref = numpy.array(f_ref())
         res = numpy.array(f())
         utt.assert_allclose(res_ref, res)
@@ -148,7 +150,7 @@ class BaseTestConv2d(unittest.TestCase):
     def run_gradweight(self, inputs_shape, filters_shape, output_shape,
                        ref=conv_corr_gw, subsample=(1, 1), filter_flip=True,
                        verify_grad=True, mode=None, border_mode='valid',
-                       provide_shape=False, target_op=None):
+                       provide_shape=False, target_op=None, check_trace=False):
         inputs_val = numpy.random.random(inputs_shape).astype('float32')
         output_val = numpy.random.random(output_shape).astype('float32')
@@ -177,12 +179,13 @@ class BaseTestConv2d(unittest.TestCase):
                                   subsample=subsample,
                                   conv_mode=conv_mode)
         f = theano.function([], c, mode=mode)
-        self.assertTrue(hasattr(f.maker.fgraph.outputs[0].tag, 'trace'))
         f_ref = theano.function([], c_ref, mode='FAST_RUN')
         if target_op is not None:
             assert any([isinstance(n.op, target_op) for n
                         in f.maker.fgraph.toposort()])
+            if check_trace:
+                self.assertTrue(check_stack_trace(f, ops_to_check=target_op))
         res_ref = numpy.array(f_ref())
         res = numpy.array(f())
@@ -201,7 +204,7 @@ class BaseTestConv2d(unittest.TestCase):
     def run_gradinput(self, inputs_shape, filters_shape, output_shape,
                       ref=conv_corr_gi, subsample=(1, 1), filter_flip=True,
                       verify_grad=True, mode=None, border_mode='valid',
-                      provide_shape=False, target_op=None):
+                      provide_shape=False, target_op=None, check_trace=False):
         output_val = numpy.random.random(output_shape).astype('float32')
         filters_val = numpy.random.random(filters_shape).astype('float32')
@@ -227,12 +230,13 @@ class BaseTestConv2d(unittest.TestCase):
                   border_mode=border_mode, subsample=subsample,
                   conv_mode=conv_mode)
         f = theano.function([], c, mode=mode)
-        self.assertTrue(hasattr(f.maker.fgraph.outputs[0].tag, 'trace'))
         f_ref = theano.function([], c_ref, mode='FAST_RUN')
         if target_op is not None:
             assert any([isinstance(n.op, target_op) for n
                         in f.maker.fgraph.toposort()])
+            if check_trace:
+                self.assertTrue(check_stack_trace(f, ops_to_check=target_op))
         res_ref = numpy.array(f_ref())
         res = numpy.array(f())
@@ -291,15 +295,18 @@ class TestCorrConv2d(BaseTestConv2d):
             raise SkipTest("Need blas to test conv2d")
         self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                      verify_grad=True, provide_shape=provide_shape,
-                     border_mode=b, filter_flip=flip, target_op=CorrMM)
+                     border_mode=b, filter_flip=flip, target_op=CorrMM,
+                     check_trace=True)
         self.run_gradweight(inputs_shape=i, filters_shape=f,
                             output_shape=o, subsample=s, verify_grad=True,
                             provide_shape=provide_shape, border_mode=b,
-                            filter_flip=flip, target_op=CorrMM_gradWeights)
+                            filter_flip=flip, target_op=CorrMM_gradWeights,
+                            check_trace=True)
         self.run_gradinput(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s, verify_grad=True,
                            provide_shape=provide_shape, border_mode=b,
-                           filter_flip=flip, target_op=CorrMM_gradInputs)
+                           filter_flip=flip, target_op=CorrMM_gradInputs,
+                           check_trace=True)

 class TestCpuConv2d(BaseTestConv2d):
@@ -343,7 +350,8 @@ class TestCpuConv2d(BaseTestConv2d):
             self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                          verify_grad=(gradweight_OK and gradinput_OK),
                          mode=mode, provide_shape=provide_shape,
-                         border_mode=b, filter_flip=flip, target_op=ConvOp)
+                         border_mode=b, filter_flip=flip, target_op=ConvOp,
+                         check_trace=True)
         else:
             self.assertRaises(AssertionError,
                               self.run_fwd,
@@ -354,7 +362,8 @@ class TestCpuConv2d(BaseTestConv2d):
                               mode=mode,
                               provide_shape=provide_shape,
                               border_mode=b,
-                              filter_flip=flip)
+                              filter_flip=flip,
+                              check_trace=True)
         if gradweight_OK:
             if not theano.config.blas.ldflags:
@@ -364,7 +373,8 @@ class TestCpuConv2d(BaseTestConv2d):
                                 verify_grad=False, mode=mode,
                                 provide_shape=provide_shape, border_mode=b,
                                 filter_flip=flip,
-                                target_op=(ConvOp, ConvGrad3D))
+                                target_op=(ConvOp, ConvGrad3D),
+                                check_trace=True)
             else:
                 self.assertRaises(AssertionError,
                                   self.run_gradweight,
@@ -376,7 +386,8 @@ class TestCpuConv2d(BaseTestConv2d):
                                   mode=mode,
                                   provide_shape=provide_shape,
                                   border_mode=b,
-                                  filter_flip=flip)
+                                  filter_flip=flip,
+                                  check_trace=True)
         if gradinput_OK:
             if not theano.config.blas.ldflags:
@@ -386,7 +397,8 @@ class TestCpuConv2d(BaseTestConv2d):
                                verify_grad=False, mode=mode,
                                provide_shape=provide_shape, border_mode=b,
                                filter_flip=flip,
-                               target_op=(ConvOp, ConvTransp3D))
+                               target_op=(ConvOp, ConvTransp3D),
+                               check_trace=True)
         else:
             self.assertRaises(AssertionError,
                               self.run_gradinput,
@@ -398,7 +410,8 @@ class TestCpuConv2d(BaseTestConv2d):
                               mode=mode,
                               provide_shape=provide_shape,
                               border_mode=b,
-                              filter_flip=flip)
+                              filter_flip=flip,
+                              check_trace=True)

 def test_constant_shapes():
...
@@ -10,6 +10,7 @@ except ImportError:
 from six.moves import xrange

 import theano
+from theano.gof.opt import check_stack_trace
 from theano.tensor.nnet.conv3d2d import *
 import theano.tests.unittest_tools as utt
@@ -73,10 +74,11 @@ def pyconv3d(signals, filters):
             r_i += o_i[Tf2:o_i_sh0-Tf2, Hf2:-Hf2, Wf2:-Wf2]
     return rval

 def check_diagonal_subtensor_view_traces(fn):
-    for apply_node in fn.maker.fgraph.apply_nodes:
-        if isinstance(apply_node.op, (DiagonalSubtensor, IncDiagonalSubtensor)):
-            assert hasattr(apply_node.outputs[0].tag, 'trace')
+    assert check_stack_trace(
+        fn, ops_to_check=(DiagonalSubtensor, IncDiagonalSubtensor))

 def test_conv3d(mode=mode_without_gpu, shared=theano.tensor._shared):
     if ndimage is None:
@@ -150,7 +152,6 @@ def test_conv3d(mode=mode_without_gpu, shared=theano.tensor._shared):
     newconv3d = theano.function([], [],
                                 updates={s_output: out},
                                 mode=mode)
-    check_diagonal_subtensor_view_traces(newconv3d)
     t0 = time.time()
     newconv3d()
@@ -162,7 +163,6 @@ def test_conv3d(mode=mode_without_gpu, shared=theano.tensor._shared):
                                  (s_signals, gsignals)],
                                 mode=mode,
                                 name='grad')
-    check_diagonal_subtensor_view_traces(gnewconv3d)
     t0 = time.time()
     gnewconv3d()
...
 from __future__ import absolute_import, print_function, division
 import theano
 from theano import tensor
-from theano.tensor.nnet.blocksparse import sparse_block_dot
+from theano.gof.opt import check_stack_trace
+from theano.tensor.nnet.blocksparse import (
+    sparse_block_dot, sparse_block_gemv_inplace, sparse_block_outer_inplace,
+    sparse_block_gemv, sparse_block_outer)

 def test_blocksparse_inplace_gemv_opt():
@@ -14,12 +17,13 @@ def test_blocksparse_inplace_gemv_opt():
     o = sparse_block_dot(W, h, iIdx, b, oIdx)

     f = theano.function([W, h, iIdx, b, oIdx], o)
-    assert hasattr(f.maker.fgraph.outputs[0].tag, 'trace')

     if theano.config.mode == "FAST_COMPILE":
         assert not f.maker.fgraph.toposort()[-1].op.inplace
+        assert check_stack_trace(f, ops_to_check=[sparse_block_gemv])
     else:
         assert f.maker.fgraph.toposort()[-1].op.inplace
+        assert check_stack_trace(f, ops_to_check=[sparse_block_gemv_inplace])

 def test_blocksparse_inplace_outer_opt():
@@ -31,13 +35,12 @@ def test_blocksparse_inplace_outer_opt():
     o = sparse_block_dot(W, h, iIdx, b, oIdx)

-    theano.printing.debugprint(tensor.grad(o.sum(), wrt=W))
     f = theano.function([W, h, iIdx, b, oIdx],
                         [o, tensor.grad(o.sum(), wrt=W)])
-    assert hasattr(f.maker.fgraph.outputs[0].tag, 'trace')

     if theano.config.mode == "FAST_COMPILE":
         assert not f.maker.fgraph.toposort()[-1].op.inplace
+        assert check_stack_trace(f, ops_to_check=sparse_block_outer)
     else:
         assert f.maker.fgraph.toposort()[-1].op.inplace
+        assert check_stack_trace(f, ops_to_check=sparse_block_outer_inplace)
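These tests migrate from ad-hoc `hasattr(..., 'trace')` checks to `check_stack_trace`. A hedged sketch of the property being verified (the real `check_stack_trace` lives in `theano.gof.opt` and inspects a compiled function's graph; the `nodes` representation and helper below are illustrative only): every node whose op matches `ops_to_check` must carry a non-empty stack trace on its outputs.

```python
def check_stack_trace_sketch(nodes, ops_to_check):
    """Return True iff every matching node's outputs all have a
    non-empty trace. `nodes` is a list of (op, outputs) pairs."""
    for op, outputs in nodes:
        if isinstance(op, ops_to_check):
            if not all(getattr(o, "trace", None) for o in outputs):
                return False
    return True


class GemvOp:
    """Hypothetical op class standing in for e.g. sparse_block_gemv."""
    pass


good_nodes = [(GemvOp(), [type("V", (), {"trace": ["f.py:1"]})()])]
bad_nodes = [(GemvOp(), [type("V", (), {"trace": []})()])]
```

The check passes when traces were propagated and fails when an optimization dropped them, which is exactly the regression these tests guard against.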
@@ -1031,6 +1031,8 @@ class ShapeFeature(object):
             # don't make the optimizer merge a zillion ones together
             # by always returning the same object to represent 1
             return self.lscalar_one
+        if type(s_i) is float and int(s_i) == s_i:
+            s_i = int(s_i)
         if (type(s_i) in integer_types or
                 isinstance(s_i, numpy.integer) or
                 (isinstance(s_i, numpy.ndarray) and s_i.ndim == 0)):
@@ -3753,8 +3755,20 @@ def local_useless_switch(node):
             if out.type.broadcastable != node.outputs[0].type.broadcastable:
                 # We need to copy data to the new dimensions during execution
-                out = T.alloc(out, *[node.outputs[0].shape[i] for i
-                                     in xrange(out.ndim)])
+
+                # We should not depend on node.outputs as this would
+                # make the new node depend on the old one that will
+                # get optimized again. So this create a cycle.
+                shps = []
+                for idx, (b1, b2), in enumerate(zip(out.type.broadcastable,
+                                                    node.outputs[0].type.broadcastable)):
+                    if b1 == b2:
+                        shps.append(out.shape[idx])
+                    elif not node.inputs[1].type.broadcastable[idx]:
+                        shps.append(node.inputs[1].shape[idx])
+                    else:
+                        shps.append(node.inputs[2].shape[idx])
+                out = T.alloc(out, *shps)
             else:
                 out = out
...
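The new `local_useless_switch` shape logic can be sketched in pure Python (hypothetical names; the real code works on symbolic Theano variables): for each dimension, keep the replacement's own shape when broadcastability agrees with the old output, otherwise borrow the shape from whichever switch branch is not broadcastable in that dimension. This avoids referencing the old output node, which would reintroduce it into the graph and create a cycle.

```python
def pick_alloc_shape(out_bcast, target_bcast, br1_bcast,
                     out_shape, br1_shape, br2_shape):
    """Choose per-dimension sizes for T.alloc, mirroring the diff.

    out_*:    broadcast pattern / shape of the replacement variable
    target_*: broadcast pattern of the node's original output
    br1_*, br2_*: the two switch branch inputs (node.inputs[1], [2])
    """
    shps = []
    for idx, (b1, b2) in enumerate(zip(out_bcast, target_bcast)):
        if b1 == b2:
            # Broadcastability agrees: the replacement's own size is right.
            shps.append(out_shape[idx])
        elif not br1_bcast[idx]:
            # Borrow the size from the non-broadcastable branch input.
            shps.append(br1_shape[idx])
        else:
            shps.append(br2_shape[idx])
    return shps
```

For example, a replacement broadcastable in dim 0 where the original output was not gets its dim-0 size from a branch input instead.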
@@ -14,7 +14,7 @@ from six.moves import xrange
 import six.moves.builtins as builtins

 import theano
-from theano import gof, Op, tensor, Variable, Apply
+from theano import gof, OpenMPOp, tensor, Variable, Apply

 def max_pool_2d_same_size(input, patch_size):
@@ -114,7 +114,7 @@ def pool_2d(input, ds, ignore_border=None, st=None, padding=(0, 0),
     return tensor.reshape(output, outshp, ndim=input.ndim)

-class Pool(Op):
+class Pool(OpenMPOp):
     """
     For N-dimensional tensors, consider that the last two dimensions span
     images. This Op downsamples these images by taking the max, sum or average
@@ -236,7 +238,8 @@ class Pool(Op):
         return rval

     def __init__(self, ds, ignore_border=False, st=None, padding=(0, 0),
-                 mode='max'):
+                 mode='max', openmp=None):
+        super(Pool, self).__init__(openmp=openmp)
         self.ds = tuple(ds)
         if not all([isinstance(d, integer_types) for d in ds]):
             raise ValueError(
@@ -350,7 +351,9 @@ class Pool(Op):
                           x, gz)]

     def c_headers(self):
-        return ['<algorithm>']
+        headers = ['<algorithm>']
+        headers += super(Pool, self).c_headers()
+        return headers

     def c_code(self, node, name, inp, out, sub):
         if self.mode not in ('max', 'sum', 'average_exc_pad', 'average_inc_pad'):
@@ -362,6 +365,10 @@ class Pool(Op):
         ds0, ds1 = self.ds
         st0, st1 = self.st
         pd0, pd1 = self.padding
+        if self.openmp:
+            omp_parallel = '#pragma omp parallel for private(r_st, r_end, c_st, c_end, collector) schedule(static)'
+        else:
+            omp_parallel = ''
         ccode = """
         int typenum = PyArray_ObjectType((PyObject*)%(x)s, 0);
         int z_r, z_c; // shape of the output
@@ -443,13 +450,15 @@ class Pool(Op):
           %(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
         }
         // used for indexing a pool region inside the input
-        int r_st, r_end, c_st, c_end;
         dtype_%(x)s collector; // temp var for the value in a region
         if (z_r && z_c)
         {
-            for(int b=0; b<PyArray_DIMS(%(x)s)[0]; b++){
-              for(int k=0; k<PyArray_DIMS(%(x)s)[1]; k++){
-                for(int i=0; i< z_r; i++){
+            int r_st, r_end, c_st, c_end;
+            %(omp_parallel)s
+            for(int t = 0; t < PyArray_DIMS(%(x)s)[0] * PyArray_DIMS(%(x)s)[1]; t++){
+                int b = t %% PyArray_DIMS(%(x)s)[0];
+                int k = t / PyArray_DIMS(%(x)s)[0];
+                for(int i=0; i < z_r; i++){
                   r_st = i * %(st0)s;
                   r_end = r_st + %(ds0)s;
                   // skip the padding
@@ -526,15 +535,14 @@ class Pool(Op):
                 }
               }
             }
-          }
         """
         return ccode % locals()

     def c_code_cache_version(self):
-        return (0, 6, 8, 4)
+        return (0, 6, 8, 4, self.openmp)


-class PoolGrad(Op):
+class PoolGrad(OpenMPOp):
     __props__ = ('ds', 'ignore_border', 'st', 'padding', 'mode')

     @staticmethod
@@ -617,7 +625,7 @@ class PoolGrad(Op):
             rval = list(imgshape[:-2]) + [nr, nc]
         return rval

-    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='max'):
+    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='max', openmp=None):
         self.ds = tuple(ds)
         self.ignore_border = ignore_border
         if st is None:
@@ -629,14 +637,15 @@ class PoolGrad(Op):
                 "Pool mode parameter only support 'max', 'sum',"
                 " 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode)
         self.mode = mode
+        super(PoolGrad, self).__init__(openmp=openmp)

     def infer_shape(self, node, in_shapes):
         return [in_shapes[0]]


 class MaxPoolGrad(PoolGrad):
-    def __init__(self, ds, ignore_border, st=None, padding=(0, 0)):
-        PoolGrad.__init__(self, ds, ignore_border, st, padding, mode='max')
+    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), openmp=None):
+        PoolGrad.__init__(self, ds, ignore_border, st, padding, 'max', openmp)

     def make_node(self, x, maxout, gz):
         # make_node should only be called by the grad function of
@@ -708,6 +717,10 @@ class MaxPoolGrad(PoolGrad):
         ds0, ds1 = self.ds
         st0, st1 = self.st
         pd0, pd1 = self.padding
+        if self.openmp:
+            omp_parallel = '#pragma omp parallel for private(r_st, r_end, c_st, c_end, maximum) schedule(static)'
+        else:
+            omp_parallel = ''
         return """
         // sanity checks
         int x_typenum = PyArray_ObjectType((PyObject*)%(x)s, 0);
@@ -757,13 +770,15 @@ class MaxPoolGrad(PoolGrad):
         else {
           PyArray_FILLWBYTE(%(gx)s, 0);
         }
-        int r_st, r_end, c_st, c_end; // used to index into the input img x
         dtype_%(z)s maximum; // temp var for maximum value in a region
         if (z_r && z_c)
         {
-            for(int b=0; b<PyArray_DIMS(%(x)s)[0]; b++){
-              for(int k=0; k<PyArray_DIMS(%(x)s)[1]; k++){
-                for(int i=0; i< z_r; i++){
+            int r_st, r_end, c_st, c_end;
+            %(omp_parallel)s
+            for(int t = 0; t < PyArray_DIMS(%(x)s)[0] * PyArray_DIMS(%(x)s)[1]; t++){
+                int b = t %% PyArray_DIMS(%(x)s)[0];
+                int k = t / PyArray_DIMS(%(x)s)[0];
+                for(int i=0; i < z_r; i++){
                   r_st = i * %(st0)s;
                   r_end = r_st + %(ds0)s;
                   // skip the padding
@@ -803,11 +818,10 @@ class MaxPoolGrad(PoolGrad):
               }
             }
           }
-        }
         """ % locals()

     def c_code_cache_version(self):
-        return (0, 7)
+        return (0, 7, self.openmp)


 class AveragePoolGrad(PoolGrad):
@@ -895,10 +909,10 @@ class AveragePoolGrad(PoolGrad):
                             st=self.st, padding=self.padding, mode=self.mode)(ggx)]


-class DownsampleFactorMaxGradGrad(Op):
+class DownsampleFactorMaxGradGrad(OpenMPOp):
     __props__ = ('ds', 'ignore_border', 'st', 'padding', 'mode')

-    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='max'):
+    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='max', openmp=None):
         self.ds = tuple(ds)
         if not all([isinstance(d, integer_types) for d in ds]):
             raise ValueError(
@@ -917,6 +931,7 @@ class DownsampleFactorMaxGradGrad(Op):
             raise NotImplementedError(
                 'padding_h and padding_w must be smaller than strides')
         self.mode = mode
+        super(DownsampleFactorMaxGradGrad, self).__init__(openmp=openmp)
         assert self.mode == 'max'

     def make_node(self, x, maxout, gz):
@@ -990,6 +1005,10 @@ class DownsampleFactorMaxGradGrad(Op):
         ds0, ds1 = self.ds
         st0, st1 = self.st
         pd0, pd1 = self.padding
+        if self.openmp:
+            omp_parallel = '#pragma omp parallel for private(r_st, r_end, c_st, c_end, maximum) schedule(static)'
+        else:
+            omp_parallel = ''
         return """
         int z_typenum = PyArray_ObjectType((PyObject*)%(maxout)s, 0);
         int z_r, z_c;
@@ -1017,10 +1036,12 @@ class DownsampleFactorMaxGradGrad(Op):
           PyArray_FILLWBYTE(%(z)s, 0);
         }
         dtype_%(maxout)s maximum; // temp var for maximum value in a region
-        int r_st, r_end, c_st, c_end; // used to index into the input img x
-        for(int b=0; b<PyArray_DIMS(%(x)s)[0]; b++){
-          for(int k=0; k<PyArray_DIMS(%(x)s)[1]; k++){
-            for(int i=0; i< z_r; i++){
+        int r_st, r_end, c_st, c_end;
+        %(omp_parallel)s
+        for(int t = 0; t < PyArray_DIMS(%(x)s)[0] * PyArray_DIMS(%(x)s)[1]; t++){
+            int b = t %% PyArray_DIMS(%(x)s)[0];
+            int k = t / PyArray_DIMS(%(x)s)[0];
+            for(int i=0; i < z_r; i++){
               r_st = i * %(st0)s;
               r_end = r_st + %(ds0)s;
               // skip the padding
@@ -1058,8 +1079,7 @@ class DownsampleFactorMaxGradGrad(Op):
             }
           }
         }
-        }
         """ % locals()

     def c_code_cache_version(self):
-        return (0, 1)
+        return (0, 1, self.openmp)
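The OpenMP changes above all rely on the same trick: the `(batch, channel)` double loop is collapsed into a single loop over `t` so a `#pragma omp parallel for` can split the iterations across threads. A pure-Python sketch of that index arithmetic (illustrative only; the real code is the generated C above):

```python
def flatten_pairs(n_batch, n_channel):
    """Recover (b, k) pairs from a single flattened loop index, the way
    the generated C code does: t runs over n_batch * n_channel, and
    b = t % n_batch, k = t // n_batch."""
    pairs = []
    for t in range(n_batch * n_channel):
        b = t % n_batch    # matches: int b = t %% PyArray_DIMS(x)[0];
        k = t // n_batch   # matches: int k = t / PyArray_DIMS(x)[0];
        pairs.append((b, k))
    return pairs
```

The mapping is a bijection, so every `(b, k)` combination is visited exactly once, and independent iterations can safely run in parallel; `r_st`, `r_end`, `c_st`, `c_end`, and the collector are declared `private` in the pragma because each thread needs its own copy.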
@@ -84,14 +84,13 @@ class test_sort(unittest.TestCase):
         data = np.random.rand(2, 3, 4).astype(theano.config.floatX)
         utt.verify_grad(lambda x: sort(x, None), [data])

-    def test_grad_negative_axis(self):
-        # test 2D
+    def test_grad_negative_axis_2d(self):
         data = np.random.rand(2, 3).astype(theano.config.floatX)
         utt.verify_grad(lambda x: sort(x, -1), [data])
         data = np.random.rand(2, 3).astype(theano.config.floatX)
         utt.verify_grad(lambda x: sort(x, -2), [data])

-        # test 3D
+    def test_grad_negative_axis_3d(self):
         data = np.random.rand(2, 3, 4).astype(theano.config.floatX)
         utt.verify_grad(lambda x: sort(x, -1), [data])
         data = np.random.rand(2, 3, 4).astype(theano.config.floatX)
@@ -99,7 +98,7 @@ class test_sort(unittest.TestCase):
         data = np.random.rand(2, 3, 4).astype(theano.config.floatX)
         utt.verify_grad(lambda x: sort(x, -3), [data])

-        # test 4D
+    def test_grad_negative_axis_4d(self):
         data = np.random.rand(2, 3, 4, 2).astype(theano.config.floatX)
         utt.verify_grad(lambda x: sort(x, -1), [data])
         data = np.random.rand(2, 3, 4, 2).astype(theano.config.floatX)
@@ -109,14 +108,13 @@ class test_sort(unittest.TestCase):
         data = np.random.rand(2, 3, 4, 2).astype(theano.config.floatX)
         utt.verify_grad(lambda x: sort(x, -4), [data])

-    def test_grad_nonnegative_axis(self):
-        # test 2D
+    def test_grad_nonnegative_axis_2d(self):
         data = np.random.rand(2, 3).astype(theano.config.floatX)
         utt.verify_grad(lambda x: sort(x, 0), [data])
         data = np.random.rand(2, 3).astype(theano.config.floatX)
         utt.verify_grad(lambda x: sort(x, 1), [data])

-        # test 3D
+    def test_grad_nonnegative_axis_3d(self):
         data = np.random.rand(2, 3, 4).astype(theano.config.floatX)
         utt.verify_grad(lambda x: sort(x, 0), [data])
         data = np.random.rand(2, 3, 4).astype(theano.config.floatX)
@@ -124,7 +122,7 @@ class test_sort(unittest.TestCase):
         data = np.random.rand(2, 3, 4).astype(theano.config.floatX)
         utt.verify_grad(lambda x: sort(x, 2), [data])

-        # test 4D
+    def test_grad_nonnegative_axis_4d(self):
         data = np.random.rand(2, 3, 4, 2).astype(theano.config.floatX)
         utt.verify_grad(lambda x: sort(x, 0), [data])
         data = np.random.rand(2, 3, 4, 2).astype(theano.config.floatX)
...