Merge pull request #5073 from nouiz/Faruk-Ahmed-use_cxx_flag

Removing _op_use_c_code attribute

Merge pull request #5073 from nouiz/Faruk-Ahmed-use_cxx_flag
65af9781 · abergeron · GitHub · 18dd2955 · 0cb3b854 · 65af9781
--- a/doc/extending/extending_theano.txt
+++ b/doc/extending/extending_theano.txt
@@ -99,7 +99,7 @@ possibilities you may encounter or need.  For that refer to
            pass

        # Other implementations (pycuda, ...):
-        def make_thunk(self, node, storage_map, _, _2):
+        def make_thunk(self, node, storage_map, _, _2, impl=None):
            pass

        # optional:
@@ -190,11 +190,12 @@ or :func:`make_thunk`.
      valid, but shouldn't be required anymore for this call.
      The returned function must ensure that it sets the computed
      variables as computed in the `compute_map`.
-
+    - ``impl`` allows selecting between multiple implementations.
+      It should have a default value of None.

  :func:`make_thunk` is useful if you want to generate code and compile
  it yourself. For example, this allows you to use PyCUDA to compile GPU
-  code.
+  code and keep state in the thunk.

  If :func:`make_thunk()` is defined by an op, it will be used by Theano
  to obtain the op's implementation.

--- a/doc/extending/op.txt
+++ b/doc/extending/op.txt
@@ -171,7 +171,7 @@ Optional methods or attributes
  returned, unless it is of length 1, where the single element will be
  returned by itself.

-.. function:: make_thunk(node, storage_map, compute_map, no_recycling)
+.. function:: make_thunk(node, storage_map, compute_map, no_recycling, impl=None)

   This function must return a thunk, that is a zero-arguments
   function that encapsulates the computation to be performed by this
@@ -192,6 +192,8 @@ Optional methods or attributes
     valid, but shouldn't be required anymore for this call.
   :param no_recycling: WRITEME
     WRITEME
+   :param impl: None, 'c' or 'py'
+     Which implementation to use.

   The returned function must ensure that is sets the computed
   variables as computed in the `compute_map`.

--- a/doc/tutorial/gpu_data_convert.txt
+++ b/doc/tutorial/gpu_data_convert.txt
@@ -92,7 +92,7 @@ You can use a GPU function compiled with PyCUDA in a Theano op:
               cuda.basic_ops.as_cuda_ndarray_variable(inp))
            assert inp.dtype == "float32"
            return theano.Apply(self, [inp], [inp.type()])
-        def make_thunk(self, node, storage_map, _, _2):
+        def make_thunk(self, node, storage_map, _, _2, impl=None):
            mod = SourceModule("""
        __global__ void my_fct(float * i0, float * o0, int size) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;

--- a/doc/tutorial/using_gpu.txt
+++ b/doc/tutorial/using_gpu.txt
@@ -586,7 +586,7 @@ Modify and execute to work for a matrix of shape (20, 10).
            assert inp.dtype == "float32"
            return theano.Apply(self, [inp], [inp.type()])

-        def make_thunk(self, node, storage_map, _, _2):
+        def make_thunk(self, node, storage_map, _, _2, impl=None):
            mod = SourceModule("""
        __global__ void my_fct(float * i0, float * o0, int size) {
        int i = blockIdx.x*blockDim.x + threadIdx.x;

--- a/theano/compile/builders.py
+++ b/theano/compile/builders.py
@@ -124,14 +124,11 @@ class OpFromGraph(gof.Op):
                         list(inputs) + self.shared_inputs,
                         [type() for type in self.output_types])

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
-        ret = super(OpFromGraph, self).make_thunk(node, storage_map,
-                                                  compute_map, no_recycling)
-        if not hasattr(self, "fn"):
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        if not hasattr(self, "fn") and impl == 'py':
            self.fn = orig_function(self.new_inputs,
                                    self.new_outputs,
                                    **self.kwargs)
-        return ret

    def perform(self, node, inputs, outputs):
        variables = self.fn(*inputs)

--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -1837,8 +1837,6 @@ class _Linker(gof.link.LocalLinker):
                thunk.inputs = [storage_map[v] for v in node.inputs]
                thunk.outputs = [storage_map[v] for v in node.outputs]
                thunk_other = thunk
-            else:
-                node.op.prepare_node(node, storage_map, compute_map)

            debug = hasattr(node.op, 'debug_perform')

@@ -1852,6 +1850,7 @@ class _Linker(gof.link.LocalLinker):
                if not isinstance(node.op, gof.op.Op):
                    raise utils.MethodNotDefined()

+                node.op.prepare_node(node, storage_map, compute_map, 'c')
                thunk = node.op.make_c_thunk(node, storage_map, compute_map,
                                             no_recycling)
                thunks_c.append(thunk)
@@ -1864,6 +1863,7 @@ class _Linker(gof.link.LocalLinker):
            if (((self.maker.mode.check_py_code or thunks_c[-1] is None) and
                 node.op.perform.__code__ != gof.op.PureOp.perform.__code__) or
                    debug):
+                node.op.prepare_node(node, storage_map, compute_map, 'py')
                thunk = node.op.make_py_thunk(node, storage_map, compute_map,
                                              no_recycling, debug=debug)
                thunks_py.append(thunk)
@@ -1873,6 +1873,7 @@ class _Linker(gof.link.LocalLinker):
            if not self.maker.mode.check_c_code and thunks_py[-1] is None:
                _logger.warn("Op %s doesn't have a perform, "
                             "forcing check of the C code" % node.op)
+                node.op.prepare_node(node, storage_map, compute_map, 'c')
                thunk = node.op.make_c_thunk(node, storage_map, compute_map,
                                             no_recycling)
                thunks_c[-1] = thunk

--- a/theano/d3viz/formatting.py
+++ b/theano/d3viz/formatting.py
@@ -233,6 +233,7 @@ class PyDotFormatter(object):
                gf = PyDotFormatter()
                # Use different node prefix for sub-graphs
                gf.__node_prefix = __node_id
+                node.op.prepare_node(node, None, None, 'py')
                gf(node.op.fn, subgraph)
                graph.add_subgraph(subgraph)
                pd_node.get_attributes()['subg'] = subgraph.get_name()

--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -1584,7 +1584,7 @@ class CLinker(link.Linker):
        else:
            # Set compute_map as None as clinker do not support lazy evaluation
            for node in self.node_order:
-                node.op.prepare_node(node, storage_map, None)
+                node.op.prepare_node(node, storage_map, None, 'c')
            module = get_module_cache().module_from_key(
                key=key, lnk=self, keep_lock=keep_lock)

@@ -1787,24 +1787,14 @@ class OpWiseCLinker(link.LocalLinker):

            thunks = []
            for node in order:
-                # Maker sure we use the C version of the code whenever
-                # possible
-                # There are ops that don't have _op_use_c_code property
-                # for example ifelse (or any ops that come with their own
-                # make_thunk
-                old_value = getattr(node.op, '_op_use_c_code', False)
-                try:
-                    if theano.config.cxx:
-                        node.op._op_use_c_code = True
-                    thunks += [node.op.make_thunk(node,
-                                                  storage_map,
-                                                  compute_map,
-                                                  no_recycling)]
-                    thunks[-1].inputs = [storage_map[v] for v in node.inputs]
-                    thunks[-1].outputs = [storage_map[v] for v in node.outputs]
-
-                finally:
-                    node.op._op_use_c_code = old_value
+                # By default make_thunk tries the C code first; otherwise
+                # it falls back to Python.
+                thunks += [node.op.make_thunk(node,
+                                              storage_map,
+                                              compute_map,
+                                              no_recycling)]
+                thunks[-1].inputs = [storage_map[v] for v in node.inputs]
+                thunks[-1].outputs = [storage_map[v] for v in node.outputs]

            for node in order:
                if self.allow_gc:

--- a/theano/gof/link.py
+++ b/theano/gof/link.py
@@ -823,17 +823,13 @@ class PerformLinker(LocalLinker):
            # the python version
            # Note : ops that implement their own make thunk don't usually
            # have this attribute defiend !!
-            old_value = getattr(node.op, '_op_use_c_code', False)
-            try:
-                node.op._op_use_c_code = False
-                thunks += [node.op.make_thunk(node,
-                                              storage_map,
-                                              compute_map,
-                                              no_recycling)]
-                thunks[-1].inputs = [storage_map[v] for v in node.inputs]
-                thunks[-1].outputs = [storage_map[v] for v in node.outputs]
-            finally:
-                node.op._op_use_c_code = old_value
+            thunks += [node.op.make_thunk(node,
+                                          storage_map,
+                                          compute_map,
+                                          no_recycling,
+                                          'py')]
+            thunks[-1].inputs = [storage_map[v] for v in node.inputs]
+            thunks[-1].outputs = [storage_map[v] for v in node.outputs]

        computed, last_user = gc_helper(order)
        if self.allow_gc:

--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -32,6 +32,8 @@ __contact__ = "theano-dev <theano-dev@googlegroups.com>"

 __docformat__ = "restructuredtext en"

+_logger = logging.getLogger('theano.gof.op.Op')
+

 class CLinkerObject(object):
    """
@@ -779,34 +781,24 @@ class Op(utils.object2, PureOp, CLinkerOp):
    Convenience class to bundle `PureOp` and `CLinkerOp`.

    """
-    def __new__(cls, *args, **kwargs):
-        # this function exists to silently and transparently ensure that all
-        # existing Ops get a _op_use_c_code attribute
-        obj = object.__new__(cls)
-        if not hasattr(obj, '_op_use_c_code'):
-            obj._op_use_c_code = theano.config.cxx
-        return obj
-
-    def __init__(self, use_c_code=theano.config.cxx):
-        self._op_use_c_code = use_c_code
-
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        """
        Make any special modifications that the Op needs before doing
        make_thunk().

        This can modify the node inplace and should return nothing.

+        It can be called multiple times with different impl. It is the
+        op's responsibility not to re-prepare the node when it isn't
+        good to do so.
+
        """
        pass

    def make_c_thunk(self, node, storage_map, compute_map, no_recycling):
-        """
-        Like make_thunk, but will only try to make a C thunk.
+        """Like make_thunk, but will only try to make a C thunk.

        """
-        logger = logging.getLogger('theano.gof.op.Op')
-
        node_input_storage = [storage_map[r] for r in node.inputs]
        node_output_storage = [storage_map[r] for r in node.outputs]

@@ -828,7 +820,7 @@ class Op(utils.object2, PureOp, CLinkerOp):
        cl = theano.gof.cc.CLinker().accept(e,
                                            no_recycling=e_no_recycling)

-        logger.debug('Trying CLinker.make_thunk')
+        _logger.debug('Trying CLinker.make_thunk')
        outputs = cl.make_thunk(input_storage=node_input_storage,
                                output_storage=node_output_storage)
        fill_storage, node_input_filters, node_output_filters = outputs
@@ -883,7 +875,8 @@ class Op(utils.object2, PureOp, CLinkerOp):
        rval.lazy = False
        return rval

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling,
+                   impl=None):
        """
        This function must return a thunk, that is a zero-arguments
        function that encapsulates the computation to be performed
@@ -904,6 +897,9 @@ class Op(utils.object2, PureOp, CLinkerOp):
        no_recycling
            List of variables for which it is forbidden to reuse memory
            allocated by a previous call.
+        impl
+            Currently, None, 'c' or 'py'. If 'c' or 'py' we will only try
+            that version of the code.

        Notes
        -----
@@ -913,27 +909,26 @@ class Op(utils.object2, PureOp, CLinkerOp):
        the thunk can potentially cache return values (like CLinker does),
        then it must not do so for variables in the no_recycling list.

+        self.prepare_node(node, ...) is always called. If we try 'c' and it
+        fails and we then try 'py', prepare_node will be called twice.
        """
-        logger = logging.getLogger('theano.gof.op.Op')

-        self.prepare_node(node, storage_map=storage_map,
-                          compute_map=compute_map)
-
-        if not hasattr(self, '_op_use_c_code'):
-            warnings.warn(
-                "The  __getstate__ method of '%s' is not implemented correctly."
-                " It should keep the attributes added by the base class."
-                " To implement it correctly, it should keep all attributes"
-                " and only remove those it does not want." % (self),
-                stacklevel=2)
-        if getattr(self, '_op_use_c_code', theano.config.cxx):
+        if impl is None or impl == 'c':
+            self.prepare_node(node, storage_map=storage_map,
+                              compute_map=compute_map, impl='c')
            try:
                return self.make_c_thunk(node, storage_map, compute_map,
                                         no_recycling)
            except (NotImplementedError, utils.MethodNotDefined):
-                logger.debug('Falling back on perform')
+                # We requested the c code, so don't catch the error.
+                if impl == 'c':
+                    raise
+                _logger.debug('Falling back on perform')

-        # condition: either there was no c_code, or it failed
+        # condition: either there was no c_code, or it failed or
+        # python code was requested.
+        self.prepare_node(node, storage_map=storage_map,
+                          compute_map=compute_map, impl='py')
        return self.make_py_thunk(node, storage_map, compute_map, no_recycling)

    def make_node(self, *inputs):
@@ -1196,9 +1191,9 @@ int main( int argc, const char* argv[] )
                self.openmp = False
                theano.config.openmp = False

-    def prepare_node(self, node, storage_map,
-                     compute_map):
-        self.update_self_openmp()
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        if impl == 'c':
+            self.update_self_openmp()


 def simple_meth(tag):

--- a/theano/gof/tests/test_lazy.py
+++ b/theano/gof/tests/test_lazy.py
@@ -25,7 +25,7 @@ class IfElseIfElseIf(PureOp):
        assert t3.type == f3.type
        return Apply(self, [c1, t1, c2, t2, c3, t3, f3], [t1.type()])

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl):

        input_computed = [compute_map[v] for v in node.inputs]
        output_computed = [compute_map[v] for v in node.outputs]
@@ -93,7 +93,7 @@ class NotImplementedOp(PureOp):
    def make_node(self, x):
        return Apply(self, [x], [x.type()])

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl):
        def thunk():
            raise self.E()
        thunk.lazy = False

--- a/theano/gof/vm.py
+++ b/theano/gof/vm.py
@@ -1043,12 +1043,14 @@ class VM_Linker(link.LocalLinker):
        t0 = time.time()
        for node in order:
            try:
+                impl = None
                if self.c_thunks is False:
-                    node.op._op_use_c_code = False
+                    impl = 'py'
                thunks.append(node.op.make_thunk(node,
                                                 storage_map,
                                                 compute_map,
-                                                 no_recycling))
+                                                 no_recycling,
+                                                 impl=impl))
                if not hasattr(thunks[-1], 'lazy'):
                    # We don't want all ops maker to think about lazy Ops.
                    # So if they didn't specify that its lazy or not, it isn't.

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -2620,11 +2620,9 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
    def get_params(self, node):
        return node.outputs[0].type.context

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        # cache the kernel object
        self.get_kernel_cache(node)
-        return super(GpuCAReduceCPY, self).make_thunk(
-            node, storage_map, compute_map, no_recycling)

    def get_kernel_cache(self, node):
        attr = '@cache_reduction_k'

--- a/theano/gpuarray/fft.py
+++ b/theano/gpuarray/fft.py
@@ -73,7 +73,7 @@ class CuRFFTOp(Op):

        return theano.Apply(self, [inp, s], [self.output_type(inp)()])

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):

        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
@@ -198,7 +198,7 @@ class CuIRFFTOp(Op):

        return theano.Apply(self, [inp, s], [self.output_type(inp)()])

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):

        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

--- a/theano/ifelse.py
+++ b/theano/ifelse.py
@@ -20,7 +20,7 @@ import numpy
 import theano.tensor
 from theano.tensor import TensorType
 from theano import gof
-from theano.gof import PureOp, Apply
+from theano.gof import Op, Apply

 from six import iteritems
 from six.moves import xrange
@@ -41,7 +41,7 @@ __contact__ = "Razvan Pascanu <r.pascanu@gmail>"
 _logger = logging.getLogger('theano.ifelse')


-class IfElse(PureOp):
+class IfElse(Op):
    """
    Op that provides conditional graph evaluation if used with the CVM/VM
    linkers. Note that there exist a helpful function `ifelse` that should
@@ -235,7 +235,7 @@ class IfElse(PureOp):
                if_true_op(*if_true, **dict(return_list=True)) +
                if_false_op(*if_false, **dict(return_list=True)))

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
        cond = node.inputs[0]
        ts = node.inputs[1:][:self.n_outs]
        fs = node.inputs[1:][self.n_outs:]

--- a/theano/misc/pycuda_example.py
+++ b/theano/misc/pycuda_example.py
@@ -320,7 +320,7 @@ class PycudaElemwiseSourceModuleMakeThunkOp(Op):
        out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
        return out_node

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # TODO support broadcast!
        # TODO assert all input have the same shape
        fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -246,18 +246,14 @@ class GpuOp(theano.gof.Op):

    """

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if use.device_number is None:
            use("gpu",
                force=True,
                default_to_move_computation_to_gpu=False,
                move_shared_float32_to_gpu=False,
                enable_cuda=False)
-        return super(GpuOp, self).make_thunk(node, storage_map,
-                                             compute_map, no_recycling)

-theano.compile.debugmode.default_make_thunk.append(
-    get_unbound_function(GpuOp.make_thunk))

 # We must do those import to be able to create the full doc when
 # nvcc is not available

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -541,10 +541,8 @@ class GpuGemm(GpuOp):
    def __setstate__(self, dct):
        self.__dict__.update(dct)

-        # Correctly reload older pickles where _op_use_c_code and
-        # destroy_map were not saved
-        if '_op_use_c_code' not in self.__dict__:
-            self._op_use_c_code = theano.config.cxx
+        # Correctly reload older pickles where destroy_map was not
+        # saved
        if 'destroy_map' not in self.__dict__ and self.inplace:
            self.destroy_map = {0: [0]}

@@ -661,10 +659,8 @@ class GpuGemv(GpuOp):
    def __setstate__(self, dct):
        self.__dict__.update(dct)

-        # Correctly reload older pickles where _op_use_c_code and
-        # destroy_map were not saved
-        if '_op_use_c_code' not in self.__dict__:
-            self._op_use_c_code = theano.config.cxx
+        # Correctly reload older pickles where destroy_map was not
+        # saved
        if 'destroy_map' not in self.__dict__ and self.inplace:
            self.destroy_map = {0: [0]}

@@ -761,10 +757,8 @@ class GpuGer(GpuOp):
    def __setstate__(self, dct):
        self.__dict__.update(dct)

-        # Correctly reload older pickles where _op_use_c_code and
-        # destroy_map were not saved
-        if '_op_use_c_code' not in self.__dict__:
-            self._op_use_c_code = theano.config.cxx
+        # Correctly reload older pickles where destroy_map was not
+        # saved
        if 'destroy_map' not in self.__dict__ and self.inplace:
            self.destroy_map = {0: [0]}

@@ -2187,7 +2181,9 @@ class GpuConv(GpuOp):
                     images[2] * images[3] * 2)
        return flops

-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        super(GpuConv, self).prepare_node(node, storage_map, compute_map, impl)
+
        if node.op.max_threads_dim0 is None:
            cuda = theano.sandbox.cuda
            device_id = cuda.use.device_number
@@ -2240,8 +2236,8 @@ class GpuConv(GpuOp):
            bmode = 0
        if max_threads_dim0 is None:
            raise NotImplementedError("GpuConv.c_code should not be called "
-                                      "directly. It should be called by "
-                                      "make_thunk() that add some information "
+                                      "directly. It should be called after "
+                                      "prepare_node() that add some information "
                                      "related to the selected GPU.")
        sub.update(locals())
        return """

--- a/theano/sandbox/cuda/cula.py
+++ b/theano/sandbox/cuda/cula.py
@@ -51,10 +51,7 @@ class GpuSolve(GpuOp):
        assert inp2.ndim == 2
        return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])

-    def make_thunk(self,
-                   node,
-                   storage_map, _,
-                   no_recycling=[]):
+    def make_thunk(self, node, storage_map, _, no_recycling, impl=None):

        # Initialize CULA the first time it is needed
        global cula_initialized

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -1567,7 +1567,10 @@ class GpuDnnPool(DnnBase):
        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
        self.mode = mode

-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        super(GpuDnnPool, self).prepare_node(
+            node, storage_map, compute_map, impl)
+
        if len(node.inputs) == 2:
            warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3)
            # Old interface
@@ -1803,7 +1806,7 @@ class GpuDnnPoolGrad(DnnBase):
        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
        self.mode = mode

-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if len(node.inputs) == 4:
            warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3)
            # Old interface

--- a/theano/sandbox/cuda/extra_ops.py
+++ b/theano/sandbox/cuda/extra_ops.py
@@ -49,7 +49,7 @@ class GpuCumsum(CumsumOp, GpuOp):

        return theano.Apply(self, [x], [x.type()])

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
        node_ = copy.copy(node)
        assert node.op is node_.op
        if node_.op.max_threads_dim0 is None or node_.op.max_grid_size1 is None or node_.op.max_grid_size2 is None:
@@ -70,7 +70,7 @@ class GpuCumsum(CumsumOp, GpuOp):
            node_.op.max_grid_size2 = prop['maxGridSize2']

        return super(GpuCumsum, node_.op).make_thunk(node_, storage_map,
-                                                     compute_map, no_recycling)
+                                                     compute_map, no_recycling, impl)

    def __str__(self):
        return "%s{%s}" % (self.__class__.__name__, self.axis)

--- a/theano/sandbox/cuda/fftconv.py
+++ b/theano/sandbox/cuda/fftconv.py
@@ -48,7 +48,7 @@ class ScikitsCudaOp(GpuOp):

        return theano.Apply(self, [inp], [self.output_type(inp)()])

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        if not scikits_cuda_available:
            raise RuntimeError(
                "scikits.cuda is needed for all GPU fft implementation,"
@@ -61,7 +61,7 @@ class CuFFTOp(ScikitsCudaOp):
        return CudaNdarrayType(
            broadcastable=[False] * (inp.type.ndim + 1))

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        super(CuFFTOp, self).make_thunk(node, storage_map, _, _2)

        from theano.misc.pycuda_utils import to_gpuarray
@@ -118,7 +118,7 @@ class CuIFFTOp(ScikitsCudaOp):
        return CudaNdarrayType(
            broadcastable=[False] * (inp.type.ndim - 1))

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        super(CuIFFTOp, self).make_thunk(node, storage_map, _, _2)

        from theano.misc.pycuda_utils import to_gpuarray
@@ -314,7 +314,7 @@ class BatchedComplexDotOp(ScikitsCudaOp):
    def output_type(self, inp):
        return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        super(BatchedComplexDotOp, self).make_thunk(node, storage_map, _, _2)

        inputs = [storage_map[v] for v in node.inputs]

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -3064,7 +3064,7 @@ arctan = ArcTan(upgrade_to_float, name='arctan')


 class ArcTan2(BinaryScalarOp):
-    nfunc_spec = ('arctan2', 1, 1)
+    nfunc_spec = ('arctan2', 2, 1)

    def impl(self, y, x):
        # If x and y are int8 or uint8, numpy.arctan2 will compute the result
@@ -3663,11 +3663,15 @@ class Composite(ScalarOp):
        # Postpone the creation in case it isn't needed.
        #  self.init_name()      # self.name
        self.name = None
-
-    def prepare_node(self, node, storage_map, compute_map):
-        self.init_py_impls()  # self._impls
-        for n in theano.gof.graph.list_of_nodes(self.inputs, self.outputs):
-            n.op.prepare_node(n, None, None)
+        self.prepare_node_called = set()
+
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        if impl == 'py':
+            self.init_py_impls()  # self._impls
+        if impl not in self.prepare_node_called:
+            for n in theano.gof.graph.list_of_nodes(self.inputs, self.outputs):
+                n.op.prepare_node(n, None, None, impl)
+            self.prepare_node_called.add(impl)

    def output_types(self, input_types):
        if tuple(input_types) != self.inputs_type:

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -125,7 +125,7 @@ class Scan(PureOp):
                 outputs,
                 info,
                 typeConstructor=None,
-                ):
+                 ):
        if 'gpua' not in info:
            info['gpua'] = False
        # adding properties into self
@@ -346,8 +346,8 @@ class Scan(PureOp):
                       len(self.inner_shared(self.inputs)) +
                       len(self.inner_non_seqs(self.inputs)))
        assert n_outer_ins == n_inner_ins, \
-                ("The number of inputs given to the inner function of scan"
-                 " does not match the number of inputs given to scan.")
+            ("The number of inputs given to the inner function of scan"
+             " does not match the number of inputs given to scan.")
        new_inputs = [inputs[0]]
        # assert dtype is consistent
        err_msg1 = ('When compiling the inner function of scan (the '
@@ -372,7 +372,7 @@ class Scan(PureOp):
                    'have the same dimensionality, you can increase the '
                    'dimensionality of the varialbe in the initial state of scan '
                    'by using dimshuffle or shape_padleft. '
-                   )
+                    )
        err_msg2 = ('When compiling the inner function of scan the '
                    'following error has been encountered: The '
                    'initial state (`outputs_info` in scan nomenclature) '
@@ -399,7 +399,7 @@ class Scan(PureOp):
                    'have the same dimensionality, you can increase the '
                    'dimensionality of the variable in the initial state of scan '
                    'by using dimshuffle or shape_padleft. '
-                   )
+                    )

        def format(var, as_var):
            """
@@ -440,9 +440,9 @@ class Scan(PureOp):
        inner_mitmot = self.inner_mitmot(self.inputs)
        inner_mitmot_outs = self.inner_mitmot_outs(self.outputs)
        for idx, (itaps, otaps, _outer_mitmot) in enumerate(
-                                     zip(self.mitmot_taps(),
-                                         self.mitmot_out_taps(),
-                                         self.outer_mitmot(inputs))):
+                zip(self.mitmot_taps(),
+                    self.mitmot_out_taps(),
+                    self.outer_mitmot(inputs))):
            outer_mitmot = format(_outer_mitmot, as_var=inner_mitmot[ipos])
            new_inputs.append(outer_mitmot)
            for k in xrange(len(itaps)):
@@ -450,15 +450,15 @@ class Scan(PureOp):
                    outer_mitmot.type.dtype or
                    inner_mitmot[ipos + k].ndim != outer_mitmot.ndim - 1):
                    raise ValueError(err_msg1 % ('initial state (outputs_info'
-                                           ' in scan nomenclature) ',
-                                           str(outer_mitmot),
-                                           argoffset + idx,
-                                           outer_mitmot.type.dtype,
-                                           outer_mitmot.type.ndim,
-                                           str(inner_mitmot[ipos + k]),
-                                           inner_mitmot[ipos +
-                                                        k].type.dtype,
-                                           inner_mitmot[ipos + k].type.ndim))
+                                                 ' in scan nomenclature) ',
+                                                 str(outer_mitmot),
+                                                 argoffset + idx,
+                                                 outer_mitmot.type.dtype,
+                                                 outer_mitmot.type.ndim,
+                                                 str(inner_mitmot[ipos + k]),
+                                                 inner_mitmot[ipos +
+                                                              k].type.dtype,
+                                                 inner_mitmot[ipos + k].type.ndim))
            ipos += len(itaps)
            for k in xrange(len(otaps)):
                if (inner_mitmot_outs[opos + k].type.dtype !=
@@ -491,14 +491,14 @@ class Scan(PureOp):
                    outer_mitsot.type.dtype or
                    inner_mitsots[ipos + k].ndim != outer_mitsot.ndim - 1):
                    raise ValueError(err_msg1 % ('initial state (outputs_info'
-                                               ' in scan nomenclature) ',
-                                           str(outer_mitsot),
-                                           argoffset + idx,
-                                           outer_mitsot.type.dtype,
-                                           outer_mitsot.type.ndim,
-                                           str(inner_mitsots[ipos + k]),
-                                           inner_mitsots[ipos + k].type.dtype,
-                                           inner_mitsots[ipos + k].type.ndim))
+                                                 ' in scan nomenclature) ',
+                                                 str(outer_mitsot),
+                                                 argoffset + idx,
+                                                 outer_mitsot.type.dtype,
+                                                 outer_mitsot.type.ndim,
+                                                 str(inner_mitsots[ipos + k]),
+                                                 inner_mitsots[ipos + k].type.dtype,
+                                                 inner_mitsots[ipos + k].type.ndim))
            ipos += len(itaps)
            if inner_mitsot_out.type.dtype != outer_mitsot.type.dtype:
                raise ValueError(err_msg2 %
@@ -523,14 +523,14 @@ class Scan(PureOp):
            new_inputs.append(outer_sitsot)
            if (inner_sitsot.ndim != outer_sitsot.ndim - 1):
                raise ValueError(err_msg1 % ('initial state (outputs_info'
-                                           ' in scan nomenclature) ',
-                                str(outer_sitsot),
-                                argoffset + idx,
-                                outer_sitsot.type.dtype,
-                                outer_sitsot.type.ndim,
-                                str(inner_sitsot),
-                                inner_sitsot.type.dtype,
-                                inner_sitsot.type.ndim))
+                                             ' in scan nomenclature) ',
+                                             str(outer_sitsot),
+                                             argoffset + idx,
+                                             outer_sitsot.type.dtype,
+                                             outer_sitsot.type.ndim,
+                                             str(inner_sitsot),
+                                             inner_sitsot.type.dtype,
+                                             inner_sitsot.type.ndim))
            if inner_sitsot_out.type.dtype != outer_sitsot.type.dtype:
                raise ValueError(err_msg2 %
                                 (str(outer_sitsot),
@@ -570,14 +570,14 @@ class Scan(PureOp):
                (outer_shared.dtype != inner_shared.dtype or
                 outer_shared.ndim != inner_shared.ndim)):
                raise ValueError(err_msg1 % ('initial state (outputs_info'
-                                           ' in scan nomenclature) ',
-                                           str(outer_shared),
-                                           argoffset + idx,
-                                           outer_shared.dtype,
-                                           outer_shared.ndim,
-                                           str(inner_shared),
-                                           inner_shared.dtype,
-                                           inner_shared.ndim))
+                                             ' in scan nomenclature) ',
+                                             str(outer_shared),
+                                             argoffset + idx,
+                                             outer_shared.dtype,
+                                             outer_shared.ndim,
+                                             str(inner_shared),
+                                             inner_shared.dtype,
+                                             inner_shared.ndim))
        # We do not need to call `format` on outer_nisot arguments.
        # outer_nitsot stands for no input tap single output tap. This means
        # these are states that do not feed anything back in the recurrent
@@ -595,7 +595,7 @@ class Scan(PureOp):
            if inner_nonseq.type != outer_nonseq.type:
                raise ValueError(('Argument %s given to scan node does not'
                                 ' match its correspondance %s') %
-                                  (str(outer_nonseq), str(inner_nonseq)))
+                                 (str(outer_nonseq), str(inner_nonseq)))

        for outer_nitsot in self.outer_nitsot(inputs):
            # For every nit_sot input we get as input a int/uint that
@@ -697,7 +697,8 @@ class Scan(PureOp):
                     self._hash_inner_graph,
                     scan_utils.hash_listsDictsTuples(self.info)))

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling,
+                   impl=None):
        """

        Parameters
@@ -715,7 +716,8 @@ class Scan(PureOp):
        no_recycling
            List of variables for which it is forbidden to reuse memory
            allocated by a previous call.
-
+        impl
+            Use 'py' to force the Python implementation.
        Notes
        -----
        If the thunk consults the storage_map on every call, it is safe
@@ -786,7 +788,7 @@ class Scan(PureOp):
                        # Wrap the corresponding input as usual. Leave the
                        # output as-is.
                        wrapped_inputs.append(In(self.inputs[input_idx],
-                                                    borrow=False))
+                                                 borrow=False))
                    input_idx += 1

            # Wrap the inputs not associated to mitmots and wrap the remaining
@@ -839,7 +841,7 @@ class Scan(PureOp):
        profile = None
        if (theano.config.profile or
            (isinstance(self.profile, (string_types, bool, integer_types))
-                                      and self.profile)):
+                and self.profile)):
            if isinstance(self.profile, string_types):
                profile = ScanProfileStats(name=self.profile)
            else:
@@ -864,6 +866,8 @@ class Scan(PureOp):
                               for out in self.fn.maker.fgraph.outputs]

        try:
+            if impl == 'py':
+                raise theano.gof.cmodule.MissingGXX
            cython_mintaps = numpy.asarray(self.mintaps, dtype='int32')
            cython_tap_array_len = \
                numpy.asarray([len(x) for x in self.tap_array],
@@ -886,16 +890,16 @@ class Scan(PureOp):
                d1 = numpy.max(cython_mit_mot_out_nslices)
            d0 = len(self.mit_mot_out_slices)
            cython_mit_mot_out_slices = numpy.zeros((d0, d1),
-                                                      dtype='int32')
+                                                    dtype='int32')
            for _d0 in xrange(d0):
                for _d1 in xrange(cython_mit_mot_out_nslices[_d0]):
                    cython_mit_mot_out_slices[_d0, _d1] = \
                        self.mit_mot_out_slices[_d0][_d1]

            cython_vector_seqs = numpy.asarray(self.vector_seqs,
-                                                    dtype='int32')
+                                               dtype='int32')
            cython_vector_outs = numpy.asarray(self.vector_outs,
-                                                    dtype='int32')
+                                               dtype='int32')
            cython_mitmots_preallocated = numpy.asarray(self.mitmots_preallocated,
                                                        dtype='int32')

@@ -906,39 +910,38 @@ class Scan(PureOp):

            if hasattr(self, 'destroy_map'):
                cython_destroy_map = [x in self.destroy_map
-                                  for x in xrange(len(node.outputs))]
+                                      for x in xrange(len(node.outputs))]
            else:
                cython_destroy_map = [0 for x in xrange(len(node.outputs))]
            cython_destroy_map = numpy.asarray(cython_destroy_map,
                                               dtype='int32')
            from . import scan_perform_ext
            p = lambda node, args, outs:\
-                    scan_perform_ext.perform(
-                        self.n_shared_outs,
-                        self.n_mit_mot_outs,
-                        self.n_seqs,
-                        self.n_mit_mot,
-                        self.n_mit_sot,
-                        self.n_sit_sot,
-                        self.n_nit_sot,
-                        args[0],
-                        self.as_while,
-                        cython_mintaps,
-                        cython_tap_array,
-                        cython_tap_array_len,
-                        cython_vector_seqs,
-                        cython_vector_outs,
-                        cython_mit_mot_out_slices,
-                        cython_mit_mot_out_nslices,
-                        cython_mitmots_preallocated,
-                        cython_inps_is_tensor,
-                        cython_outs_is_tensor,
-                        self.fn.fn,
-                        self.fn,
-                        cython_destroy_map,
-                        args,
-                        outs,
-                        self, node)
+                scan_perform_ext.perform(self.n_shared_outs,
+                                         self.n_mit_mot_outs,
+                                         self.n_seqs,
+                                         self.n_mit_mot,
+                                         self.n_mit_sot,
+                                         self.n_sit_sot,
+                                         self.n_nit_sot,
+                                         args[0],
+                                         self.as_while,
+                                         cython_mintaps,
+                                         cython_tap_array,
+                                         cython_tap_array_len,
+                                         cython_vector_seqs,
+                                         cython_vector_outs,
+                                         cython_mit_mot_out_slices,
+                                         cython_mit_mot_out_nslices,
+                                         cython_mitmots_preallocated,
+                                         cython_inps_is_tensor,
+                                         cython_outs_is_tensor,
+                                         self.fn.fn,
+                                         self.fn,
+                                         cython_destroy_map,
+                                         args,
+                                         outs,
+                                         self, node)
        except (ImportError, theano.gof.cmodule.MissingGXX):
            p = self.execute
        # default arguments are stored in the closure of `rval`
@@ -1000,8 +1003,8 @@ class Scan(PureOp):
    def inner_mitsot(self, list_inputs):
        n_mitmot_taps = sum(len(x) for x in self.tap_array[:self.n_mit_mot])
        ntaps_upto_sit_sot = sum(len(x) for x in
-                                  self.tap_array[:(self.n_mit_mot +
-                                                   self.n_mit_sot)])
+                                 self.tap_array[:(self.n_mit_mot +
+                                                  self.n_mit_sot)])
        return list_inputs[self.n_seqs + n_mitmot_taps:
                           self.n_seqs + ntaps_upto_sit_sot]

@@ -1090,7 +1093,7 @@ class Scan(PureOp):
        if isinstance(list_outputs, Apply):
            list_outputs = list_outputs.outputs
        offset = (self.n_mit_mot + self.n_mit_sot + self.n_sit_sot +
-                    self.n_nit_sot)
+                  self.n_nit_sot)
        return list_outputs[offset:offset + self.n_shared_outs]

    def inner_non_seqs(self, list_inputs):
@@ -1149,10 +1152,10 @@ class Scan(PureOp):
            for idx, seq in enumerate(args[1:self.seqs_arg_offset]):
                if seq.shape[0] < n_steps:
                    raise ValueError(('Sequence is shorter then the required '
-                                     'number of steps : (n_steps, seq, '
+                                      'number of steps : (n_steps, seq, '
                                      'seq.shape):'), n_steps,
-                                      node.inputs[1 + idx],
-                                      seq.shape)
+                                     node.inputs[1 + idx],
+                                     seq.shape)
                seqs.append(seq)

        # 2. Allocate memory for the outputs. Construct the list:
@@ -1161,15 +1164,15 @@ class Scan(PureOp):
        #                       output

        store_steps = [arg.shape[0] for arg
-                               in args[self.seqs_arg_offset:
-                                       self.shared_arg_offset]]
+                       in args[self.seqs_arg_offset:
+                               self.shared_arg_offset]]
        store_steps += [arg for arg in
-                            args[self.nit_sot_arg_offset:
-                                   self.nit_sot_arg_offset + self.n_nit_sot]
-                       ]
+                        args[self.nit_sot_arg_offset:
+                             self.nit_sot_arg_offset + self.n_nit_sot]
+                        ]

        pos = [(-self.mintaps[idx]) % store_steps[idx] for idx
-                         in xrange(self.n_outs + self.n_nit_sot)]
+               in xrange(self.n_outs + self.n_nit_sot)]
        if not getattr(self, 'destroy_map', None):
            self.destroy_map = OrderedDict()
        # 2.1 Create storage space for outputs
@@ -1203,7 +1206,7 @@ class Scan(PureOp):
        old_output_data = [None] * len(output_storage)
        fn = self.fn.fn
        offset = (self.n_seqs + sum(map(len, self.tap_array[:self.n_outs])) +
-                    self.n_shared_outs)
+                  self.n_shared_outs)
        for idx in xrange(len(other_args)):
            input_storage[idx + offset].storage[0] = other_args[idx]

@@ -1217,7 +1220,7 @@ class Scan(PureOp):
            for idx in xrange(self.n_seqs):
                if self.vector_seqs[idx]:
                    input_storage[idx].storage[0] = \
-                            seqs[idx][i:i + 1].reshape(())
+                        seqs[idx][i:i + 1].reshape(())
                else:
                    input_storage[idx].storage[0] = seqs[idx][i]

@@ -1227,7 +1230,7 @@ class Scan(PureOp):
                    for tap in self.tap_array[idx]:
                        _idx = (pos[idx] + tap) % store_steps[idx]
                        input_storage[offset].storage[0] =\
-                                outs[idx][0][_idx:_idx + 1].reshape(())
+                            outs[idx][0][_idx:_idx + 1].reshape(())
                        offset += 1
                else:
                    for tap in self.tap_array[idx]:
@@ -1396,7 +1399,7 @@ class Scan(PureOp):
                        # This output tap has not been preallocated, recover
                        # its value as usual
                        outs[j][0][k + pos[j]] = \
-                                output_storage[offset_out].storage[0]
+                            output_storage[offset_out].storage[0]
                        offset_out += 1

                    mitmot_out_idx += 1
@@ -1413,7 +1416,7 @@ class Scan(PureOp):
                # Copy the output value to `outs`, if necessary
                if store_steps[j] == 1 or self.vector_outs[j]:
                    outs[j][0][pos[j]] = \
-                            output_storage[offset_out + j].storage[0]
+                        output_storage[offset_out + j].storage[0]
                else:
                    # Check whether the initialization of the output storage
                    # map for this output has been reused.
@@ -1442,7 +1445,7 @@ class Scan(PureOp):
                if i == 0:
                    jout = j + offset_out
                    shape = (store_steps[j],) + \
-                            output_storage[jout].storage[0].shape
+                        output_storage[jout].storage[0].shape
                    if len(output_storage[jout].storage[0].shape) == 0:
                        self.vector_outs[j] = True
                    dtype = output_storage[jout].storage[0].dtype
@@ -1486,7 +1489,7 @@ class Scan(PureOp):
                outs[j][0] = output_storage[jout].storage[0]

            pos = [(idx + 1) % store for idx, store in
-                               izip(pos, store_steps)]
+                   izip(pos, store_steps)]
            i = i + 1

        # 6. Check if you need to re-order output buffers
@@ -1642,17 +1645,15 @@ class Scan(PureOp):
            self_outs = self.outputs[:-1]
        else:
            self_outs = self.outputs
-        outs_shape = scan_utils.infer_shape(
-                outs=self_outs,
-                inputs=self.inputs,
-                input_shapes=inner_ins_shapes)
+        outs_shape = scan_utils.infer_shape(outs=self_outs,
+                                            inputs=self.inputs,
+                                            input_shapes=inner_ins_shapes)
        # Will be used to check if outs_shape can be expressed without using
        # variables in self.inputs.
        # The shapes of node.inputs are valid.
-        validator = scan_utils.Validator(
-                valid=input_shapes,
-                invalid=self.inputs,
-                valid_equivalent=out_equivalent)
+        validator = scan_utils.Validator(valid=input_shapes,
+                                         invalid=self.inputs,
+                                         valid_equivalent=out_equivalent)

        offset = 1 + self.n_seqs
        scan_outs = [x for x in input_shapes[offset:offset + n_outs]]
@@ -1687,7 +1688,7 @@ class Scan(PureOp):
                scan_outs.append(tuple(shp))

        scan_outs += [x for x in
-                     input_shapes[offset:offset + self.n_shared_outs]]
+                      input_shapes[offset:offset + self.n_shared_outs]]
        # if we are dealing with a repeat-until, then we do not know the
        # leading dimension so we replace it for every entry with Shape_i
        if self.as_while:
@@ -1751,7 +1752,7 @@ class Scan(PureOp):
                    j_inp_idx = self.var_mappings["outer_inp_from_outer_out"][jidx]

                    if j_inp_idx != -1:
-                       if connection_pattern[j_inp_idx][iidx] == True:
+                        if connection_pattern[j_inp_idx][iidx] == True:
                            for k in xrange(len(connection_pattern)):
                                if connection_pattern[k][jidx]:
                                    connection_pattern[k][iidx] = True
@@ -1875,18 +1876,18 @@ class Scan(PureOp):

        # With the global mapping inferred, the individual mappings
        # can be produced
-        mappings = {"outer_inp_from_outer_out" : {},
-                    "inner_inp_from_outer_out" : {},
-                    "inner_out_from_outer_out" : {},
-                    "inner_inp_from_outer_inp" : {},
-                    "inner_out_from_outer_inp" : {},
-                    "outer_out_from_outer_inp" : {},
-                    "outer_inp_from_inner_inp" : {},
-                    "inner_out_from_inner_inp" : {},
-                    "outer_out_from_inner_inp" : {},
-                    "outer_inp_from_inner_out" : {},
-                    "inner_inp_from_inner_out" : {},
-                    "outer_out_from_inner_out" : {}}
+        mappings = {"outer_inp_from_outer_out": {},
+                    "inner_inp_from_outer_out": {},
+                    "inner_out_from_outer_out": {},
+                    "inner_inp_from_outer_inp": {},
+                    "inner_out_from_outer_inp": {},
+                    "outer_out_from_outer_inp": {},
+                    "outer_inp_from_inner_inp": {},
+                    "inner_out_from_inner_inp": {},
+                    "outer_out_from_inner_inp": {},
+                    "outer_inp_from_inner_out": {},
+                    "inner_inp_from_inner_out": {},
+                    "outer_out_from_inner_out": {}}

        for (oinp, iinp, iout, oout) in izip(outer_input_indices,
                                             inner_input_indices,
@@ -1932,7 +1933,7 @@ class Scan(PureOp):
            grad_steps = self.outer_sitsot_outs(outs)[0].shape[0] - 1
        elif self.n_mit_sot > 0:
            grad_steps = self.outer_mitsot_outs(outs)[0].shape[0] +\
-                    self.mintaps[self.n_mit_mot]
+                self.mintaps[self.n_mit_mot]
        else:
            grad_steps = inputs[0]

@@ -2019,14 +2020,13 @@ class Scan(PureOp):
            # to X.
            known_grads = OrderedDict([(k.copy(), v) for (k, v) in known_grads.items()])

-            grads = gradient.grad(
-                        cost=None,
-                        known_grads=known_grads,
-                        wrt=wrt,
-                        consider_constant=wrt,
-                        disconnected_inputs='ignore',
-                        return_disconnected='None',
-                        null_gradients='return')
+            grads = gradient.grad(cost=None,
+                                  known_grads=known_grads,
+                                  wrt=wrt,
+                                  consider_constant=wrt,
+                                  disconnected_inputs='ignore',
+                                  return_disconnected='None',
+                                  null_gradients='return')

            for i in range(len(wrt)):
                gmp[wrt[i]] = grads[i]
@@ -2086,7 +2086,6 @@ class Scan(PureOp):
                dC_dXt = safe_new(dC_douts[idx][0])
            dC_dXts.append(dC_dXt)

-
        known_grads = OrderedDict()
        dc_dxts_idx = 0
        for i in range(len(diff_outputs)):
@@ -2141,7 +2140,7 @@ class Scan(PureOp):
                dC_dXtm1s.append(safe_new(dC_dXts[opos]))
                if hasattr(x, 'dtype') and x.dtype != dC_dXts[opos].dtype:
                    dC_dinps_t[pos + self.n_seqs] = \
-                            x.astype(dC_dXts[opos].dtype)
+                        x.astype(dC_dXts[opos].dtype)
            else:
                dC_dXtm1s.append(safe_new(x))

@@ -2168,7 +2167,7 @@ class Scan(PureOp):
            seq = outs[idx]
            for k in self.tap_array[idx]:
                if outmaxtap - k != 0:
-                    nw_seq = seq[k - mintap: -(outmaxtap-k)][::-1]
+                    nw_seq = seq[k - mintap: -(outmaxtap - k)][::-1]
                else:
                    nw_seq = seq[k - mintap:][::-1]
                outer_inp_seqs.append(nw_seq)
@@ -2276,7 +2275,6 @@ class Scan(PureOp):
                        new_inner_out_mitmot = theano.clone(new_inner_out_mitmot,
                                                            replace=[(to_replace, replacement)])

-
                    inner_out_mitmot.append(new_inner_out_mitmot)

                if not disconnected_dC_dinps_t[ins_pos]:
@@ -2541,8 +2539,7 @@ class Scan(PureOp):
                gradients.append(NullType(t)())

        end = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
-        for p, (x, t) in enumerate(
-            zip(outputs[:end], type_outs[:end])):
+        for p, (x, t) in enumerate(zip(outputs[:end], type_outs[:end])):
            if t == 'connected':
                gradients.append(x[::-1])
            elif t == 'disconnected':
@@ -2575,12 +2572,11 @@ class Scan(PureOp):

        start = len(gradients)
        gradients += [DisconnectedType()()
-                for x in xrange(self.n_nit_sot)]
+                      for x in xrange(self.n_nit_sot)]
        begin = end

        end = begin + n_sitsot_outs
-        for p, (x, t) in enumerate(
-            zip(outputs[begin:end], type_outs[begin:end])):
+        for p, (x, t) in enumerate(zip(outputs[begin:end], type_outs[begin:end])):
            if t == 'connected':
                gradients.append(x[-1])
            elif t == 'disconnected':
@@ -2617,7 +2613,7 @@ class Scan(PureOp):
                                            self.outputs, '_rop')
        self_inputs = rval[0]
        rop_of_inputs = rval[0][:self.n_seqs + self.n_outs] + \
-                rval[0][self.n_seqs + self.n_outs + self.n_shared_outs:]
+            rval[0][self.n_seqs + self.n_outs + self.n_shared_outs:]
        self_outputs = rval[1]
        # Step 1. Compute the R_op of the inner function
        inner_eval_points = [scan_utils.safe_new(x, '_evalpoint')
@@ -2628,8 +2624,7 @@ class Scan(PureOp):
            rop_self_outputs = self_outputs
        if self.info['n_shared_outs'] > 0:
            rop_self_outputs = rop_self_outputs[:-self.info['n_shared_outs']]
-        rop_outs = tensor.Rop(rop_self_outputs, rop_of_inputs,
-             inner_eval_points)
+        rop_outs = tensor.Rop(rop_self_outputs, rop_of_inputs, inner_eval_points)
        if type(rop_outs) not in (list, tuple):
            rop_outs = [rop_outs]
        # Step 2. Figure out what corresponds to what in the scan
@@ -2709,8 +2704,8 @@ class Scan(PureOp):
        e = e + self.n_mit_sot
        ib = ie
        ie = ie + int(numpy.sum([len(x) for x in
-                         self.tap_array[self.n_mit_mot:\
-                                        self.n_mit_mot + self.n_mit_sot]]))
+                                 self.tap_array[self.n_mit_mot: \
+                                                self.n_mit_mot + self.n_mit_sot]]))
        clean_eval_points = []
        for inp, evp in zip(inputs[b:e], eval_points[b:e]):
            if evp is not None:

--- a/theano/sparse/basic.py
+++ b/theano/sparse/basic.py
@@ -1015,7 +1015,7 @@ class GetItemList(gof.op.Op):
    def grad(self, inputs, g_outputs):
        x, indices = inputs
        gout, = g_outputs
-        return [GetItemListGrad(self)(x, indices, gout),
+        return [get_item_list_grad(x, indices, gout),
                grad_undefined(self, 1, indices, "No gradient for this input")]

 get_item_list = GetItemList()
@@ -1110,7 +1110,7 @@ class GetItem2Lists(gof.op.Op):
    def grad(self, inputs, g_outputs):
        x, ind1, ind2 = inputs
        gout, = g_outputs
-        return [GetItem2ListsGrad(self)(x, ind1, ind2, gout),
+        return [get_item_2lists_grad(x, ind1, ind2, gout),
                grad_undefined(self, 1, ind1, "No gradient for this input"),
                grad_undefined(self, 1, ind2, "No gradient for this input")]


--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -297,9 +297,6 @@ class Ger(Op):
    This interface to GER allows non-destructive operation on A via the
    `destructive` argument to the constructor.

-    :TODO: Create better classes ScipyGer and CGer that inherit from this class
-    and override the make_thunk() method to use Scipy and C respectively.
-
    """

    __props__ = ("destructive",)
@@ -837,10 +834,8 @@ class Gemm(GemmRelated):
        else:
            self.setup_z_Nz_Sz = self.setup_z_Nz_Sz_outplace

-        # Correctly reload older pickles where _op_use_c_code and
-        # destroy_map were not saved
-        if '_op_use_c_code' not in self.__dict__:
-            self._op_use_c_code = theano.config.cxx
+        # Correctly reload older pickles where destroy_map was not
+        # saved
        if 'destroy_map' not in self.__dict__ and self.inplace:
            self.destroy_map = {0: [0]}


--- a/theano/tensor/blas_scipy.py
+++ b/theano/tensor/blas_scipy.py
@@ -22,46 +22,34 @@ if have_fblas:

 class ScipyGer(Ger):

-    # keep everything else, but override the make_thunk
-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
-
-        node_input_storage = [storage_map[r] for r in node.inputs]
-        node_output_storage = [storage_map[r] for r in node.outputs]
-        node_output_compute = [compute_map[r] for r in node.outputs]
-
-        # get vars for containers
-        cA, calpha, cx, cy = node_input_storage
-        cZ, = node_output_storage
-        local_ger = _blas_ger_fns[numpy.dtype(node.inputs[0].type.dtype)]
-
-        def rval():
-            # N.B. some versions of scipy (e.g. mine) don't actually work
-            # in-place on a, even when I tell it to.
-            A = cA[0]
-            if A.size == 0:
-                # We don't have to compute anything, A is empty.
-                # We need this special case because Numpy considers it
-                # C-contiguous, wich is confusing.
-                if not self.destructive:
-                    # Sometimes numpy thinks empty matrices can share memory,
-                    # so here to stop DebugMode from complaining.
-                    A = A.copy()
-            elif A.flags['C_CONTIGUOUS']:
-                A = local_ger(calpha[0], cy[0], cx[0], a=A.T,
-                              overwrite_a=int(self.destructive)).T
-            else:
-                A = local_ger(calpha[0], cx[0], cy[0], a=A,
-                              overwrite_a=int(self.destructive))
-            cZ[0] = A
-            for o in node_output_compute:
-                o[0] = True
-
-        # TODO: If this is currently an unofficial part of the thunk API,
-        #      then maybe it should be documented and made official?
-        rval.inputs = node_input_storage
-        rval.outputs = node_output_storage
-        rval.lazy = False
-        return rval
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        if impl == 'py':
+            node.tag.local_ger = _blas_ger_fns[numpy.dtype(
+                node.inputs[0].type.dtype)]
+
+    def perform(self, node, inputs, output_storage):
+        cA, calpha, cx, cy = inputs
+        cZ, = output_storage
+        # N.B. some versions of scipy (e.g. mine) don't actually work
+        # in-place on a, even when I tell it to.
+        A = cA
+        local_ger = node.tag.local_ger
+        if A.size == 0:
+            # We don't have to compute anything, A is empty.
+            # We need this special case because Numpy considers it
+            # C-contiguous, which is confusing.
+            if not self.destructive:
+                # Sometimes numpy thinks empty matrices can share memory,
+                # so here to stop DebugMode from complaining.
+                A = A.copy()
+        elif A.flags['C_CONTIGUOUS']:
+            A = local_ger(calpha, cy, cx, a=A.T,
+                          overwrite_a=int(self.destructive)).T
+        else:
+            A = local_ger(calpha, cx, cy, a=A,
+                          overwrite_a=int(self.destructive))
+        cZ[0] = A
+

 scipy_ger_no_inplace = ScipyGer(False)
 scipy_ger_inplace = ScipyGer(True)

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -787,14 +787,15 @@ second dimension

        return ret

-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        # Postpone the ufunc building to the last minutes
        # NumPy ufunc support only up to 31 inputs.
        # But our c code support more.
        if (len(node.inputs) < 32 and
                (self.nfunc is None or
                 self.scalar_op.nin != len(node.inputs)) and
-                self.ufunc is None):
+                self.ufunc is None and
+                impl == 'py'):

            ufunc = numpy.frompyfunc(self.scalar_op.impl,
                                     len(node.inputs),
@@ -830,7 +831,7 @@ second dimension
            [get_scalar_type(dtype=output.type.dtype).make_variable()
             for output in node.outputs])

-        self.scalar_op.prepare_node(node.tag.fake_node, None, None)
+        self.scalar_op.prepare_node(node.tag.fake_node, None, None, impl)

    def perform(self, node, inputs, output_storage):
        if len(node.inputs) >= 32:
@@ -890,14 +891,18 @@ second dimension
            # numpy the first (faster) version leads to segfaults
            if self.ufunc:
                ufunc = self.ufunc
+            elif not hasattr(node.tag, 'ufunc'):
+                # It happens that make_thunk isn't called, like in
+                # get_scalar_constant_value
+                self.prepare_node(node, None, None, 'py')
+                # prepare_node will add ufunc to self or the tag
+                # depending on whether we can reuse it or not. So we need to
+                # test both again.
+                if self.ufunc:
+                    ufunc = self.ufunc
+                else:
+                    ufunc = node.tag.ufunc
            else:
-                if not hasattr(node.tag, 'ufunc'):
-                    # It happen that make_thunk isn't called, like in
-                    # get_scalar_constant_value
-                    node.tag.ufunc = numpy.frompyfunc(self.scalar_op.impl,
-                                                      len(node.inputs),
-                                                      self.scalar_op.nout)
-
                ufunc = node.tag.ufunc

            nout = ufunc.nout
@@ -977,7 +982,7 @@ second dimension
        # To not request all of them to call prepare_node(), do it here.
        # There is no harm if it get called multile time.
        if not hasattr(node.tag, 'fake_node'):
-            self.prepare_node(node, None, None)
+            self.prepare_node(node, None, None, 'c')
        _inames = inames
        _onames = onames


--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -6299,20 +6299,12 @@ def constant_folding(node):
    for o in node.outputs:
        storage_map[o] = [None]
        compute_map[o] = [False]
+    impl = None
    if (hasattr(node.op, 'python_constant_folding') and
            node.op.python_constant_folding(node)):
-        old_value = getattr(node.op, '_op_use_c_code', False)
-        try:
-            node.op._op_use_c_code = False
-            thunk = node.op.make_thunk(node,
-                                       storage_map,
-                                       compute_map,
-                                       [])
-        finally:
-            node.op._op_use_c_code = old_value
-    else:
-        thunk = node.op.make_thunk(node, storage_map, compute_map,
-                                   no_recycling=[])
+        impl = 'py'
+    thunk = node.op.make_thunk(node, storage_map, compute_map,
+                               no_recycling=[], impl=impl)

    required = thunk()
    assert not required  # a node whose inputs are all provided should always

--- a/theano/tensor/signal/pool.py
+++ b/theano/tensor/signal/pool.py
@@ -263,7 +263,7 @@ class Pool(OpenMPOp):
                " 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode)
        self.mode = mode

-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if len(node.inputs) == 1:
            # Old interface
            self.ndim = len(node.op.ds)
@@ -796,7 +796,7 @@ class PoolGrad(OpenMPOp):
        self.mode = mode
        super(PoolGrad, self).__init__(openmp=openmp)

-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if len(node.inputs) < 5:  # 5 for AveragePoolGrad, 6 for MaxPoolGrad
            # Old interface
            self.ndim = len(node.op.ds)