Merge pull request #5073 from nouiz/Faruk-Ahmed-use_cxx_flag

Removing _op_use_c_code attribute

Merge pull request #5073 from nouiz/Faruk-Ahmed-use_cxx_flag
65af9781 · abergeron · GitHub · 18dd2955 · 0cb3b854 · 65af9781
--- a/doc/extending/extending_theano.txt
+++ b/doc/extending/extending_theano.txt
@@ -99,7 +99,7 @@ possibilities you may encounter or need.  For that refer to
            pass

        # Other implementations (pycuda, ...):
-        def make_thunk(self, node, storage_map, _, _2):
+        def make_thunk(self, node, storage_map, _, _2, impl=None):
            pass

        # optional:
@@ -190,11 +190,12 @@ or :func:`make_thunk`.
      valid, but shouldn't be required anymore for this call.
      The returned function must ensure that it sets the computed
      variables as computed in the `compute_map`.
-
+    - ``impl`` allows selecting between multiple implementations.
+      It should have a default value of None.

  :func:`make_thunk` is useful if you want to generate code and compile
  it yourself. For example, this allows you to use PyCUDA to compile GPU
-  code.
+  code and keep state in the thunk.

  If :func:`make_thunk()` is defined by an op, it will be used by Theano
  to obtain the op's implementation.

--- a/doc/extending/op.txt
+++ b/doc/extending/op.txt
@@ -171,7 +171,7 @@ Optional methods or attributes
  returned, unless it is of length 1, where the single element will be
  returned by itself.

-.. function:: make_thunk(node, storage_map, compute_map, no_recycling)
+.. function:: make_thunk(node, storage_map, compute_map, no_recycling, impl=None)

   This function must return a thunk, that is a zero-arguments
   function that encapsulates the computation to be performed by this
@@ -192,6 +192,8 @@ Optional methods or attributes
     valid, but shouldn't be required anymore for this call.
   :param no_recycling: WRITEME
     WRITEME
+   :param impl: None, 'c' or 'py'
+     Which implementation to use.

   The returned function must ensure that is sets the computed
   variables as computed in the `compute_map`.

--- a/doc/tutorial/gpu_data_convert.txt
+++ b/doc/tutorial/gpu_data_convert.txt
@@ -92,7 +92,7 @@ You can use a GPU function compiled with PyCUDA in a Theano op:
               cuda.basic_ops.as_cuda_ndarray_variable(inp))
            assert inp.dtype == "float32"
            return theano.Apply(self, [inp], [inp.type()])
-        def make_thunk(self, node, storage_map, _, _2):
+        def make_thunk(self, node, storage_map, _, _2, impl=None):
            mod = SourceModule("""
        __global__ void my_fct(float * i0, float * o0, int size) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;

--- a/doc/tutorial/using_gpu.txt
+++ b/doc/tutorial/using_gpu.txt
@@ -586,7 +586,7 @@ Modify and execute to work for a matrix of shape (20, 10).
            assert inp.dtype == "float32"
            return theano.Apply(self, [inp], [inp.type()])

-        def make_thunk(self, node, storage_map, _, _2):
+        def make_thunk(self, node, storage_map, _, _2, impl=None):
            mod = SourceModule("""
        __global__ void my_fct(float * i0, float * o0, int size) {
        int i = blockIdx.x*blockDim.x + threadIdx.x;

--- a/theano/compile/builders.py
+++ b/theano/compile/builders.py
@@ -124,14 +124,11 @@ class OpFromGraph(gof.Op):
                         list(inputs) + self.shared_inputs,
                         [type() for type in self.output_types])

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
-        ret = super(OpFromGraph, self).make_thunk(node, storage_map,
-                                                  compute_map, no_recycling)
-        if not hasattr(self, "fn"):
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        if not hasattr(self, "fn") and impl == 'py':
            self.fn = orig_function(self.new_inputs,
                                    self.new_outputs,
                                    **self.kwargs)
-        return ret

    def perform(self, node, inputs, outputs):
        variables = self.fn(*inputs)

--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -1837,8 +1837,6 @@ class _Linker(gof.link.LocalLinker):
                thunk.inputs = [storage_map[v] for v in node.inputs]
                thunk.outputs = [storage_map[v] for v in node.outputs]
                thunk_other = thunk
-            else:
-                node.op.prepare_node(node, storage_map, compute_map)

            debug = hasattr(node.op, 'debug_perform')

@@ -1852,6 +1850,7 @@ class _Linker(gof.link.LocalLinker):
                if not isinstance(node.op, gof.op.Op):
                    raise utils.MethodNotDefined()

+                node.op.prepare_node(node, storage_map, compute_map, 'c')
                thunk = node.op.make_c_thunk(node, storage_map, compute_map,
                                             no_recycling)
                thunks_c.append(thunk)
@@ -1864,6 +1863,7 @@ class _Linker(gof.link.LocalLinker):
            if (((self.maker.mode.check_py_code or thunks_c[-1] is None) and
                 node.op.perform.__code__ != gof.op.PureOp.perform.__code__) or
                    debug):
+                node.op.prepare_node(node, storage_map, compute_map, 'py')
                thunk = node.op.make_py_thunk(node, storage_map, compute_map,
                                              no_recycling, debug=debug)
                thunks_py.append(thunk)
@@ -1873,6 +1873,7 @@ class _Linker(gof.link.LocalLinker):
            if not self.maker.mode.check_c_code and thunks_py[-1] is None:
                _logger.warn("Op %s doesn't have a perform, "
                             "forcing check of the C code" % node.op)
+                node.op.prepare_node(node, storage_map, compute_map, 'c')
                thunk = node.op.make_c_thunk(node, storage_map, compute_map,
                                             no_recycling)
                thunks_c[-1] = thunk

--- a/theano/d3viz/formatting.py
+++ b/theano/d3viz/formatting.py
@@ -233,6 +233,7 @@ class PyDotFormatter(object):
                gf = PyDotFormatter()
                # Use different node prefix for sub-graphs
                gf.__node_prefix = __node_id
+                node.op.prepare_node(node, None, None, 'py')
                gf(node.op.fn, subgraph)
                graph.add_subgraph(subgraph)
                pd_node.get_attributes()['subg'] = subgraph.get_name()

--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -1584,7 +1584,7 @@ class CLinker(link.Linker):
        else:
            # Set compute_map as None as clinker do not support lazy evaluation
            for node in self.node_order:
-                node.op.prepare_node(node, storage_map, None)
+                node.op.prepare_node(node, storage_map, None, 'c')
            module = get_module_cache().module_from_key(
                key=key, lnk=self, keep_lock=keep_lock)

@@ -1787,24 +1787,14 @@ class OpWiseCLinker(link.LocalLinker):

            thunks = []
            for node in order:
-                # Maker sure we use the C version of the code whenever
-                # possible
-                # There are ops that don't have _op_use_c_code property
-                # for example ifelse (or any ops that come with their own
-                # make_thunk
-                old_value = getattr(node.op, '_op_use_c_code', False)
-                try:
-                    if theano.config.cxx:
-                        node.op._op_use_c_code = True
-                    thunks += [node.op.make_thunk(node,
-                                                  storage_map,
-                                                  compute_map,
-                                                  no_recycling)]
-                    thunks[-1].inputs = [storage_map[v] for v in node.inputs]
-                    thunks[-1].outputs = [storage_map[v] for v in node.outputs]
-
-                finally:
-                    node.op._op_use_c_code = old_value
+                # By default, make_thunk tries the C code first and
+                # falls back to Python if that is not available.
+                thunks += [node.op.make_thunk(node,
+                                              storage_map,
+                                              compute_map,
+                                              no_recycling)]
+                thunks[-1].inputs = [storage_map[v] for v in node.inputs]
+                thunks[-1].outputs = [storage_map[v] for v in node.outputs]

            for node in order:
                if self.allow_gc:

--- a/theano/gof/link.py
+++ b/theano/gof/link.py
@@ -823,17 +823,13 @@ class PerformLinker(LocalLinker):
            # the python version
            # Note : ops that implement their own make thunk don't usually
            # have this attribute defiend !!
-            old_value = getattr(node.op, '_op_use_c_code', False)
-            try:
-                node.op._op_use_c_code = False
-                thunks += [node.op.make_thunk(node,
-                                              storage_map,
-                                              compute_map,
-                                              no_recycling)]
-                thunks[-1].inputs = [storage_map[v] for v in node.inputs]
-                thunks[-1].outputs = [storage_map[v] for v in node.outputs]
-            finally:
-                node.op._op_use_c_code = old_value
+            thunks += [node.op.make_thunk(node,
+                                          storage_map,
+                                          compute_map,
+                                          no_recycling,
+                                          'py')]
+            thunks[-1].inputs = [storage_map[v] for v in node.inputs]
+            thunks[-1].outputs = [storage_map[v] for v in node.outputs]

        computed, last_user = gc_helper(order)
        if self.allow_gc:

--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -32,6 +32,8 @@ __contact__ = "theano-dev <theano-dev@googlegroups.com>"

 __docformat__ = "restructuredtext en"

+_logger = logging.getLogger('theano.gof.op.Op')
+

 class CLinkerObject(object):
    """
@@ -779,34 +781,24 @@ class Op(utils.object2, PureOp, CLinkerOp):
    Convenience class to bundle `PureOp` and `CLinkerOp`.

    """
-    def __new__(cls, *args, **kwargs):
-        # this function exists to silently and transparently ensure that all
-        # existing Ops get a _op_use_c_code attribute
-        obj = object.__new__(cls)
-        if not hasattr(obj, '_op_use_c_code'):
-            obj._op_use_c_code = theano.config.cxx
-        return obj
-
-    def __init__(self, use_c_code=theano.config.cxx):
-        self._op_use_c_code = use_c_code
-
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        """
        Make any special modifications that the Op needs before doing
        make_thunk().

        This can modify the node inplace and should return nothing.

+        It can be called multiple times with different impl values. It is
+        the op's responsibility not to re-prepare the node when doing so
+        is not appropriate.
+
        """
        pass

    def make_c_thunk(self, node, storage_map, compute_map, no_recycling):
-        """
-        Like make_thunk, but will only try to make a C thunk.
+        """Like make_thunk, but will only try to make a C thunk.

        """
-        logger = logging.getLogger('theano.gof.op.Op')
-
        node_input_storage = [storage_map[r] for r in node.inputs]
        node_output_storage = [storage_map[r] for r in node.outputs]

@@ -828,7 +820,7 @@ class Op(utils.object2, PureOp, CLinkerOp):
        cl = theano.gof.cc.CLinker().accept(e,
                                            no_recycling=e_no_recycling)

-        logger.debug('Trying CLinker.make_thunk')
+        _logger.debug('Trying CLinker.make_thunk')
        outputs = cl.make_thunk(input_storage=node_input_storage,
                                output_storage=node_output_storage)
        fill_storage, node_input_filters, node_output_filters = outputs
@@ -883,7 +875,8 @@ class Op(utils.object2, PureOp, CLinkerOp):
        rval.lazy = False
        return rval

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling,
+                   impl=None):
        """
        This function must return a thunk, that is a zero-arguments
        function that encapsulates the computation to be performed
@@ -904,6 +897,9 @@ class Op(utils.object2, PureOp, CLinkerOp):
        no_recycling
            List of variables for which it is forbidden to reuse memory
            allocated by a previous call.
+        impl
+            Currently, None, 'c' or 'py'. If 'c' or 'py' we will only try
+            that version of the code.

        Notes
        -----
@@ -913,27 +909,26 @@ class Op(utils.object2, PureOp, CLinkerOp):
        the thunk can potentially cache return values (like CLinker does),
        then it must not do so for variables in the no_recycling list.

+        self.prepare_node(node, ...) is always called. If we try 'c' and it
+        fails and we then try 'py', prepare_node will be called twice.
        """
-        logger = logging.getLogger('theano.gof.op.Op')

-        self.prepare_node(node, storage_map=storage_map,
-                          compute_map=compute_map)
-
-        if not hasattr(self, '_op_use_c_code'):
-            warnings.warn(
-                "The  __getstate__ method of '%s' is not implemented correctly."
-                " It should keep the attributes added by the base class."
-                " To implement it correctly, it should keep all attributes"
-                " and only remove those it does not want." % (self),
-                stacklevel=2)
-        if getattr(self, '_op_use_c_code', theano.config.cxx):
+        if impl is None or impl == 'c':
+            self.prepare_node(node, storage_map=storage_map,
+                              compute_map=compute_map, impl='c')
            try:
                return self.make_c_thunk(node, storage_map, compute_map,
                                         no_recycling)
            except (NotImplementedError, utils.MethodNotDefined):
-                logger.debug('Falling back on perform')
+                # We requested the c code, so don't catch the error.
+                if impl == 'c':
+                    raise
+                _logger.debug('Falling back on perform')

-        # condition: either there was no c_code, or it failed
+        # condition: either there was no c_code, or it failed or
+        # python code was requested.
+        self.prepare_node(node, storage_map=storage_map,
+                          compute_map=compute_map, impl='py')
        return self.make_py_thunk(node, storage_map, compute_map, no_recycling)

    def make_node(self, *inputs):
@@ -1196,9 +1191,9 @@ int main( int argc, const char* argv[] )
                self.openmp = False
                theano.config.openmp = False

-    def prepare_node(self, node, storage_map,
-                     compute_map):
-        self.update_self_openmp()
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        if impl == 'c':
+            self.update_self_openmp()


 def simple_meth(tag):

--- a/theano/gof/tests/test_lazy.py
+++ b/theano/gof/tests/test_lazy.py
@@ -25,7 +25,7 @@ class IfElseIfElseIf(PureOp):
        assert t3.type == f3.type
        return Apply(self, [c1, t1, c2, t2, c3, t3, f3], [t1.type()])

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl):

        input_computed = [compute_map[v] for v in node.inputs]
        output_computed = [compute_map[v] for v in node.outputs]
@@ -93,7 +93,7 @@ class NotImplementedOp(PureOp):
    def make_node(self, x):
        return Apply(self, [x], [x.type()])

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl):
        def thunk():
            raise self.E()
        thunk.lazy = False

--- a/theano/gof/vm.py
+++ b/theano/gof/vm.py
@@ -1043,12 +1043,14 @@ class VM_Linker(link.LocalLinker):
        t0 = time.time()
        for node in order:
            try:
+                impl = None
                if self.c_thunks is False:
-                    node.op._op_use_c_code = False
+                    impl = 'py'
                thunks.append(node.op.make_thunk(node,
                                                 storage_map,
                                                 compute_map,
-                                                 no_recycling))
+                                                 no_recycling,
+                                                 impl=impl))
                if not hasattr(thunks[-1], 'lazy'):
                    # We don't want all ops maker to think about lazy Ops.
                    # So if they didn't specify that its lazy or not, it isn't.

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -2620,11 +2620,9 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
    def get_params(self, node):
        return node.outputs[0].type.context

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        # cache the kernel object
        self.get_kernel_cache(node)
-        return super(GpuCAReduceCPY, self).make_thunk(
-            node, storage_map, compute_map, no_recycling)

    def get_kernel_cache(self, node):
        attr = '@cache_reduction_k'

--- a/theano/gpuarray/fft.py
+++ b/theano/gpuarray/fft.py
@@ -73,7 +73,7 @@ class CuRFFTOp(Op):

        return theano.Apply(self, [inp, s], [self.output_type(inp)()])

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):

        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
@@ -198,7 +198,7 @@ class CuIRFFTOp(Op):

        return theano.Apply(self, [inp, s], [self.output_type(inp)()])

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):

        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

--- a/theano/ifelse.py
+++ b/theano/ifelse.py
@@ -20,7 +20,7 @@ import numpy
 import theano.tensor
 from theano.tensor import TensorType
 from theano import gof
-from theano.gof import PureOp, Apply
+from theano.gof import Op, Apply

 from six import iteritems
 from six.moves import xrange
@@ -41,7 +41,7 @@ __contact__ = "Razvan Pascanu <r.pascanu@gmail>"
 _logger = logging.getLogger('theano.ifelse')


-class IfElse(PureOp):
+class IfElse(Op):
    """
    Op that provides conditional graph evaluation if used with the CVM/VM
    linkers. Note that there exist a helpful function `ifelse` that should
@@ -235,7 +235,7 @@ class IfElse(PureOp):
                if_true_op(*if_true, **dict(return_list=True)) +
                if_false_op(*if_false, **dict(return_list=True)))

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
        cond = node.inputs[0]
        ts = node.inputs[1:][:self.n_outs]
        fs = node.inputs[1:][self.n_outs:]

--- a/theano/misc/pycuda_example.py
+++ b/theano/misc/pycuda_example.py
@@ -320,7 +320,7 @@ class PycudaElemwiseSourceModuleMakeThunkOp(Op):
        out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
        return out_node

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # TODO support broadcast!
        # TODO assert all input have the same shape
        fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -246,18 +246,14 @@ class GpuOp(theano.gof.Op):

    """

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if use.device_number is None:
            use("gpu",
                force=True,
                default_to_move_computation_to_gpu=False,
                move_shared_float32_to_gpu=False,
                enable_cuda=False)
-        return super(GpuOp, self).make_thunk(node, storage_map,
-                                             compute_map, no_recycling)

-theano.compile.debugmode.default_make_thunk.append(
-    get_unbound_function(GpuOp.make_thunk))

 # We must do those import to be able to create the full doc when
 # nvcc is not available

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -541,10 +541,8 @@ class GpuGemm(GpuOp):
    def __setstate__(self, dct):
        self.__dict__.update(dct)

-        # Correctly reload older pickles where _op_use_c_code and
-        # destroy_map were not saved
-        if '_op_use_c_code' not in self.__dict__:
-            self._op_use_c_code = theano.config.cxx
+        # Correctly reload older pickles where destroy_map was not
+        # saved
        if 'destroy_map' not in self.__dict__ and self.inplace:
            self.destroy_map = {0: [0]}

@@ -661,10 +659,8 @@ class GpuGemv(GpuOp):
    def __setstate__(self, dct):
        self.__dict__.update(dct)

-        # Correctly reload older pickles where _op_use_c_code and
-        # destroy_map were not saved
-        if '_op_use_c_code' not in self.__dict__:
-            self._op_use_c_code = theano.config.cxx
+        # Correctly reload older pickles where destroy_map was not
+        # saved
        if 'destroy_map' not in self.__dict__ and self.inplace:
            self.destroy_map = {0: [0]}

@@ -761,10 +757,8 @@ class GpuGer(GpuOp):
    def __setstate__(self, dct):
        self.__dict__.update(dct)

-        # Correctly reload older pickles where _op_use_c_code and
-        # destroy_map were not saved
-        if '_op_use_c_code' not in self.__dict__:
-            self._op_use_c_code = theano.config.cxx
+        # Correctly reload older pickles where destroy_map was not
+        # saved
        if 'destroy_map' not in self.__dict__ and self.inplace:
            self.destroy_map = {0: [0]}

@@ -2187,7 +2181,9 @@ class GpuConv(GpuOp):
                     images[2] * images[3] * 2)
        return flops

-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        super(GpuConv, self).prepare_node(node, storage_map, compute_map, impl)
+
        if node.op.max_threads_dim0 is None:
            cuda = theano.sandbox.cuda
            device_id = cuda.use.device_number
@@ -2240,8 +2236,8 @@ class GpuConv(GpuOp):
            bmode = 0
        if max_threads_dim0 is None:
            raise NotImplementedError("GpuConv.c_code should not be called "
-                                      "directly. It should be called by "
-                                      "make_thunk() that add some information "
+                                      "directly. It should be called after "
+                                      "prepare_node() that add some information "
                                      "related to the selected GPU.")
        sub.update(locals())
        return """

--- a/theano/sandbox/cuda/cula.py
+++ b/theano/sandbox/cuda/cula.py
@@ -51,10 +51,7 @@ class GpuSolve(GpuOp):
        assert inp2.ndim == 2
        return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])

-    def make_thunk(self,
-                   node,
-                   storage_map, _,
-                   no_recycling=[]):
+    def make_thunk(self, node, storage_map, _, no_recycling, impl=None):

        # Initialize CULA the first time it is needed
        global cula_initialized

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -1567,7 +1567,10 @@ class GpuDnnPool(DnnBase):
        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
        self.mode = mode

-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        super(GpuDnnPool, self).prepare_node(
+            node, storage_map, compute_map, impl)
+
        if len(node.inputs) == 2:
            warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3)
            # Old interface
@@ -1803,7 +1806,7 @@ class GpuDnnPoolGrad(DnnBase):
        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
        self.mode = mode

-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if len(node.inputs) == 4:
            warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3)
            # Old interface

--- a/theano/sandbox/cuda/extra_ops.py
+++ b/theano/sandbox/cuda/extra_ops.py
@@ -49,7 +49,7 @@ class GpuCumsum(CumsumOp, GpuOp):

        return theano.Apply(self, [x], [x.type()])

-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
        node_ = copy.copy(node)
        assert node.op is node_.op
        if node_.op.max_threads_dim0 is None or node_.op.max_grid_size1 is None or node_.op.max_grid_size2 is None:
@@ -70,7 +70,7 @@ class GpuCumsum(CumsumOp, GpuOp):
            node_.op.max_grid_size2 = prop['maxGridSize2']

        return super(GpuCumsum, node_.op).make_thunk(node_, storage_map,
-                                                     compute_map, no_recycling)
+                                                     compute_map, no_recycling, impl)

    def __str__(self):
        return "%s{%s}" % (self.__class__.__name__, self.axis)

--- a/theano/sandbox/cuda/fftconv.py
+++ b/theano/sandbox/cuda/fftconv.py
@@ -48,7 +48,7 @@ class ScikitsCudaOp(GpuOp):

        return theano.Apply(self, [inp], [self.output_type(inp)()])

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        if not scikits_cuda_available:
            raise RuntimeError(
                "scikits.cuda is needed for all GPU fft implementation,"
@@ -61,7 +61,7 @@ class CuFFTOp(ScikitsCudaOp):
        return CudaNdarrayType(
            broadcastable=[False] * (inp.type.ndim + 1))

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        super(CuFFTOp, self).make_thunk(node, storage_map, _, _2)

        from theano.misc.pycuda_utils import to_gpuarray
@@ -118,7 +118,7 @@ class CuIFFTOp(ScikitsCudaOp):
        return CudaNdarrayType(
            broadcastable=[False] * (inp.type.ndim - 1))

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        super(CuIFFTOp, self).make_thunk(node, storage_map, _, _2)

        from theano.misc.pycuda_utils import to_gpuarray
@@ -314,7 +314,7 @@ class BatchedComplexDotOp(ScikitsCudaOp):
    def output_type(self, inp):
        return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        super(BatchedComplexDotOp, self).make_thunk(node, storage_map, _, _2)

        inputs = [storage_map[v] for v in node.inputs]

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -3064,7 +3064,7 @@ arctan = ArcTan(upgrade_to_float, name='arctan')


 class ArcTan2(BinaryScalarOp):
-    nfunc_spec = ('arctan2', 1, 1)
+    nfunc_spec = ('arctan2', 2, 1)

    def impl(self, y, x):
        # If x and y are int8 or uint8, numpy.arctan2 will compute the result
@@ -3663,11 +3663,15 @@ class Composite(ScalarOp):
        # Postpone the creation in case it isn't needed.
        #  self.init_name()      # self.name
        self.name = None
-
-    def prepare_node(self, node, storage_map, compute_map):
-        self.init_py_impls()  # self._impls
-        for n in theano.gof.graph.list_of_nodes(self.inputs, self.outputs):
-            n.op.prepare_node(n, None, None)
+        self.prepare_node_called = set()
+
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        if impl == 'py':
+            self.init_py_impls()  # self._impls
+        if impl not in self.prepare_node_called:
+            for n in theano.gof.graph.list_of_nodes(self.inputs, self.outputs):
+                n.op.prepare_node(n, None, None, impl)
+            self.prepare_node_called.add(impl)

    def output_types(self, input_types):
        if tuple(input_types) != self.inputs_type:

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
--- a/theano/sparse/basic.py
+++ b/theano/sparse/basic.py
@@ -1015,7 +1015,7 @@ class GetItemList(gof.op.Op):
    def grad(self, inputs, g_outputs):
        x, indices = inputs
        gout, = g_outputs
-        return [GetItemListGrad(self)(x, indices, gout),
+        return [get_item_list_grad(x, indices, gout),
                grad_undefined(self, 1, indices, "No gradient for this input")]

 get_item_list = GetItemList()
@@ -1110,7 +1110,7 @@ class GetItem2Lists(gof.op.Op):
    def grad(self, inputs, g_outputs):
        x, ind1, ind2 = inputs
        gout, = g_outputs
-        return [GetItem2ListsGrad(self)(x, ind1, ind2, gout),
+        return [get_item_2lists_grad(x, ind1, ind2, gout),
                grad_undefined(self, 1, ind1, "No gradient for this input"),
                grad_undefined(self, 1, ind2, "No gradient for this input")]


--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -297,9 +297,6 @@ class Ger(Op):
    This interface to GER allows non-destructive operation on A via the
    `destructive` argument to the constructor.

-    :TODO: Create better classes ScipyGer and CGer that inherit from this class
-    and override the make_thunk() method to use Scipy and C respectively.
-
    """

    __props__ = ("destructive",)
@@ -837,10 +834,8 @@ class Gemm(GemmRelated):
        else:
            self.setup_z_Nz_Sz = self.setup_z_Nz_Sz_outplace

-        # Correctly reload older pickles where _op_use_c_code and
-        # destroy_map were not saved
-        if '_op_use_c_code' not in self.__dict__:
-            self._op_use_c_code = theano.config.cxx
+        # Correctly reload older pickles where destroy_map was not
+        # saved
        if 'destroy_map' not in self.__dict__ and self.inplace:
            self.destroy_map = {0: [0]}


--- a/theano/tensor/blas_scipy.py
+++ b/theano/tensor/blas_scipy.py
@@ -22,46 +22,34 @@ if have_fblas:

 class ScipyGer(Ger):

-    # keep everything else, but override the make_thunk
-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
-
-        node_input_storage = [storage_map[r] for r in node.inputs]
-        node_output_storage = [storage_map[r] for r in node.outputs]
-        node_output_compute = [compute_map[r] for r in node.outputs]
-
-        # get vars for containers
-        cA, calpha, cx, cy = node_input_storage
-        cZ, = node_output_storage
-        local_ger = _blas_ger_fns[numpy.dtype(node.inputs[0].type.dtype)]
-
-        def rval():
-            # N.B. some versions of scipy (e.g. mine) don't actually work
-            # in-place on a, even when I tell it to.
-            A = cA[0]
-            if A.size == 0:
-                # We don't have to compute anything, A is empty.
-                # We need this special case because Numpy considers it
-                # C-contiguous, wich is confusing.
-                if not self.destructive:
-                    # Sometimes numpy thinks empty matrices can share memory,
-                    # so here to stop DebugMode from complaining.
-                    A = A.copy()
-            elif A.flags['C_CONTIGUOUS']:
-                A = local_ger(calpha[0], cy[0], cx[0], a=A.T,
-                              overwrite_a=int(self.destructive)).T
-            else:
-                A = local_ger(calpha[0], cx[0], cy[0], a=A,
-                              overwrite_a=int(self.destructive))
-            cZ[0] = A
-            for o in node_output_compute:
-                o[0] = True
-
-        # TODO: If this is currently an unofficial part of the thunk API,
-        #      then maybe it should be documented and made official?
-        rval.inputs = node_input_storage
-        rval.outputs = node_output_storage
-        rval.lazy = False
-        return rval
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        if impl == 'py':
+            node.tag.local_ger = _blas_ger_fns[numpy.dtype(
+                node.inputs[0].type.dtype)]
+
+    def perform(self, node, inputs, output_storage):
+        cA, calpha, cx, cy = inputs
+        cZ, = output_storage
+        # N.B. some versions of scipy (e.g. mine) don't actually work
+        # in-place on a, even when I tell it to.
+        A = cA
+        local_ger = node.tag.local_ger
+        if A.size == 0:
+            # We don't have to compute anything, A is empty.
+            # We need this special case because Numpy considers it
+            # C-contiguous, which is confusing.
+            if not self.destructive:
+                # Sometimes numpy thinks empty matrices can share memory,
+                # so here to stop DebugMode from complaining.
+                A = A.copy()
+        elif A.flags['C_CONTIGUOUS']:
+            A = local_ger(calpha, cy, cx, a=A.T,
+                          overwrite_a=int(self.destructive)).T
+        else:
+            A = local_ger(calpha, cx, cy, a=A,
+                          overwrite_a=int(self.destructive))
+        cZ[0] = A
+

 scipy_ger_no_inplace = ScipyGer(False)
 scipy_ger_inplace = ScipyGer(True)

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -787,14 +787,15 @@ second dimension

        return ret

-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        # Postpone the ufunc building to the last minutes
        # NumPy ufunc support only up to 31 inputs.
        # But our c code support more.
        if (len(node.inputs) < 32 and
                (self.nfunc is None or
                 self.scalar_op.nin != len(node.inputs)) and
-                self.ufunc is None):
+                self.ufunc is None and
+                impl == 'py'):

            ufunc = numpy.frompyfunc(self.scalar_op.impl,
                                     len(node.inputs),
@@ -830,7 +831,7 @@ second dimension
            [get_scalar_type(dtype=output.type.dtype).make_variable()
             for output in node.outputs])

-        self.scalar_op.prepare_node(node.tag.fake_node, None, None)
+        self.scalar_op.prepare_node(node.tag.fake_node, None, None, impl)

    def perform(self, node, inputs, output_storage):
        if len(node.inputs) >= 32:
@@ -890,14 +891,18 @@ second dimension
            # numpy the first (faster) version leads to segfaults
            if self.ufunc:
                ufunc = self.ufunc
+            elif not hasattr(node.tag, 'ufunc'):
+                # It happens that make_thunk isn't called, like in
+                # get_scalar_constant_value
+                self.prepare_node(node, None, None, 'py')
+                # prepare_node will add ufunc to self or the tag,
+                # depending on whether we can reuse it, so we need
+                # to test both again.
+                if self.ufunc:
+                    ufunc = self.ufunc
+                else:
+                    ufunc = node.tag.ufunc
            else:
-                if not hasattr(node.tag, 'ufunc'):
-                    # It happen that make_thunk isn't called, like in
-                    # get_scalar_constant_value
-                    node.tag.ufunc = numpy.frompyfunc(self.scalar_op.impl,
-                                                      len(node.inputs),
-                                                      self.scalar_op.nout)
-
                ufunc = node.tag.ufunc

            nout = ufunc.nout
@@ -977,7 +982,7 @@ second dimension
        # To not request all of them to call prepare_node(), do it here.
        # There is no harm if it get called multile time.
        if not hasattr(node.tag, 'fake_node'):
-            self.prepare_node(node, None, None)
+            self.prepare_node(node, None, None, 'c')
        _inames = inames
        _onames = onames


--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -6299,20 +6299,12 @@ def constant_folding(node):
    for o in node.outputs:
        storage_map[o] = [None]
        compute_map[o] = [False]
+    impl = None
    if (hasattr(node.op, 'python_constant_folding') and
            node.op.python_constant_folding(node)):
-        old_value = getattr(node.op, '_op_use_c_code', False)
-        try:
-            node.op._op_use_c_code = False
-            thunk = node.op.make_thunk(node,
-                                       storage_map,
-                                       compute_map,
-                                       [])
-        finally:
-            node.op._op_use_c_code = old_value
-    else:
-        thunk = node.op.make_thunk(node, storage_map, compute_map,
-                                   no_recycling=[])
+        impl = 'py'
+    thunk = node.op.make_thunk(node, storage_map, compute_map,
+                               no_recycling=[], impl=impl)

    required = thunk()
    assert not required  # a node whose inputs are all provided should always

--- a/theano/tensor/signal/pool.py
+++ b/theano/tensor/signal/pool.py
@@ -263,7 +263,7 @@ class Pool(OpenMPOp):
                " 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode)
        self.mode = mode

-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if len(node.inputs) == 1:
            # Old interface
            self.ndim = len(node.op.ds)
@@ -796,7 +796,7 @@ class PoolGrad(OpenMPOp):
        self.mode = mode
        super(PoolGrad, self).__init__(openmp=openmp)

-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if len(node.inputs) < 5:  # 5 for AveragePoolGrad, 6 for MaxPoolGrad
            # Old interface
            self.ndim = len(node.op.ds)