Change the prepare_node logic to make it safe to call make_py_thunk and make_c_thunk directly.

23e43b1b · Frederic Bastien · 179e4085 · 23e43b1b · 23e43b1b · 23e43b1b
--- a/doc/extending/extending_theano.txt
+++ b/doc/extending/extending_theano.txt
@@ -99,7 +99,7 @@ possibilities you may encounter or need.  For that refer to
            pass
        # Other implementations (pycuda, ...):
-        def make_thunk(self, node, storage_map, _, _2):
+        def make_thunk(self, node, storage_map, _, _2, impl=None):
            pass
        # optional:
@@ -190,11 +190,12 @@ or :func:`make_thunk`.
      valid, but shouldn't be required anymore for this call.
      The returned function must ensure that it sets the computed
      variables as computed in the `compute_map`.
+    - ``impl`` allows selecting between multiple implementations.
+      It should have a default value of None.
  :func:`make_thunk` is useful if you want to generate code and compile
  it yourself. For example, this allows you to use PyCUDA to compile GPU
-  code.
+  code and keep state in the thunk.
  If :func:`make_thunk()` is defined by an op, it will be used by Theano
  to obtain the op's implementation.

--- a/doc/extending/op.txt
+++ b/doc/extending/op.txt
@@ -171,7 +171,7 @@ Optional methods or attributes
  returned, unless it is of length 1, where the single element will be
  returned by itself.
-.. function:: make_thunk(node, storage_map, compute_map, no_recycling)
+.. function:: make_thunk(node, storage_map, compute_map, no_recycling, impl=None)
   This function must return a thunk, that is a zero-arguments
   function that encapsulates the computation to be performed by this
@@ -192,6 +192,8 @@ Optional methods or attributes
     valid, but shouldn't be required anymore for this call.
   :param no_recycling: WRITEME
     WRITEME
+   :param impl: None, 'c' or 'py'
+     Which implementation to use.
   The returned function must ensure that is sets the computed
   variables as computed in the `compute_map`.

--- a/doc/tutorial/gpu_data_convert.txt
+++ b/doc/tutorial/gpu_data_convert.txt
@@ -92,7 +92,7 @@ You can use a GPU function compiled with PyCUDA in a Theano op:
               cuda.basic_ops.as_cuda_ndarray_variable(inp))
            assert inp.dtype == "float32"
            return theano.Apply(self, [inp], [inp.type()])
-        def make_thunk(self, node, storage_map, _, _2):
+        def make_thunk(self, node, storage_map, _, _2, impl=None):
            mod = SourceModule("""
        __global__ void my_fct(float * i0, float * o0, int size) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;

--- a/doc/tutorial/using_gpu.txt
+++ b/doc/tutorial/using_gpu.txt
@@ -586,7 +586,7 @@ Modify and execute to work for a matrix of shape (20, 10).
            assert inp.dtype == "float32"
            return theano.Apply(self, [inp], [inp.type()])
-        def make_thunk(self, node, storage_map, _, _2):
+        def make_thunk(self, node, storage_map, _, _2, impl):
            mod = SourceModule("""
        __global__ void my_fct(float * i0, float * o0, int size) {
        int i = blockIdx.x*blockDim.x + threadIdx.x;

--- a/theano/compile/builders.py
+++ b/theano/compile/builders.py
@@ -124,8 +124,8 @@ class OpFromGraph(gof.Op):
                         list(inputs) + self.shared_inputs,
                         [type() for type in self.output_types])
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
-        if not hasattr(node.tag, "fn"):
+        if not hasattr(node.tag, "fn") and impl == 'py':
            node.tag.fn = orig_function(self.new_inputs,
                                        self.new_outputs,
                                        **self.kwargs)

--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -1837,8 +1837,6 @@ class _Linker(gof.link.LocalLinker):
                thunk.inputs = [storage_map[v] for v in node.inputs]
                thunk.outputs = [storage_map[v] for v in node.outputs]
                thunk_other = thunk
-            else:
-                node.op.prepare_node(node, storage_map, compute_map)
            debug = hasattr(node.op, 'debug_perform')
@@ -1852,6 +1850,7 @@ class _Linker(gof.link.LocalLinker):
                if not isinstance(node.op, gof.op.Op):
                    raise utils.MethodNotDefined()
+                node.op.prepare_node(node, storage_map, compute_map, 'c')
                thunk = node.op.make_c_thunk(node, storage_map, compute_map,
                                             no_recycling)
                thunks_c.append(thunk)
@@ -1864,6 +1863,7 @@ class _Linker(gof.link.LocalLinker):
            if (((self.maker.mode.check_py_code or thunks_c[-1] is None) and
                 node.op.perform.__code__ != gof.op.PureOp.perform.__code__) or
                    debug):
+                node.op.prepare_node(node, storage_map, compute_map, 'py')
                thunk = node.op.make_py_thunk(node, storage_map, compute_map,
                                              no_recycling, debug=debug)
                thunks_py.append(thunk)
@@ -1873,6 +1873,7 @@ class _Linker(gof.link.LocalLinker):
            if not self.maker.mode.check_c_code and thunks_py[-1] is None:
                _logger.warn("Op %s doesn't have a perform, "
                             "forcing check of the C code" % node.op)
+                node.op.prepare_node(node, storage_map, compute_map, 'c')
                thunk = node.op.make_c_thunk(node, storage_map, compute_map,
                                             no_recycling)
                thunks_c[-1] = thunk

--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -1584,7 +1584,7 @@ class CLinker(link.Linker):
        else:
            # Set compute_map as None as clinker do not support lazy evaluation
            for node in self.node_order:
-                node.op.prepare_node(node, storage_map, None)
+                node.op.prepare_node(node, storage_map, None, 'c')
            module = get_module_cache().module_from_key(
                key=key, lnk=self, keep_lock=keep_lock)
@@ -1787,21 +1787,12 @@ class OpWiseCLinker(link.LocalLinker):
            thunks = []
            for node in order:
-                # Maker sure we use the C version of the code whenever
+                # By default, make_thunk tries the C code first; otherwise
-                # possible
+                # it falls back to Python.
-                # There are ops that don't have _op_use_c_code property
+                thunks += [node.op.make_thunk(node,
-                # for example ifelse (or any ops that come with their own
+                                              storage_map,
-                # make_thunk
+                                              compute_map,
-                if theano.config.cxx:
+                                              no_recycling)]
-                    thunks += [node.op.make_c_thunk(node,
-                                                    storage_map,
-                                                    compute_map,
-                                                    no_recycling)]
-                else:
-                    thunks += [node.op.make_thunk(node,
-                                                  storage_map,
-                                                  compute_map,
-                                                  no_recycling)]
                thunks[-1].inputs = [storage_map[v] for v in node.inputs]
                thunks[-1].outputs = [storage_map[v] for v in node.outputs]

--- a/theano/gof/link.py
+++ b/theano/gof/link.py
@@ -823,10 +823,11 @@ class PerformLinker(LocalLinker):
            # the python version
            # Note : ops that implement their own make thunk don't usually
            # have this attribute defiend !!
-            thunks += [node.op.make_py_thunk(node,
+            thunks += [node.op.make_thunk(node,
-                                             storage_map,
+                                          storage_map,
-                                             compute_map,
+                                          compute_map,
-                                             no_recycling)]
+                                          no_recycling,
+                                          'py')]
            thunks[-1].inputs = [storage_map[v] for v in node.inputs]
            thunks[-1].outputs = [storage_map[v] for v in node.outputs]

--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -792,19 +792,22 @@ class Op(utils.object2, PureOp, CLinkerOp):
    def __init__(self, use_c_code=theano.config.cxx):
        self._op_use_c_code = use_c_code
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        """
        Make any special modifications that the Op needs before doing
        make_thunk().
        This can modify the node inplace and should return nothing.
+        It can be called multiple times with different values of impl.
+        It is the op's responsibility not to re-prepare the node when
+        doing so would be inappropriate.
        """
        pass
    def make_c_thunk(self, node, storage_map, compute_map, no_recycling):
-        """
+        """Like make_thunk, but will only try to make a C thunk.
-        Like make_thunk, but will only try to make a C thunk.
        """
        node_input_storage = [storage_map[r] for r in node.inputs]
@@ -883,7 +886,8 @@ class Op(utils.object2, PureOp, CLinkerOp):
        rval.lazy = False
        return rval
-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling,
+                   impl=None):
        """
        This function must return a thunk, that is a zero-arguments
        function that encapsulates the computation to be performed
@@ -904,6 +908,9 @@ class Op(utils.object2, PureOp, CLinkerOp):
        no_recycling
            List of variables for which it is forbidden to reuse memory
            allocated by a previous call.
+        impl
+            Currently, None, 'c' or 'py'. If 'c' or 'py' we will only try
+            that version of the code.
        Notes
        -----
@@ -913,26 +920,26 @@ class Op(utils.object2, PureOp, CLinkerOp):
        the thunk can potentially cache return values (like CLinker does),
        then it must not do so for variables in the no_recycling list.
+        self.prepare_node(node, ...) is always called. If we try 'c' and it
+        fails and we then try 'py', prepare_node will be called twice.
        """
-        self.prepare_node(node, storage_map=storage_map,
+        if impl is None or impl == 'c':
-                          compute_map=compute_map)
+            self.prepare_node(node, storage_map=storage_map,
+                              compute_map=compute_map, impl='c')
-        if not hasattr(self, '_op_use_c_code'):
-            warnings.warn(
-                "The  __getstate__ method of '%s' is not implemented correctly."
-                " It should keep the attributes added by the base class."
-                " To implement it correctly, it should keep all attributes"
-                " and only remove those it does not want." % (self),
-                stacklevel=2)
-        if getattr(self, '_op_use_c_code', theano.config.cxx):
            try:
                return self.make_c_thunk(node, storage_map, compute_map,
                                         no_recycling)
            except (NotImplementedError, utils.MethodNotDefined):
+                # We requested the c code, so don't catch the error.
+                if impl == 'c':
+                    raise
                _logger.debug('Falling back on perform')
-        # condition: either there was no c_code, or it failed
+        # condition: either there was no c_code, or it failed or
+        # python code was requested.
+        self.prepare_node(node, storage_map=storage_map,
+                          compute_map=compute_map, impl='py')
        return self.make_py_thunk(node, storage_map, compute_map, no_recycling)
    def make_node(self, *inputs):
@@ -1195,9 +1202,9 @@ int main( int argc, const char* argv[] )
                self.openmp = False
                theano.config.openmp = False
-    def prepare_node(self, node, storage_map,
+    def prepare_node(self, node, storage_map, compute_map, impl):
-                     compute_map):
+        if impl == 'c':
-        self.update_self_openmp()
+            self.update_self_openmp()
 def simple_meth(tag):

--- a/theano/gof/tests/test_lazy.py
+++ b/theano/gof/tests/test_lazy.py
@@ -25,7 +25,7 @@ class IfElseIfElseIf(PureOp):
        assert t3.type == f3.type
        return Apply(self, [c1, t1, c2, t2, c3, t3, f3], [t1.type()])
-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl):
        input_computed = [compute_map[v] for v in node.inputs]
        output_computed = [compute_map[v] for v in node.outputs]
@@ -93,7 +93,7 @@ class NotImplementedOp(PureOp):
    def make_node(self, x):
        return Apply(self, [x], [x.type()])
-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl):
        def thunk():
            raise self.E()
        thunk.lazy = False

--- a/theano/gof/vm.py
+++ b/theano/gof/vm.py
@@ -1043,16 +1043,14 @@ class VM_Linker(link.LocalLinker):
        t0 = time.time()
        for node in order:
            try:
+                impl = None
                if self.c_thunks is False:
-                    thunks.append(node.op.make_py_thunk(node,
+                    impl = 'py'
-                                                        storage_map,
+                thunks.append(node.op.make_thunk(node,
-                                                        compute_map,
+                                                 storage_map,
-                                                        no_recycling))
+                                                 compute_map,
-                else:
+                                                 no_recycling,
-                    thunks.append(node.op.make_thunk(node,
+                                                 impl=impl))
-                                                     storage_map,
-                                                     compute_map,
-                                                     no_recycling))
                if not hasattr(thunks[-1], 'lazy'):
                    # We don't want all ops maker to think about lazy Ops.
                    # So if they didn't specify that its lazy or not, it isn't.

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -2640,7 +2640,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
    def get_params(self, node):
        return node.outputs[0].type.context
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        # cache the kernel object
        self.get_kernel_cache(node)

--- a/theano/gpuarray/fft.py
+++ b/theano/gpuarray/fft.py
@@ -73,7 +73,7 @@ class CuRFFTOp(Op):
        return theano.Apply(self, [inp, s], [self.output_type(inp)()])
-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
@@ -198,7 +198,7 @@ class CuIRFFTOp(Op):
        return theano.Apply(self, [inp, s], [self.output_type(inp)()])
-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

--- a/theano/ifelse.py
+++ b/theano/ifelse.py
@@ -235,7 +235,7 @@ class IfElse(Op):
                if_true_op(*if_true, **dict(return_list=True)) +
                if_false_op(*if_false, **dict(return_list=True)))
-    def make_py_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
        cond = node.inputs[0]
        ts = node.inputs[1:][:self.n_outs]
        fs = node.inputs[1:][self.n_outs:]

--- a/theano/misc/pycuda_example.py
+++ b/theano/misc/pycuda_example.py
@@ -320,7 +320,7 @@ class PycudaElemwiseSourceModuleMakeThunkOp(Op):
        out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
        return out_node
-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # TODO support broadcast!
        # TODO assert all input have the same shape
        fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -246,7 +246,7 @@ class GpuOp(theano.gof.Op):
    """
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if use.device_number is None:
            use("gpu",
                force=True,

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -2119,7 +2119,7 @@ class GpuConv(GpuOp):
                     images[2] * images[3] * 2)
        return flops
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        super(GpuConv, self).prepare_node(node, storage_map, compute_map, impl)
        if node.op.max_threads_dim0 is None:

--- a/theano/sandbox/cuda/cula.py
+++ b/theano/sandbox/cuda/cula.py
@@ -51,10 +51,7 @@ class GpuSolve(GpuOp):
        assert inp2.ndim == 2
        return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])
-    def make_thunk(self,
+    def make_thunk(self, node, storage_map, _, no_recycling, impl=None):
-                   node,
-                   storage_map, _,
-                   no_recycling=[]):
        # Initialize CULA the first time it is needed
        global cula_initialized

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -1512,8 +1512,9 @@ class GpuDnnPool(DnnBase):
        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
        self.mode = mode
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
-        super(GpuDnnPool, self).prepare_node(node, storage_map, compute_map)
+        super(GpuDnnPool, self).prepare_node(
+            node, storage_map, compute_map, impl)
        if len(node.inputs) == 2:
            warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3)
@@ -1752,7 +1753,7 @@ class GpuDnnPoolGrad(DnnBase):
        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
        self.mode = mode
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if len(node.inputs) == 4:
            warnings.warn("Theano GPUDnnPoolGrad internal changed.", stacklevel=3)
            # Old interface

--- a/theano/sandbox/cuda/extra_ops.py
+++ b/theano/sandbox/cuda/extra_ops.py
@@ -49,20 +49,12 @@ class GpuCumsum(CumsumOp, GpuOp):
        return theano.Apply(self, [x], [x.type()])
-    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
        node_ = copy.copy(node)
        assert node.op is node_.op
        if node_.op.max_threads_dim0 is None or node_.op.max_grid_size1 is None or node_.op.max_grid_size2 is None:
            cuda = theano.sandbox.cuda
            device_id = cuda.use.device_number
-            if device_id is None:
-                cuda.use("gpu",
-                         force=False,
-                         default_to_move_computation_to_gpu=False,
-                         move_shared_float32_to_gpu=False,
-                         enable_cuda=False,
-                         test_driver=True)
-                device_id = cuda.use.device_number
            cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
            prop = cuda_ndarray.device_properties(device_id)
            node_.op.max_threads_dim0 = prop['maxThreadsDim0']
@@ -70,7 +62,7 @@ class GpuCumsum(CumsumOp, GpuOp):
            node_.op.max_grid_size2 = prop['maxGridSize2']
        return super(GpuCumsum, node_.op).make_thunk(node_, storage_map,
-                                                     compute_map, no_recycling)
+                                                     compute_map, no_recycling, impl)
    def __str__(self):
        return "%s{%s}" % (self.__class__.__name__, self.axis)

--- a/theano/sandbox/cuda/fftconv.py
+++ b/theano/sandbox/cuda/fftconv.py
@@ -48,7 +48,7 @@ class ScikitsCudaOp(GpuOp):
        return theano.Apply(self, [inp], [self.output_type(inp)()])
-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        if not scikits_cuda_available:
            raise RuntimeError(
                "scikits.cuda is needed for all GPU fft implementation,"
@@ -61,7 +61,7 @@ class CuFFTOp(ScikitsCudaOp):
        return CudaNdarrayType(
            broadcastable=[False] * (inp.type.ndim + 1))
-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        super(CuFFTOp, self).make_thunk(node, storage_map, _, _2)
        from theano.misc.pycuda_utils import to_gpuarray
@@ -118,7 +118,7 @@ class CuIFFTOp(ScikitsCudaOp):
        return CudaNdarrayType(
            broadcastable=[False] * (inp.type.ndim - 1))
-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        super(CuIFFTOp, self).make_thunk(node, storage_map, _, _2)
        from theano.misc.pycuda_utils import to_gpuarray
@@ -314,7 +314,7 @@ class BatchedComplexDotOp(ScikitsCudaOp):
    def output_type(self, inp):
        return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)
-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, _2, impl=None):
        super(BatchedComplexDotOp, self).make_thunk(node, storage_map, _, _2)
        inputs = [storage_map[v] for v in node.inputs]

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -3664,10 +3664,12 @@ class Composite(ScalarOp):
        #  self.init_name()      # self.name
        self.name = None
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
-        self.init_py_impls()  # self._impls
+        if impl == 'py':
-        for n in theano.gof.graph.list_of_nodes(self.inputs, self.outputs):
+            self.init_py_impls()  # self._impls
-            n.op.prepare_node(n, None, None)
+        elif impl == 'c':
+            for n in theano.gof.graph.list_of_nodes(self.inputs, self.outputs):
+                n.op.prepare_node(n, None, None, impl)
    def output_types(self, input_types):
        if tuple(input_types) != self.inputs_type:

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -698,7 +698,7 @@ class Scan(PureOp):
                     scan_utils.hash_listsDictsTuples(self.info)))
    def make_thunk(self, node, storage_map, compute_map, no_recycling,
-                   python_exec=False):
+                   impl=None):
        """
        Parameters
@@ -716,8 +716,8 @@ class Scan(PureOp):
        no_recycling
            List of variables for which it is forbidden to reuse memory
            allocated by a previous call.
-        python_exec
+        impl
-            If we want python execution.
+            Use 'py' if we want python execution.
        Notes
        -----
        If the thunk consults the storage_map on every call, it is safe
@@ -866,7 +866,7 @@ class Scan(PureOp):
                               for out in self.fn.maker.fgraph.outputs]
        try:
-            if python_exec is True:
+            if impl  == 'py':
                raise theano.gof.cmodule.MissingGXX
            cython_mintaps = numpy.asarray(self.mintaps, dtype='int32')
            cython_tap_array_len = \
@@ -965,13 +965,6 @@ class Scan(PureOp):
        rval.lazy = False
        return rval
-    def make_py_thunk(self, node, storage_map, compute_map, no_recycling):
-        return self.make_thunk(node=node,
-                               storage_map=storage_map,
-                               compute_map=compute_map,
-                               no_recycling=no_recycling,
-                               python_exec=True)
    def inner_seqs(self, list_inputs):
        # Given the list of inner inputs this function grabs those
        # corresponding to sequences

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -297,9 +297,6 @@ class Ger(Op):
    This interface to GER allows non-destructive operation on A via the
    `destructive` argument to the constructor.
-    :TODO: Create better classes ScipyGer and CGer that inherit from this class
-    and override the make_thunk() method to use Scipy and C respectively.
    """
    __props__ = ("destructive",)

--- a/theano/tensor/blas_scipy.py
+++ b/theano/tensor/blas_scipy.py
@@ -22,7 +22,7 @@ if have_fblas:
 class ScipyGer(Ger):
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if impl == 'py':
            node.tag.local_ger = _blas_ger_fns[numpy.dtype(
                node.inputs[0].type.dtype)]

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -787,14 +787,15 @@ second dimension
        return ret
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        # Postpone the ufunc building to the last minutes
        # NumPy ufunc support only up to 31 inputs.
        # But our c code support more.
        if (len(node.inputs) < 32 and
                (self.nfunc is None or
                 self.scalar_op.nin != len(node.inputs)) and
-                self.ufunc is None):
+                self.ufunc is None and
+                impl == 'py'):
            ufunc = numpy.frompyfunc(self.scalar_op.impl,
                                     len(node.inputs),
@@ -830,7 +831,7 @@ second dimension
            [get_scalar_type(dtype=output.type.dtype).make_variable()
             for output in node.outputs])
-        self.scalar_op.prepare_node(node.tag.fake_node, None, None)
+        self.scalar_op.prepare_node(node.tag.fake_node, None, None, impl)
    def perform(self, node, inputs, output_storage):
        if len(node.inputs) >= 32:
@@ -891,13 +892,6 @@ second dimension
            if self.ufunc:
                ufunc = self.ufunc
            else:
-                if not hasattr(node.tag, 'ufunc'):
-                    # It happen that make_thunk isn't called, like in
-                    # get_scalar_constant_value
-                    node.tag.ufunc = numpy.frompyfunc(self.scalar_op.impl,
-                                                      len(node.inputs),
-                                                      self.scalar_op.nout)
                ufunc = node.tag.ufunc
            nout = ufunc.nout
@@ -977,7 +971,7 @@ second dimension
        # To not request all of them to call prepare_node(), do it here.
        # There is no harm if it get called multile time.
        if not hasattr(node.tag, 'fake_node'):
-            self.prepare_node(node, None, None)
+            self.prepare_node(node, None, None, 'c')
        _inames = inames
        _onames = onames

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -6295,15 +6295,12 @@ def constant_folding(node):
    for o in node.outputs:
        storage_map[o] = [None]
        compute_map[o] = [False]
+    impl = None
    if (hasattr(node.op, 'python_constant_folding') and
            node.op.python_constant_folding(node)):
-        thunk = node.op.make_py_thunk(node,
+        impl = 'py'
-                                   storage_map,
+    thunk = node.op.make_thunk(node, storage_map, compute_map,
-                                   compute_map,
+                               no_recycling=[], impl=impl)
-                                   [])
-    else:
-        thunk = node.op.make_thunk(node, storage_map, compute_map,
-                                   no_recycling=[])
    required = thunk()
    assert not required  # a node whose inputs are all provided should always

--- a/theano/tensor/signal/pool.py
+++ b/theano/tensor/signal/pool.py
@@ -241,7 +241,7 @@ class Pool(OpenMPOp):
                " 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode)
        self.mode = mode
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if len(node.inputs) == 1:
            # Old interface
            self.mode = node.op.mode
@@ -686,7 +686,7 @@ class PoolGrad(OpenMPOp):
        self.mode = mode
        super(PoolGrad, self).__init__(openmp=openmp)
-    def prepare_node(self, node, storage_map, compute_map):
+    def prepare_node(self, node, storage_map, compute_map, impl):
        if len(node.inputs) < 5:  # 5 for AveragePoolGrad, 6 for MaxPoolGrad
            # Old interface
            self.mode = node.op.mode