提交 0366c559 authored 作者: Brandon T. Willard's avatar Brandon T. Willard 提交者: Brandon T. Willard

Make Op.perform an abstractmethod and provide Op type hints

This change makes `Op.perform` a mandatory method. Since more than a few `Op`s do not have Python implementations, they've been made to extend `_NoPython*Op` classes that provide an `Op.perform` that simply raises a `NotImplementedError`.
上级 5a1a147d
......@@ -118,6 +118,9 @@ class WeirdBrokenOp(COp):
r = Apply(self, [a_], [a_.type()])
return r
def perform(*args, **kwargs):
raise NotImplementedError()
def dontuse_perform(self, node, inp, out_):
(a,) = inp
(out,) = out_
......
......@@ -41,6 +41,9 @@ class IncOneC(COp):
(z,) = outputs
return f"{z} = {x} + 1;"
def perform(self, *args, **kwargs):
raise NotImplementedError()
class TestComputeTestValue:
def test_destroy_map(self):
......
......@@ -85,6 +85,9 @@ class MyOp(Op):
outputs = [MyVariable(self.name + "_R") for i in range(self.nout)]
return Apply(self, inputs, outputs)
def perform(self, *args, **kwargs):
raise NotImplementedError("No Python implementation available.")
def __str__(self):
return self.name
......
......@@ -58,6 +58,9 @@ class MyOp(Op):
outputs = [MyVariable(sum(input.type.thingy for input in inputs))]
return Apply(self, list(inputs), outputs)
def perform(self, *args, **kwargs):
raise NotImplementedError("No Python implementation available.")
MyOp = MyOp()
......
......@@ -60,6 +60,9 @@ class MyOp(Op):
outputs = [MyType(sum([input.type.thingy for input in inputs]))()]
return Apply(self, inputs, outputs)
def perform(self, *args, **kwargs):
raise NotImplementedError("No Python implementation available.")
MyOp = MyOp()
......@@ -104,6 +107,9 @@ counter%(name)s++;
def c_code_cache_version(self):
return (1,)
def perform(self, *args, **kwargs):
raise NotImplementedError("No Python implementation available.")
class TestOp:
......@@ -206,6 +212,9 @@ class TestMakeThunk:
(z,) = outputs
return f"{z} = {x} + 1;"
def perform(self, *args, **kwargs):
raise NotImplementedError("No Python implementation available.")
i = scalar.int32("i")
o = IncOneC()(i)
......
......@@ -48,6 +48,9 @@ class TestNodeFinder:
def __str__(self):
return self.name
def perform(self, *args, **kwargs):
raise NotImplementedError()
sigmoid = MyOp(1, "Sigmoid")
add = MyOp(2, "Add")
dot = MyOp(2, "Dot")
......
......@@ -39,6 +39,9 @@ Py_INCREF(%(inp)s);
def c_code_cache_version(self):
return (0,)
def perform(self, *args, **kwargs):
raise NotImplementedError()
class GetOp(COp):
__props__ = ()
......@@ -65,6 +68,9 @@ Py_INCREF(%(out)s);
def c_code_cache_version(self):
return (0,)
def perform(self, *args, **kwargs):
raise NotImplementedError()
@pytest.mark.skipif(
not theano.config.cxx, reason="G++ not available, so we need to skip this test."
......@@ -192,6 +198,9 @@ class MyOpCEnumType(COp):
def make_node(self):
return Apply(self, [], [scalar.uint32()])
def perform(self, *args, **kwargs):
raise NotImplementedError()
def c_code_cache_version(self):
return (3,)
......
......@@ -4,7 +4,6 @@ import pytest
import theano
from theano import config, tensor
from theano.gof.graph import Apply
from theano.gof.op import Op
from theano.gof.params_type import ParamsType
from theano.gpuarray.basic_ops import CGpuKernelBase
from theano.gpuarray.type import GpuArrayType, get_context, gpu_context_type
......@@ -12,11 +11,11 @@ from theano.gradient import grad_undefined
from theano.scalar import int32 as int_t
# This is an implementation to test that CGpuKernelBase works and also
# to use as an example in the docs. It is not used for user graphs.
class GpuEye(CGpuKernelBase, Op):
"""
Eye for GPU.
class GpuEye(CGpuKernelBase):
"""Eye for GPU.
This is an implementation to test that `CGpuKernelBase` works and also
to use as an example in the docs. It is not used for user graphs.
"""
......@@ -28,9 +27,7 @@ class GpuEye(CGpuKernelBase, Op):
dtype = config.floatX
self.dtype = dtype
self.context_name = context_name
CGpuKernelBase.__init__(
self, ["c_code/tstgpueye.c"], "APPLY_SPECIFIC(tstgpueye)"
)
super().__init__(["c_code/tstgpueye.c"], "APPLY_SPECIFIC(tstgpueye)")
def get_params(self, node):
pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray")
......
......@@ -984,6 +984,9 @@ class ApplyDefaultTestOp(Op):
x = tt.as_tensor_variable(x)
return Apply(self, [x], [x.type()])
def perform(self, *args, **kwargs):
raise NotImplementedError()
def test_constant():
int8_vector_type = tt.TensorType(dtype="int8", broadcastable=(False,))
......@@ -3862,6 +3865,9 @@ class TestGrad:
gz0, gz1 = grads
return self.gval0, self.gval1
def perform(self, *args, **kwargs):
raise NotImplementedError()
def test_1param(self):
# grad: Test passing a single variable param
o = TestGrad.Obj1()
......
......@@ -38,6 +38,9 @@ class MyOp(Op):
outputs = [MyType()()]
return Apply(self, inputs, outputs)
def perform(self, *args, **kwargs):
raise NotImplementedError("No Python implementation available.")
def __str__(self):
return self.name
......
......@@ -51,6 +51,9 @@ class TestGradSourcesInputs:
(x,) = inp
(gz,) = grads
def perform(self, *args, **kwargs):
raise NotImplementedError()
a = retNone().make_node()
with pytest.raises(TypeError):
grad_sources_inputs([(a.out, one)], None)
......@@ -68,6 +71,9 @@ class TestGradSourcesInputs:
def grad(self, inputs, grads):
return [inputs[0].zeros_like()]
def perform(self, *args, **kwargs):
raise NotImplementedError()
i = theano.tensor.vector()
j = theano.tensor.vector()
a1 = retOne().make_node(i)
......@@ -91,6 +97,9 @@ class TestGradSourcesInputs:
def grad(self, inp, grads):
return (gval,)
def perform(self, *args, **kwargs):
raise NotImplementedError()
a1 = TestOp().make_node()
g = grad_sources_inputs([(a1.outputs[0], one)], None)
assert g[a1.inputs[0]] is gval
......@@ -112,6 +121,9 @@ class TestGradSourcesInputs:
gz1, gz2 = grads
return (gval,)
def perform(self, *args, **kwargs):
raise NotImplementedError()
a1 = TestOp().make_node()
g = grad_sources_inputs([(a1.outputs[0], one)], None)
assert g[a1.inputs[0]] is gval
......@@ -134,6 +146,9 @@ class TestGradSourcesInputs:
(gz,) = grads
return (gval0, gval1)
def perform(self, *args, **kwargs):
raise NotImplementedError()
a1 = TestOp().make_node()
g = grad_sources_inputs([(a1.outputs[0], one)], None)
assert g[a1.inputs[0]] is gval0
......@@ -155,6 +170,9 @@ class TestGradSourcesInputs:
def grad(self, inp, grads):
return gval0, gval1
def perform(self, *args, **kwargs):
raise NotImplementedError()
a1 = TestOp().make_node()
g = grad_sources_inputs([(a1.outputs[0], one)], None)
assert g[a1.inputs[0]] is gval0
......@@ -190,6 +208,9 @@ class TestGrad:
def grad(self, inputs, output_grads):
return [theano.gradient.grad_not_implemented(self, 0, inputs[0])]
def perform(self, *args, **kwargs):
raise NotImplementedError()
a = theano.tensor.scalar()
b = DummyOp()(a)
......@@ -208,6 +229,9 @@ class TestGrad:
def grad(self, inputs, output_grads):
return [theano.gradient.grad_undefined(self, 0, inputs[0])]
def perform(self, *args, **kwargs):
raise NotImplementedError()
a = theano.tensor.scalar()
b = DummyOp()(a)
......@@ -380,6 +404,9 @@ class TestGrad:
def grad(self, inputs, output_grads):
return [inputs[0].zeros_like()]
def perform(self, *args, **kwargs):
raise NotImplementedError()
# Op2 has two inputs, f and g
# Its gradient with respect to g is not defined
class Op2(Op):
......@@ -391,6 +418,9 @@ class TestGrad:
def grad(self, inputs, output_grads):
return [inputs[0].zeros_like(), NullType()()]
def perform(self, *args, **kwargs):
raise NotImplementedError()
x = theano.tensor.vector()
f, g = Op1()(x)
cost = Op2()(f, g)
......
......@@ -581,6 +581,9 @@ class IfElseIfElseIf(Op):
thunk.lazy = True
return thunk
def perform(self, *args, **kwargs):
raise NotImplementedError()
class NotImplementedOpException(Exception):
pass
......@@ -597,6 +600,9 @@ class NotImplementedOp(Op):
thunk.lazy = False
return thunk
def perform(self, *args, **kwargs):
raise NotImplementedError()
def test_ifelse():
a = tt.scalar()
......
......@@ -10,8 +10,22 @@ import inspect
import os
import re
import sys
import typing
import warnings
from abc import abstractmethod
from typing import (
Any,
Callable,
ClassVar,
Dict,
List,
NoReturn,
Optional,
Pattern,
Set,
Text,
Tuple,
Union,
)
import numpy as np
......@@ -19,7 +33,7 @@ import theano
from theano.configdefaults import config
from theano.gof.fg import FunctionGraph
from theano.gof.graph import Apply, NoParams, Variable
from theano.gof.params_type import ParamsType
from theano.gof.params_type import Params, ParamsType
from theano.gof.utils import (
MetaObject,
MethodNotDefined,
......@@ -30,15 +44,22 @@ from theano.gof.utils import (
from theano.link.c.interface import CLinkerOp
__authors__ = "theano-dev"
# NOTE: the previous form used implicit string concatenation
# ("theano-dev" "PyMC Developers"), which produced the run-together
# value "theano-devPyMC Developers"; use an explicit separator.
__authors__ = "theano-dev, PyMC Developers"
__copyright__ = "(c) 2010, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"
# Type aliases shared by the thunk/perform machinery below.
StorageMapType = List[Optional[List[Any]]]
ComputeMapType = List[bool]
OutputStorageType = List[Optional[List[Any]]]
# A params tuple can have any arity, so the tuple type must be variadic
# (``Tuple[Any]`` would mean "a 1-tuple").
ParamsInputType = Optional[Tuple[Any, ...]]
# ``Op.perform`` writes results into ``output_storage`` and returns ``None``.
# ``NoReturn`` would (incorrectly) declare that it never returns normally,
# i.e. that it always raises.
PerformMethodType = Callable[
    [Apply, List[Any], OutputStorageType, ParamsInputType], None
]
ThunkType = Callable[[PerformMethodType, StorageMapType, ComputeMapType, Apply], Any]
def compute_test_value(node):
def compute_test_value(node: Apply):
"""Computes the test value of a node.
Parameters
......@@ -149,7 +170,7 @@ class Op(MetaObject):
"""
def make_node(self, *inputs) -> Apply:
def make_node(self, *inputs: Variable) -> Apply:
"""Construct an `Apply` node that represent the application of this operation to the given inputs.
This must be implemented by sub-classes.
......@@ -182,9 +203,7 @@ class Op(MetaObject):
)
return Apply(self, inputs, [o() for o in self.otypes])
def __call__(
self, *inputs, **kwargs
) -> typing.Union[Variable, typing.List[Variable],]:
def __call__(self, *inputs: Any, **kwargs) -> Union[Variable, List[Variable]]:
"""Construct an `Apply` node using `self.make_node` and return its outputs.
This method is just a wrapper around `Op.make_node`.
......@@ -246,14 +265,16 @@ class Op(MetaObject):
else:
return node.outputs
def __ne__(self, other):
def __ne__(self, other: Any) -> bool:
return not (self == other)
# Convenience so that subclass implementers don't have to import utils
# just to self.add_tag_trace
add_tag_trace = staticmethod(add_tag_trace)
def grad(self, inputs, output_grads):
def grad(
self, inputs: List[Variable], output_grads: List[Variable]
) -> List[Variable]:
"""Construct a graph for the gradient with respect to each input variable.
Each returned `Variable` represents the gradient with respect to that
......@@ -277,7 +298,12 @@ class Op(MetaObject):
"""
raise NotImplementedError()
def L_op(self, inputs, outputs, output_grads):
def L_op(
self,
inputs: List[Variable],
outputs: List[Variable],
output_grads: List[Variable],
) -> List[Variable]:
r"""Construct a graph for the L-operator.
This method is primarily used by `tensor.Lop` and dispatches to
......@@ -298,7 +324,9 @@ class Op(MetaObject):
"""
return self.grad(inputs, output_grads)
def R_op(self, inputs, eval_points):
def R_op(
self, inputs: List[Variable], eval_points: Union[Variable, List[Variable]]
) -> List[Variable]:
"""Construct a graph for the R-operator.
This method is primarily used by tensor.Rop
......@@ -325,10 +353,15 @@ class Op(MetaObject):
"""
raise NotImplementedError()
def perform(self, node, inputs, output_storage, params=None):
"""
Required: Calculate the function on the inputs and put the variables in
the output storage. Return None.
@abstractmethod
def perform(
self,
node: Apply,
inputs: List[Variable],
output_storage: OutputStorageType,
params: ParamsInputType = None,
) -> NoReturn:
"""Calculate the function on the inputs and put the variables in the output storage.
Parameters
----------
......@@ -358,21 +391,9 @@ class Op(MetaObject):
A `Op` is free to reuse `output_storage` as it sees fit, or to
discard it and allocate new memory.
Raises
------
MethodNotDefined
The subclass does not override this method.
"""
raise MethodNotDefined(
"perform",
type(self),
self.__class__.__name__,
"Did you used Theano flags mode=FAST_COMPILE?"
" You can use optimizer=fast_compile instead.",
)
def do_constant_folding(self, fgraph: FunctionGraph, node: Apply):
def do_constant_folding(self, fgraph: FunctionGraph, node: Apply) -> bool:
"""Determine whether or not constant folding should be performed for the given node.
This allows each `Op` to determine if it wants to be constant
......@@ -393,9 +414,8 @@ class Op(MetaObject):
"""
return True
# We add a default get_params() implementation which will try to detect params from the op
# if params_type is set to a ParamsType. If not, we raise a MethodNotDefined exception.
def get_params(self, node):
def get_params(self, node: Apply) -> Params:
"""Try to detect params from the op if `Op.params_type` is set to a `ParamsType`."""
if hasattr(self, "params_type") and isinstance(self.params_type, ParamsType):
wrapper = self.params_type
if not all(hasattr(self, field) for field in wrapper.fields):
......@@ -410,10 +430,14 @@ class Op(MetaObject):
return self.params_type.get_params(self)
raise MethodNotDefined("get_params")
def prepare_node(self, node, storage_map, compute_map, impl):
"""
Make any special modifications that the Op needs before doing
make_thunk().
def prepare_node(
self,
node: Apply,
storage_map: StorageMapType,
compute_map: ComputeMapType,
impl: Optional[Text],
) -> NoReturn:
"""Make any special modifications that the Op needs before doing `Op.make_thunk`.
This can modify the node inplace and should return nothing.
......@@ -423,9 +447,17 @@ class Op(MetaObject):
"""
def make_py_thunk(self, node, storage_map, compute_map, no_recycling, debug=False):
"""
Like make_thunk() but only makes python thunks.
def make_py_thunk(
self,
node: Apply,
storage_map: StorageMapType,
compute_map: ComputeMapType,
no_recycling: bool,
debug: bool = False,
) -> ThunkType:
"""Make a Python thunk.
Like `Op.make_thunk` but only makes python thunks.
"""
node_input_storage = [storage_map[r] for r in node.inputs]
......@@ -467,7 +499,14 @@ class Op(MetaObject):
rval.lazy = False
return rval
def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
def make_thunk(
self,
node: Apply,
storage_map: StorageMapType,
compute_map: ComputeMapType,
no_recycling: bool,
impl: Optional[Text] = None,
) -> ThunkType:
"""Create a thunk.
This function must return a thunk, that is a zero-arguments
......@@ -513,8 +552,18 @@ class Op(MetaObject):
class COp(Op, CLinkerOp):
"""An `Op` with a C implementation."""
def make_c_thunk(self, node, storage_map, compute_map, no_recycling):
"""Like make_thunk, but will only try to make a C thunk."""
def make_c_thunk(
self,
node: Apply,
storage_map: StorageMapType,
compute_map: ComputeMapType,
no_recycling: bool,
) -> ThunkType:
"""Create a thunk for a C implementation.
Like `Op.make_thunk`, but will only try to make a C thunk.
"""
# FIXME: Putting the following import on the module level causes an import cycle.
# The conclusion should be that the entire "make_c_thunk" method should be defined
# in theano.link.c and dispatched onto the Op!
......@@ -593,7 +642,7 @@ class COp(Op, CLinkerOp):
)
def get_test_value(v):
def get_test_value(v: Variable) -> Any:
"""Get the test value for `v`.
If input `v` is not already a variable, it is turned into one by calling
......@@ -610,7 +659,7 @@ def get_test_value(v):
return v.get_test_value()
def missing_test_message(msg):
def missing_test_message(msg: Text) -> NoReturn:
"""
Displays msg, a message saying that some test_value is missing,
in the appropriate form based on config.compute_test_value:
......@@ -635,8 +684,9 @@ def missing_test_message(msg):
assert action in ["ignore", "off"]
def get_test_values(*args):
"""
def get_test_values(*args: Variable) -> Union[Any, List[Any]]:
"""Get test values for multiple `Variable`s.
Intended use:
for val_1, ..., val_n in get_debug_values(var_1, ..., var_n):
......@@ -681,7 +731,7 @@ def get_test_values(*args):
return [tuple(rval)]
ops_with_inner_function = {}
ops_with_inner_function: Dict[Op, Text] = {}
"""
Registry of Ops that have an inner compiled Theano function.
......@@ -711,18 +761,18 @@ class OpenMPOp(COp):
"""
gxx_support_openmp = None
gxx_support_openmp: Optional[bool] = None
"""
True/False after we tested this.
"""
def __init__(self, openmp=None):
def __init__(self, openmp: Optional[bool] = None):
if openmp is None:
openmp = config.openmp
self.openmp = openmp
def __setstate__(self, d):
def __setstate__(self, d: Dict):
self.__dict__.update(d)
# If we unpickle old op
if not hasattr(self, "openmp"):
......@@ -748,9 +798,7 @@ class OpenMPOp(COp):
@staticmethod
def test_gxx_support():
"""
Check if openMP is supported
"""
"""Check if openMP is supported."""
from theano.link.c.cmodule import GCC_compiler
code = """
......@@ -769,7 +817,7 @@ int main( int argc, const char* argv[] )
)
return default_openmp
def update_self_openmp(self):
def update_self_openmp(self) -> NoReturn:
"""
Make sure self.openmp is not True if there is no support in gxx.
......@@ -797,21 +845,60 @@ int main( int argc, const char* argv[] )
self.update_self_openmp()
def lquote_macro(txt: Text) -> Text:
    """Append a ``\\`` line-continuation to every line of `txt` except the last.

    This makes a multi-line snippet usable as the body of a C ``#define``.
    """
    lines = txt.split("\n")
    continued = [line + " \\" for line in lines[:-1]]
    return "\n".join(continued + lines[-1:])
def get_sub_macros(sub: Dict[Text, Text]) -> Tuple[Text, Text]:
    """Construct C ``#define``/``#undef`` strings for the entries in `sub`.

    Parameters
    ----------
    sub
        A substitution dictionary; must contain a ``"fail"`` entry and may
        contain a ``"params"`` entry.

    Returns
    -------
    A pair of strings: the newline-joined ``#define`` lines and the
    newline-joined ``#undef`` lines.

    """
    # ``fail`` is multi-line C code, so it needs line continuations to be
    # usable inside a macro definition.
    define_macros = [f"#define FAIL {lquote_macro(sub['fail'])}"]
    undef_macros = ["#undef FAIL"]
    if "params" in sub:
        define_macros.append(f"#define PARAMS {sub['params']}")
        undef_macros.append("#undef PARAMS")
    return "\n".join(define_macros), "\n".join(undef_macros)
def get_io_macros(inputs: List[Text], outputs: List[Text]) -> Tuple[Text, Text]:
    """Construct C ``#define``/``#undef`` strings for input/output variable names.

    Parameters
    ----------
    inputs
        C names of the input variables, in positional order.
    outputs
        C names of the output variables, in positional order.

    Returns
    -------
    A pair of strings: the newline-joined ``#define`` lines
    (``INPUT_<i>``/``OUTPUT_<i>``) and the newline-joined ``#undef`` lines.

    """
    define_macros = []
    undef_macros = []
    # `enumerate` indices are already ints, so no conversion is needed.
    for i, inp in enumerate(inputs):
        define_macros.append(f"#define INPUT_{i} {inp}")
        undef_macros.append(f"#undef INPUT_{i}")
    for i, out in enumerate(outputs):
        define_macros.append(f"#define OUTPUT_{i} {out}")
        undef_macros.append(f"#undef OUTPUT_{i}")
    return "\n".join(define_macros), "\n".join(undef_macros)
class ExternalCOp(COp):
"""
Class to allow an op to have an external C implementation.
"""Class for an `Op` with an external C implementation.
An op can use this class by inheriting from it and calling its
__init__() method, providing it with a path to an external file containing
the C implementation and the name of the function, in that file, to call
to perform the computations for the op.
One can inherit from this class, provide its constructor with a path to
an external C source file and the name of a function within it, and define
an `Op` for said function.
"""
section_re = re.compile(r"^#section ([a-zA-Z0-9_]+)$", re.MULTILINE)
backward_re = re.compile(r"^THEANO_(APPLY|SUPPORT)_CODE_SECTION$", re.MULTILINE)
section_re: ClassVar[Pattern] = re.compile(
r"^#section ([a-zA-Z0-9_]+)$", re.MULTILINE
)
backward_re: ClassVar[Pattern] = re.compile(
r"^THEANO_(APPLY|SUPPORT)_CODE_SECTION$", re.MULTILINE
)
# This is the set of allowed markers
SECTIONS = {
SECTIONS: ClassVar[Set[Text]] = {
"init_code",
"init_code_apply",
"init_code_struct",
......@@ -824,11 +911,10 @@ class ExternalCOp(COp):
}
@classmethod
def get_path(cls, f):
"""
Convert a path relative to the location of the class file into
an aboslute path. Paths that are already absolute are passed
through unchanged.
def get_path(cls, f: Text) -> Text:
"""Convert a path relative to the location of the class file into an absolute path.
Paths that are already absolute are passed through unchanged.
"""
if not os.path.isabs(f):
......@@ -837,7 +923,9 @@ class ExternalCOp(COp):
f = os.path.realpath(os.path.join(class_dir, f))
return f
def __init__(self, func_files, func_name=None):
def __init__(
self, func_files: Union[Text, List[Text]], func_name: Optional[Text] = None
):
"""
Sections are loaded from files in order with sections in later
files overriding sections in previous files.
......@@ -868,10 +956,8 @@ class ExternalCOp(COp):
"and specify the func_name"
)
def load_c_code(self, func_files):
"""
Loads the c code to perform the Op
"""
def load_c_code(self, func_files: List[Text]) -> NoReturn:
"""Loads the C code to perform the `Op`."""
func_files = [self.get_path(f) for f in func_files]
self.func_codes = []
for func_file in func_files:
......@@ -940,10 +1026,8 @@ class ExternalCOp(COp):
f"No valid section marker was found in file {func_files[i]}"
)
def __get_op_params(self):
"""
Returns a list of (name, value) pairs that will be turned into
macros for use within the op code.
def __get_op_params(self) -> List[Text]:
"""Construct name, value pairs that will be turned into macros for use within the `Op`'s code.
The names must be strings that are not a C keyword and the
values must be strings of literal C representations.
......@@ -1031,10 +1115,12 @@ class ExternalCOp(COp):
else:
return super().c_cleanup_code_struct(node, name)
def format_c_function_args(self, inp, out):
# Generate an string containing the arguments sent to the external C
# function. The argstring will be of format :
# "input0, input1, input2, &output0, &output1"
def format_c_function_args(self, inp: List[Text], out: List[Text]) -> Text:
"""Generate a string containing the arguments sent to the external C function.
The result will have the format: ``"input0, input1, input2, &output0, &output1"``.
"""
inp = list(inp)
numi = getattr(self, "_cop_num_inputs", len(inp))
while len(inp) < numi:
......@@ -1045,7 +1131,10 @@ class ExternalCOp(COp):
out.append("NULL")
return ", ".join(inp + out)
def get_c_macros(self, node, name, check_input=None):
def get_c_macros(
self, node: Apply, name: Text, check_input: Optional[bool] = None
) -> Tuple[Text]:
"Construct a pair of C ``#define`` and ``#undef`` code strings."
define_template = "#define %s %s"
undef_template = "#undef %s"
define_macros = []
......@@ -1097,37 +1186,6 @@ class ExternalCOp(COp):
return "\n".join(define_macros), "\n".join(undef_macros)
def _lquote_macro(self, txt):
res = []
spl = txt.split("\n")
for l in spl[:-1]:
res.append(l + " \\")
res.append(spl[-1])
return "\n".join(res)
def get_sub_macros(self, sub):
define_macros = []
undef_macros = []
define_macros.append(f"#define FAIL {self._lquote_macro(sub['fail'])}")
undef_macros.append("#undef FAIL")
if "params" in sub:
define_macros.append(f"#define PARAMS {sub['params']}")
undef_macros.append("#undef PARAMS")
return "\n".join(define_macros), "\n".join(undef_macros)
def get_io_macros(self, inputs, outputs):
define_macros = []
undef_macros = []
for i, inp in enumerate(inputs):
define_macros.append(f"#define INPUT_{int(i)} {inp}")
undef_macros.append(f"#undef INPUT_{int(i)}")
for i, out in enumerate(outputs):
define_macros.append(f"#define OUTPUT_{int(i)} {inp}")
undef_macros.append(f"#undef OUTPUT_{int(i)}")
def c_init_code_struct(self, node, name, sub):
"""
Stitches all the macros and "init_code" together
......@@ -1137,7 +1195,7 @@ class ExternalCOp(COp):
op_code = self.code_sections["init_code_struct"]
def_macros, undef_macros = self.get_c_macros(node, name)
def_sub, undef_sub = self.get_sub_macros(sub)
def_sub, undef_sub = get_sub_macros(sub)
return "\n".join(
["", def_macros, def_sub, op_code, undef_sub, undef_macros]
......@@ -1179,8 +1237,8 @@ class ExternalCOp(COp):
op_code = self.code_sections["code"]
def_macros, undef_macros = self.get_c_macros(node, name)
def_sub, undef_sub = self.get_sub_macros(sub)
def_io, undef_io = self.get_io_macros(inp, out)
def_sub, undef_sub = get_sub_macros(sub)
def_io, undef_io = get_io_macros(inp, out)
return "\n".join(
[
......@@ -1204,8 +1262,8 @@ class ExternalCOp(COp):
op_code = self.code_sections["code_cleanup"]
def_macros, undef_macros = self.get_c_macros(node, name)
def_sub, undef_sub = self.get_sub_macros(sub)
def_io, undef_io = self.get_io_macros(inputs, outputs)
def_sub, undef_sub = get_sub_macros(sub)
def_io, undef_io = get_io_macros(inputs, outputs)
return "\n".join(
[
......@@ -1220,3 +1278,38 @@ class ExternalCOp(COp):
)
else:
return super().c_code_cleanup(node, name, inputs, outputs, sub)
class _NoPythonOp(Op):
    """Marker base for an `Op` that lacks a Python implementation.

    XXX: Do not use this class; it's only for tracking bad implementations internally.

    """

    def perform(self, node, inputs, output_storage, params=None):
        # Satisfies the abstract-method requirement without providing a
        # real Python implementation.
        msg = "No Python implementation is provided by this Op."
        raise NotImplementedError(msg)
class _NoPythonCOp(COp):
    """Marker base for a `COp` that lacks a Python implementation.

    XXX: Do not use this class; it's only for tracking bad implementations internally.

    """

    def perform(self, node, inputs, output_storage, params=None):
        # Satisfies the abstract-method requirement without providing a
        # real Python implementation.
        msg = "No Python implementation is provided by this COp."
        raise NotImplementedError(msg)
class _NoPythonExternalCOp(ExternalCOp):
    """Marker base for an `ExternalCOp` that lacks a Python implementation.

    XXX: Do not use this class; it's only for tracking bad implementations internally.

    """

    def perform(self, node, inputs, output_storage, params=None):
        # Satisfies the abstract-method requirement without providing a
        # real Python implementation.
        msg = "No Python implementation is provided by this ExternalCOp."
        raise NotImplementedError(msg)
......@@ -9,7 +9,7 @@ import theano
from theano import tensor
from theano.configdefaults import config
from theano.gof.graph import Apply, Variable
from theano.gof.op import COp, ExternalCOp, Op
from theano.gof.op import COp, ExternalCOp, Op, _NoPythonOp
from theano.gof.opt import copy_stack_trace
from theano.gof.params_type import ParamsType
from theano.gof.type import CType
......@@ -493,6 +493,14 @@ int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
return (9,)
class GpuKernelBaseCOp(GpuKernelBase, COp):
    """Combine `GpuKernelBase` with the `COp` interface."""

    pass
class GpuKernelBaseExternalCOp(GpuKernelBase, ExternalCOp):
    """Combine `GpuKernelBase` with the `ExternalCOp` interface."""

    pass
def forward_string_meth(name):
def f(*args):
res = getattr(GpuKernelBase, name)(*args)
......@@ -517,7 +525,7 @@ def get_dtype(s):
return np.dtype(s)
class CGpuKernelBase(ExternalCOp, GpuKernelBase):
class CGpuKernelBase(GpuKernelBaseExternalCOp, _NoPythonOp):
"""
Class to combine GpuKernelBase and ExternalCOp.
......@@ -1498,7 +1506,7 @@ class GpuJoin(HideC, Join):
gpu_join = GpuJoin()
class GpuSplit(HideC, Split):
class GpuSplit(HideC, Split, _NoPythonOp):
"""
Split for GPU.
......@@ -1748,7 +1756,7 @@ def profile_printer(
print("", file=file)
class GpuEye(GpuKernelBase, Op):
class GpuEye(GpuKernelBaseCOp, _NoPythonOp):
"""
Eye for GPU.
......@@ -1882,7 +1890,7 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
return (10,)
class GpuTri(GpuKernelBase, Op):
class GpuTri(GpuKernelBaseCOp, _NoPythonOp):
"""
Tri for GPU.
......
import theano
from theano.compile import optdb
from theano.gof.graph import Apply
from theano.gof.op import COp
from theano.gof.op import _NoPythonCOp
from theano.gof.opt import LocalOptGroup
from theano.gof.params_type import ParamsType
from theano.scalar import bool as bool_t
......@@ -27,7 +27,7 @@ except ImportError:
pass
class BlasOp(COp):
class BlasOp(_NoPythonCOp):
def c_headers(self, **kwargs):
return ["<blas_api.h>", "<numpy_compat.h>", "<gpuarray_helper.h>"]
......@@ -412,7 +412,7 @@ class GpuDot22(BlasOp):
gpu_dot22 = GpuDot22()
class GpuGemmBatch(BlasOp):
class GpuGemmBatch(BlasOp, _NoPythonCOp):
params_type = ParamsType(inplace=bool_t)
__props__ = ("inplace",)
_f16_ok = True
......@@ -1009,7 +1009,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
)
class GpuCorrMM(BaseGpuCorrMM):
class GpuCorrMM(BaseGpuCorrMM, _NoPythonCOp):
"""
GPU correlation implementation using Matrix Multiplication.
......@@ -1129,7 +1129,7 @@ class GpuCorrMM(BaseGpuCorrMM):
return d_bottom, d_weights
class GpuCorrMM_gradWeights(BaseGpuCorrMM):
class GpuCorrMM_gradWeights(BaseGpuCorrMM, _NoPythonCOp):
"""
Gradient wrt. filters for `GpuCorrMM`.
......@@ -1235,7 +1235,7 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
return [[1], [1], [0], [0]] # no connection to height, width
class GpuCorrMM_gradInputs(BaseGpuCorrMM):
class GpuCorrMM_gradInputs(BaseGpuCorrMM, _NoPythonCOp):
"""
Gradient wrt. inputs for `GpuCorrMM`.
......@@ -1337,7 +1337,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
return [[1], [1], [0], [0]] # no connection to height, width
class BaseGpuCorr3dMM(CGpuKernelBase):
class BaseGpuCorr3dMM(CGpuKernelBase, _NoPythonCOp):
"""
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly.
......@@ -1777,7 +1777,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
)
class GpuCorr3dMM(BaseGpuCorr3dMM):
class GpuCorr3dMM(BaseGpuCorr3dMM, _NoPythonCOp):
"""
GPU correlation implementation using Matrix Multiplication.
......@@ -1881,7 +1881,7 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
return d_bottom, d_weights
class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM, _NoPythonCOp):
"""
Gradient wrt. filters for `GpuCorr3dMM`.
......@@ -1970,7 +1970,7 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
return [[1], [1], [0], [0], [0]] # no connection to height, width, depth
class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM, _NoPythonCOp):
"""
Gradient wrt. inputs for `GpuCorr3dMM`.
......
......@@ -4,7 +4,7 @@ import numpy as np
from theano import tensor
from theano.gof.graph import Apply
from theano.gof.op import ExternalCOp
from theano.gof.op import _NoPythonExternalCOp
from theano.gof.params_type import ParamsType
from theano.gradient import grad_undefined
from theano.scalar import bool as bool_t
......@@ -17,7 +17,7 @@ from .type import gpu_context_type
_logger = logging.getLogger("theano.gpuarray.blocksparse")
class GpuSparseBlockGemv(ExternalCOp):
class GpuSparseBlockGemv(_NoPythonExternalCOp):
"""
GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
information.
......@@ -32,7 +32,7 @@ class GpuSparseBlockGemv(ExternalCOp):
# NB: DTYPE_INPUT_* is used in C code, so I think we should not set check_input to False.
def __init__(self, inplace=False):
ExternalCOp.__init__(self, "c_code/blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
super().__init__("c_code/blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
......@@ -92,7 +92,7 @@ gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class GpuSparseBlockOuter(ExternalCOp):
class GpuSparseBlockOuter(_NoPythonExternalCOp):
"""
GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
information.
......@@ -106,7 +106,7 @@ class GpuSparseBlockOuter(ExternalCOp):
params_type = ParamsType(inplace=bool_t, context=gpu_context_type)
def __init__(self, inplace=False):
ExternalCOp.__init__(self, ["c_code/blockger.c"], "APPLY_SPECIFIC(blockger)")
super().__init__(["c_code/blockger.c"], "APPLY_SPECIFIC(blockger)")
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
......
......@@ -4,7 +4,7 @@ import sys
import theano.tensor as tt
from theano.configdefaults import config
from theano.gof.graph import Apply
from theano.gof.op import ExternalCOp
from theano.gof.op import _NoPythonExternalCOp
from theano.gof.opt import local_optimizer
from theano.gpuarray import pygpu
from theano.gpuarray.basic_ops import (
......@@ -20,7 +20,7 @@ from theano.tensor.nnet.ctc import ctc_available
from theano.tensor.opt import register_canonicalize
class GpuConnectionistTemporalClassification(ExternalCOp):
class GpuConnectionistTemporalClassification(_NoPythonExternalCOp):
"""
GPU wrapper for Baidu CTC loss function.
......
......@@ -12,7 +12,7 @@ from theano import tensor
from theano.compile.ops import shape_i, shape_i_op
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME, config
from theano.gof.graph import Apply, Variable
from theano.gof.op import COp, ExternalCOp
from theano.gof.op import ExternalCOp, _NoPythonCOp, _NoPythonExternalCOp
from theano.gof.params_type import ParamsType
from theano.gof.type import CDataType, EnumList, Generic
from theano.gpuarray import cudnn_defs, pygpu
......@@ -302,7 +302,7 @@ class MakerCDataType(CDataType):
return self._get_func()(ptr)
class CDataMaker(COp):
class CDataMaker(_NoPythonCOp):
"""This is the equally lame `Op` that accompanies `MakerCDataType`."""
__props__ = ("rtype",)
......@@ -350,7 +350,7 @@ def CUDNNDataType(name, freefunc=None):
)
class DnnVersion(COp):
class DnnVersion(_NoPythonCOp):
__props__ = ()
def c_headers(self, **kwargs):
......@@ -460,7 +460,7 @@ def get_precision(precision, inputs, for_grad=False):
return precision, common_dtype
class DnnBase(ExternalCOp):
class DnnBase(_NoPythonExternalCOp):
"""
Creates a handle for cudnn and pulls in the cudnn libraries and headers.
......@@ -496,7 +496,7 @@ class DnnBase(ExternalCOp):
def __init__(self, files=None, c_func=None):
if files is None:
files = []
ExternalCOp.__init__(self, ["c_code/dnn_base.c"] + files, c_func)
super().__init__(["c_code/dnn_base.c"] + files, c_func)
def c_headers(self, **kwargs):
return [
......@@ -535,7 +535,7 @@ class DnnBase(ExternalCOp):
return (super().c_code_cache_version(), version(), 4)
class GpuDnnConvDesc(ExternalCOp):
class GpuDnnConvDesc(_NoPythonExternalCOp):
"""
This Op builds a convolution descriptor for use in the other convolution
......@@ -607,7 +607,7 @@ class GpuDnnConvDesc(ExternalCOp):
precision="float32",
num_groups=1,
):
ExternalCOp.__init__(self, ["c_code/conv_desc.c"], "APPLY_SPECIFIC(conv_desc)")
super().__init__(["c_code/conv_desc.c"], "APPLY_SPECIFIC(conv_desc)")
if version() < 6000 and any([d != 1 for d in dilation]):
raise RuntimeError("Dilation > 1 not supported for cuDNN version < 6.")
......@@ -756,8 +756,7 @@ class GpuDnnConv(DnnBase):
)
def __init__(self, algo=None, inplace=False, num_groups=1):
DnnBase.__init__(
self,
super().__init__(
["c_code/dnn_conv_base.c", "c_code/dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)",
)
......@@ -918,8 +917,7 @@ class GpuDnnConvGradW(DnnBase):
)
def __init__(self, inplace=False, algo=None, num_groups=1):
DnnBase.__init__(
self,
super().__init__(
["c_code/dnn_conv_base.c", "c_code/dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)",
)
......@@ -1088,8 +1086,7 @@ class GpuDnnConvGradI(DnnBase):
)
def __init__(self, inplace=False, algo=None, num_groups=1):
DnnBase.__init__(
self,
super().__init__(
["c_code/dnn_conv_base.c", "c_code/dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)",
)
......@@ -1767,7 +1764,7 @@ def dnn_gradinput3d(
)
class GpuDnnPoolDesc(COp):
class GpuDnnPoolDesc(_NoPythonCOp):
"""
This Op builds a pooling descriptor for use in the other
pooling operations.
......@@ -1911,7 +1908,7 @@ class GpuDnnPoolBase(DnnBase):
params_type = ParamsType(mode=cudnn.cudnnPoolingMode_t, handle=handle_type)
def __init__(self, mode="max"):
DnnBase.__init__(self, [self.c_file], self.c_function)
super().__init__([self.c_file], self.c_function)
if mode == "average":
mode = "average_inc_pad"
# Supported modes depend on runtime cuDNN version.
......@@ -2114,7 +2111,7 @@ class GpuDnnSoftmaxBase(DnnBase):
)
def __init__(self, algo, mode):
DnnBase.__init__(self, [self.file], self.c_func)
super().__init__([self.file], self.c_func)
assert cudnn.cudnnSoftmaxAlgorithm_t.has_alias(algo)
self.algo = algo
......@@ -2207,7 +2204,7 @@ class GpuDnnReduction(DnnBase):
)
def __init__(self, red_op, axis, acc_dtype, dtype, return_indices):
DnnBase.__init__(self, ["c_code/dnn_redux.c"], "APPLY_SPECIFIC(dnn_redux)")
super().__init__(["c_code/dnn_redux.c"], "APPLY_SPECIFIC(dnn_redux)")
assert cudnn.cudnnReduceTensorOp_t.has_alias(red_op)
self.red_op = red_op
assert acc_dtype in ["float16", "float32", "float64"]
......@@ -2328,8 +2325,7 @@ class GpuDnnBatchNorm(DnnBase):
inplace_running_var=False,
inplace_output=False,
):
DnnBase.__init__(
self,
super().__init__(
["c_code/dnn_batchnorm_base.c", "c_code/dnn_batchnorm.c"],
"dnn_batchnorm_op",
)
......@@ -2460,8 +2456,7 @@ class GpuDnnBatchNormInference(DnnBase):
)
def __init__(self, mode="per-activation", inplace=False):
DnnBase.__init__(
self,
super().__init__(
["c_code/dnn_batchnorm_base.c", "c_code/dnn_batchnorm_inf.c"],
"dnn_batchnorm_op",
)
......@@ -2546,8 +2541,7 @@ class GpuDnnBatchNormGrad(DnnBase):
params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t, handle=handle_type)
def __init__(self, mode="per-activation"):
DnnBase.__init__(
self,
super().__init__(
["c_code/dnn_batchnorm_base.c", "c_code/dnn_batchnorm_grad.c"],
"dnn_batchnorm_grad",
)
......@@ -2585,7 +2579,7 @@ class GpuDnnDropoutOp(DnnBase):
__props__ = ("inplace",)
def __init__(self, inplace=False):
DnnBase.__init__(self, ["c_code/dnn_dropout_fwd.c"], "dnn_dropout_fwd")
super().__init__(["c_code/dnn_dropout_fwd.c"], "dnn_dropout_fwd")
self.inplace = inplace
if self.inplace:
self.destroy_map = {1: [2]}
......@@ -2605,7 +2599,7 @@ class _DropoutDescriptor(DnnBase):
__props__ = ("context_name",)
def __init__(self, context_name):
DnnBase.__init__(self, ["c_code/dnn_dropout_desc.c"], "dnn_dropout_desc")
super().__init__(["c_code/dnn_dropout_desc.c"], "dnn_dropout_desc")
self.context_name = context_name
def dnn_context(self, node):
......@@ -2666,7 +2660,7 @@ class _RNNDescriptor(DnnBase):
def __init__(self, context_name):
if version() < 5005:
raise RuntimeError("cudnn RNN require cudnn v5 final or higher.")
DnnBase.__init__(self, ["c_code/dnn_rnn_desc.c"], "dnn_rnn_desc")
super().__init__(["c_code/dnn_rnn_desc.c"], "dnn_rnn_desc")
self.context_name = context_name
def dnn_context(self, node):
......@@ -2759,7 +2753,7 @@ class _RNNParamSize(DnnBase):
__props__ = ("context_name",)
def __init__(self, context_name):
DnnBase.__init__(self, ["c_code/dnn_rnn_paramsize.c"], "dnn_rnn_paramsize")
super().__init__(["c_code/dnn_rnn_paramsize.c"], "dnn_rnn_paramsize")
self.context_name = context_name
def dnn_context(self, node):
......@@ -2792,7 +2786,7 @@ class _RNNSplitParams(DnnBase):
__props__ = ("rnn_mode",)
def __init__(self, rnn_mode):
DnnBase.__init__(self)
super().__init__()
self.rnn_mode = rnn_mode
def make_node(self, w, desc, layer, isize, typecode):
......@@ -3035,7 +3029,7 @@ class GpuDnnRNNOp(DnnBase):
_cop_num_outputs = 4
def __init__(self, rnn_mode, direction_mode):
DnnBase.__init__(self, ["c_code/dnn_rnn_fwd.c"], "dnn_rnn_fwd")
super().__init__(["c_code/dnn_rnn_fwd.c"], "dnn_rnn_fwd")
self.rnn_mode = rnn_mode
if direction_mode == "bidirectional":
self.num_dirs = 2
......@@ -3126,7 +3120,7 @@ class GpuDnnRNNGradInputs(DnnBase):
_cop_num_outputs = 4
def __init__(self, rnn_mode, grad_h, grad_c):
DnnBase.__init__(self, ["c_code/dnn_rnn_gi.c"], "dnn_rnn_gi")
super().__init__(["c_code/dnn_rnn_gi.c"], "dnn_rnn_gi")
self.rnn_mode = rnn_mode
self.grad_h = grad_h
self.grad_c = grad_c
......@@ -3175,7 +3169,7 @@ class GpuDnnRNNGradWeights(DnnBase):
__props__ = ()
def __init__(self):
DnnBase.__init__(self, ["c_code/dnn_rnn_gw.c"], "dnn_rnn_gw")
super().__init__(["c_code/dnn_rnn_gw.c"], "dnn_rnn_gw")
def make_node(self, desc, x, hx, y, reserve, w):
# We trust the callers here
......@@ -3579,9 +3573,7 @@ class GpuDnnTransformerGrid(DnnBase):
check_input = False
def __init__(self):
DnnBase.__init__(
self, ["c_code/dnn_sptf_grid.c"], "APPLY_SPECIFIC(dnn_sptf_grid)"
)
super().__init__(["c_code/dnn_sptf_grid.c"], "APPLY_SPECIFIC(dnn_sptf_grid)")
def make_node(self, theta, out_dims):
"""
......@@ -3640,8 +3632,8 @@ class GpuDnnTransformerSampler(DnnBase):
check_input = False
def __init__(self):
DnnBase.__init__(
self, ["c_code/dnn_sptf_sampler.c"], "APPLY_SPECIFIC(dnn_sptf_sampler)"
super().__init__(
["c_code/dnn_sptf_sampler.c"], "APPLY_SPECIFIC(dnn_sptf_sampler)"
)
def make_node(self, img, grid):
......@@ -3704,7 +3696,7 @@ class GpuDnnTransformerGradI(DnnBase):
check_input = False
def __init__(self):
DnnBase.__init__(self, ["c_code/dnn_sptf_gi.c"], "APPLY_SPECIFIC(dnn_sptf_gi)")
super().__init__(["c_code/dnn_sptf_gi.c"], "APPLY_SPECIFIC(dnn_sptf_gi)")
def make_node(self, img, grid, dy):
context_name = infer_context_name(img, grid, dy)
......@@ -3742,7 +3734,7 @@ class GpuDnnTransformerGradT(DnnBase):
check_input = False
def __init__(self):
DnnBase.__init__(self, ["c_code/dnn_sptf_gt.c"], "APPLY_SPECIFIC(dnn_sptf_gt)")
super().__init__(["c_code/dnn_sptf_gt.c"], "APPLY_SPECIFIC(dnn_sptf_gt)")
def make_node(self, dgrid):
context_name = infer_context_name(dgrid)
......
......@@ -5,7 +5,7 @@ import numpy as np
from theano import scalar
from theano.gof.graph import Apply
from theano.gof.op import Op
from theano.gof.op import _NoPythonOp
from theano.gof.utils import MethodNotDefined
from theano.link.c.interface import HideC
from theano.scalar import Composite, Scalar
......@@ -84,7 +84,7 @@ def max_inputs_to_GpuElemwise(node_or_outputs):
return max_nb_inputs
class GpuElemwise(HideC, Elemwise):
class GpuElemwise(_NoPythonOp, HideC, Elemwise):
"""
Elemwise on the GPU.
......@@ -414,9 +414,6 @@ class GpuElemwise(HideC, Elemwise):
return str(code)
# To disable the superclass perform.
perform = Op.perform
# Since we don't have a perform ...
def python_constant_folding(self, node):
return False
......@@ -482,7 +479,7 @@ class GpuDimShuffle(DimShuffle):
storage[0] = res
class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype, _NoPythonOp):
"""
GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
......@@ -616,9 +613,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
],
)
def perform(self, node, inp, out, ctx):
Op.perform(self, node, inp, out, ctx)
def supports_c_code(self, inputs):
"""
Returns True if the current op and reduce pattern has functioning C code.
......
from theano.gof.graph import Apply
from theano.gof.op import Op
from theano.gof.op import _NoPythonOp
from theano.tensor.extra_ops import CumOp
......@@ -11,7 +11,7 @@ except ImportError:
import theano.scalar as scalar
from theano.gof.params_type import ParamsType
from theano.gpuarray.basic_ops import (
GpuKernelBase,
GpuKernelBaseCOp,
GpuReshape,
Kernel,
as_gpuarray_variable,
......@@ -22,7 +22,7 @@ from theano.gpuarray.opt import op_lifter, register_opt, register_opt2
from theano.gpuarray.type import gpu_context_type
class GpuCumOp(GpuKernelBase, Op):
class GpuCumOp(GpuKernelBaseCOp, _NoPythonOp):
"""
Parameters
----------
......@@ -505,7 +505,7 @@ class GpuCumOp(GpuKernelBase, Op):
# GpuCumsumOp exists only to serve backward compatibility.
# Once an object is created, it will be converted to CumOp object.
class GpuCumsumOp(GpuKernelBase, Op):
class GpuCumsumOp(GpuKernelBaseCOp, _NoPythonOp):
SUPPORTED_NDIMS = 3
__props__ = ("axis",)
......
......@@ -2,7 +2,7 @@ import numpy as np
import theano.tensor as tt
from theano.gof.graph import Apply
from theano.gof.op import Op
from theano.gof.op import _NoPythonOp
from theano.gpuarray.basic_ops import (
as_gpuarray_variable,
gpu_contiguous,
......@@ -37,7 +37,7 @@ except Exception:
skcuda_available = False
class CuRFFTOp(Op):
class CuRFFTOp(_NoPythonOp):
__props__ = ()
......@@ -168,7 +168,7 @@ class CuRFFTOp(Op):
curfft_op = CuRFFTOp()
class CuIRFFTOp(Op):
class CuIRFFTOp(_NoPythonOp):
__props__ = ()
......
......@@ -11,12 +11,12 @@ except ImportError:
import theano
import theano.sandbox.multinomial
from theano.gof.graph import Apply
from theano.gof.op import Op
from theano.gof.op import _NoPythonOp
from theano.scalar import as_scalar
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from .basic_ops import (
GpuKernelBase,
GpuKernelBaseCOp,
Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir,
......@@ -28,12 +28,12 @@ from .opt import op_lifter, register_opt, register_opt2
from .type import GpuArrayType
class GPUAMultinomialFromUniform(GpuKernelBase, Op):
class GPUAMultinomialFromUniform(GpuKernelBaseCOp, _NoPythonOp):
__props__ = ("odtype",)
_f16_ok = True
def __init__(self, odtype):
Op.__init__(self)
super().__init__(self)
self.odtype = odtype
def get_params(self, node):
......@@ -251,7 +251,7 @@ KERNEL void k_multi_warp_multinomial(
return (7,)
class GPUAChoiceFromUniform(GpuKernelBase, Op):
class GPUAChoiceFromUniform(GpuKernelBaseCOp, _NoPythonOp):
"""
The output is transposed compared to MultinomialWOReplacementFromUniform.
We must insert a Transpose op after it.
......@@ -263,7 +263,7 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op):
__props__ = ("odtype", "replace")
def __init__(self, odtype, replace=False):
Op.__init__(self)
super().__init__(self)
self.odtype = odtype
self.replace = replace
......
import theano.tensor as tt
from theano.gof.graph import Apply
from theano.gof.op import Op
from theano.gof.op import _NoPythonOp
from theano.gof.params_type import ParamsType
from theano.tensor.nnet.neighbours import Images2Neibs
......@@ -11,7 +11,7 @@ except ImportError:
pass
from theano.gpuarray.basic_ops import (
GpuKernelBase,
GpuKernelBaseCOp,
Kernel,
as_gpuarray_variable,
infer_context_name,
......@@ -19,7 +19,7 @@ from theano.gpuarray.basic_ops import (
from theano.gpuarray.type import GpuArrayType, gpu_context_type
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
class GpuImages2Neibs(GpuKernelBaseCOp, Images2Neibs, _NoPythonOp):
"""
Images2Neibs for the GPU.
......@@ -627,7 +627,3 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
params=sub["params"],
fail=sub["fail"],
)
def perform(self, node, inp, out, params):
# Disable the perform method from the CPU version
Op.perform(self, node, inp, out, params)
......@@ -3,7 +3,7 @@ from io import StringIO
import numpy as np
from theano.gof.graph import Apply
from theano.gof.op import Op
from theano.gof.op import _NoPythonOp
try:
......@@ -12,18 +12,18 @@ try:
except ImportError:
pass
from .basic_ops import (
GpuKernelBase,
from theano.gpuarray.basic_ops import (
GpuKernelBaseCOp,
Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir,
infer_context_name,
)
from .fp16_help import load_w, work_dtype, write_w
from .type import GpuArrayType
from theano.gpuarray.fp16_help import load_w, work_dtype, write_w
from theano.gpuarray.type import GpuArrayType
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBaseCOp, _NoPythonOp):
"""
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
......@@ -283,7 +283,7 @@ gpu_crossentropy_softmax_argmax_1hot_with_bias = (
)
class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBaseCOp, _NoPythonOp):
"""
Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
......@@ -508,7 +508,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax(GpuKernelBase, Op):
class GpuSoftmax(GpuKernelBaseCOp, _NoPythonOp):
"""
Implement Softmax on the gpu.
......@@ -804,7 +804,7 @@ class GpuSoftmax(GpuKernelBase, Op):
gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias(GpuKernelBase, Op):
class GpuSoftmaxWithBias(GpuKernelBaseCOp, _NoPythonOp):
"""
Implement SoftmaxWithBias on the gpu.
......
......@@ -20,7 +20,7 @@ import theano.tensor
from theano.compile import optdb
from theano.configdefaults import config
from theano.gof.graph import Apply, Variable, is_in_ancestors
from theano.gof.op import Op
from theano.gof.op import _NoPythonOp
from theano.gof.opt import GlobalOptimizer, local_optimizer
from theano.scan.utils import clone
from theano.tensor import TensorType, opt
......@@ -40,7 +40,7 @@ __contact__ = "Razvan Pascanu <r.pascanu@gmail>"
_logger = logging.getLogger("theano.ifelse")
class IfElse(Op):
class IfElse(_NoPythonOp):
"""
Op that provides conditional graph evaluation if used with the CVM/VM
linkers. Note that there exist a helpful function `ifelse` that should
......
......@@ -1067,7 +1067,8 @@ class Scan(Op):
)
except (ImportError, MissingGXX):
p = self.execute
p = self.perform
# default arguments are stored in the closure of `rval`
# Big ugly hack since we can't get the real value of allow_gc
......@@ -1246,9 +1247,10 @@ class Scan(Op):
)
return list_inputs[offset:]
def execute(self, node, args, outs):
"""
The args are packed like this:
def perform(self, node, inputs, output_storage, params=None):
"""Compute the scan operation in Python.
The `inputs` are packed like this:
n_steps
......@@ -1259,7 +1261,7 @@ class Scan(Op):
W other inputs w_1, w_2, ... w_W
There are at least 1 + self.n_seqs + self.n_outs inputs, and the
There are at least ``1 + self.n_seqs + self.n_outs`` inputs, and the
ones above this number are passed to the scanned function as
non-sequential inputs.
......@@ -1272,7 +1274,7 @@ class Scan(Op):
# negative flip sequences around, and make n_steps positive
t0_call = time.time()
t_fn = 0
n_steps = args[0]
n_steps = inputs[0]
seqs = []
if n_steps < 0:
# History, in the past, this was used for backward
......@@ -1285,7 +1287,7 @@ class Scan(Op):
"We didn't implemented yet the case where scan do 0 iteration"
)
else:
for idx, seq in enumerate(args[1 : self.seqs_arg_offset]):
for idx, seq in enumerate(inputs[1 : self.seqs_arg_offset]):
if seq.shape[0] < n_steps:
raise ValueError(
(
......@@ -1305,11 +1307,12 @@ class Scan(Op):
# output
store_steps = [
arg.shape[0] for arg in args[self.seqs_arg_offset : self.shared_arg_offset]
arg.shape[0]
for arg in inputs[self.seqs_arg_offset : self.shared_arg_offset]
]
store_steps += [
arg
for arg in args[
for arg in inputs[
self.nit_sot_arg_offset : self.nit_sot_arg_offset + self.n_nit_sot
]
]
......@@ -1325,31 +1328,32 @@ class Scan(Op):
if idx in self.destroy_map:
# ^ Case 1. Outputs should be computed inplace of their
# initial state
outs[idx][0] = args[self.seqs_arg_offset + idx]
output_storage[idx][0] = inputs[self.seqs_arg_offset + idx]
elif (
outs[idx][0] is not None
and outs[idx][0].shape[1:] == args[self.seqs_arg_offset + idx].shape[1:]
and outs[idx][0].shape[0] >= store_steps[idx]
output_storage[idx][0] is not None
and output_storage[idx][0].shape[1:]
== inputs[self.seqs_arg_offset + idx].shape[1:]
and output_storage[idx][0].shape[0] >= store_steps[idx]
):
# Put in the values of the initial state
outs[idx][0] = outs[idx][0][: store_steps[idx]]
output_storage[idx][0] = output_storage[idx][0][: store_steps[idx]]
if idx > self.n_mit_mot:
l = -self.mintaps[idx]
outs[idx][0][:l] = args[self.seqs_arg_offset + idx][:l]
output_storage[idx][0][:l] = inputs[self.seqs_arg_offset + idx][:l]
else:
outs[idx][0][:] = args[self.seqs_arg_offset + idx]
output_storage[idx][0][:] = inputs[self.seqs_arg_offset + idx]
else:
outs[idx][0] = args[self.seqs_arg_offset + idx].copy()
output_storage[idx][0] = inputs[self.seqs_arg_offset + idx].copy()
offset = self.nit_sot_arg_offset + self.n_nit_sot
other_args = args[offset:]
input_storage = self.fn.input_storage
other_args = inputs[offset:]
inner_input_storage = self.fn.input_storage
nb_mitmot_in = sum(map(len, self.tap_array[: self.n_mit_mot]))
old_mitmot_input_storage = [None] * nb_mitmot_in
old_mitmot_input_data = [None] * nb_mitmot_in
output_storage = self.fn.output_storage
old_output_storage = [None] * len(output_storage)
old_output_data = [None] * len(output_storage)
inner_output_storage = self.fn.output_storage
old_inner_output_storage = [None] * len(inner_output_storage)
old_inner_output_data = [None] * len(inner_output_storage)
fn = self.fn.fn
offset = (
self.n_seqs
......@@ -1357,7 +1361,7 @@ class Scan(Op):
+ self.n_shared_outs
)
for idx in range(len(other_args)):
input_storage[idx + offset].storage[0] = other_args[idx]
inner_input_storage[idx + offset].storage[0] = other_args[idx]
i = 0
cond = True
......@@ -1368,34 +1372,40 @@ class Scan(Op):
# 3. collect input slices
for idx in range(self.n_seqs):
if self.vector_seqs[idx]:
input_storage[idx].storage[0] = seqs[idx][i : i + 1].reshape(())
inner_input_storage[idx].storage[0] = seqs[idx][i : i + 1].reshape(
()
)
else:
input_storage[idx].storage[0] = seqs[idx][i]
inner_input_storage[idx].storage[0] = seqs[idx][i]
offset = self.n_seqs
for idx in range(self.n_outs):
if self.vector_outs[idx]:
for tap in self.tap_array[idx]:
_idx = (pos[idx] + tap) % store_steps[idx]
input_storage[offset].storage[0] = outs[idx][0][
inner_input_storage[offset].storage[0] = output_storage[idx][0][
_idx : _idx + 1
].reshape(())
offset += 1
else:
for tap in self.tap_array[idx]:
_idx = (pos[idx] + tap) % store_steps[idx]
input_storage[offset].storage[0] = outs[idx][0][_idx]
inner_input_storage[offset].storage[0] = output_storage[idx][0][
_idx
]
offset += 1
a_offset = self.shared_arg_offset
o_offset = self.n_outs + self.n_nit_sot
if i == 0:
for j in range(self.n_shared_outs):
input_storage[offset].storage[0] = args[a_offset + j]
inner_input_storage[offset].storage[0] = inputs[a_offset + j]
offset += 1
else:
for j in range(self.n_shared_outs):
input_storage[offset].storage[0] = outs[o_offset + j][0]
inner_input_storage[offset].storage[0] = output_storage[
o_offset + j
][0]
offset += 1
# 4. collecting slices where the output should be stored
......@@ -1404,7 +1414,7 @@ class Scan(Op):
offset = 0
for idx in range(self.n_mit_mot_outs):
if not self.mitmots_preallocated[idx]:
output_storage[offset].storage[0] = None
inner_output_storage[offset].storage[0] = None
offset += 1
# 4.2. Collect slices for mitsots, sitsots and nitsots
......@@ -1414,25 +1424,25 @@ class Scan(Op):
store_steps[idx + self.n_mit_mot] == 1
or self.vector_outs[idx + self.n_mit_mot]
):
output_storage[idx + offset].storage[0] = None
inner_output_storage[idx + offset].storage[0] = None
else:
_pos0 = idx + self.n_mit_mot
output_storage[idx + offset].storage[0] = outs[_pos0][0][
pos[_pos0]
]
inner_output_storage[idx + offset].storage[0] = output_storage[
_pos0
][0][pos[_pos0]]
else:
for idx in range(self.n_outs + self.n_nit_sot - self.n_mit_mot):
output_storage[idx + offset].storage[0] = None
inner_output_storage[idx + offset].storage[0] = None
# 4.3. Collect slices for shared outputs
offset += self.n_outs + self.n_nit_sot - self.n_mit_mot
for idx in range(self.n_shared_outs):
output_storage[idx + offset].storage[0] = None
inner_output_storage[idx + offset].storage[0] = None
# 4.4. If there is a condition add it to the mix
if self.as_while:
pdx = offset + self.n_shared_outs
output_storage[pdx].storage[0] = None
inner_output_storage[pdx].storage[0] = None
# 4.5. Keep a reference to the variables (ndarrays, GpuArrays,
# etc) currently in the output_storage to be able to compare them
......@@ -1440,17 +1450,17 @@ class Scan(Op):
# execution. Also keep pointers to their data to be able to detect
# cases where outputs reused the allocated object but alter the
# memory region they refer to.
for idx in range(len(output_storage)):
for idx in range(len(inner_output_storage)):
var = output_storage[idx].storage[0]
old_output_storage[idx] = var
var = inner_output_storage[idx].storage[0]
old_inner_output_storage[idx] = var
if var is None:
old_output_data[idx] = None
old_inner_output_data[idx] = None
elif self.outs_is_tensor[idx]:
old_output_data[idx] = var.data
old_inner_output_data[idx] = var.data
else:
old_output_data[idx] = var.gpudata
old_inner_output_data[idx] = var.gpudata
# 4.6. Keep a reference to the variables (ndarrays, GpuArrays,
# etc) associated with mitmot inputs currently in the
......@@ -1460,7 +1470,7 @@ class Scan(Op):
# reused the allocated object but alter the memory region they
# refer to.
for idx in range(nb_mitmot_in):
var = input_storage[idx + self.n_seqs].storage[0]
var = inner_input_storage[idx + self.n_seqs].storage[0]
old_mitmot_input_storage[idx] = var
if var is None:
......@@ -1502,19 +1512,19 @@ class Scan(Op):
dt_fn = time.time() - t0_fn
if self.as_while:
pdx = offset + self.n_shared_outs
cond = output_storage[pdx].storage[0] == 0
cond = inner_output_storage[pdx].storage[0] == 0
# 5.2. By calling fn() directly instead of calling the theano
# function, it is possible that the updates have not been
# performed. Perform the updates if needed.
offset_out = len(output_storage) - 1
offset_out = len(inner_output_storage) - 1
if getattr(fn, "need_update_inputs", True):
# Update the inputs that have an update function
for inp, storage in zip(
self.fn.maker.expanded_inputs[::-1], self.fn.input_storage[::-1]
):
if inp.update is not None:
storage.data = output_storage[offset_out].data
storage.data = inner_output_storage[offset_out].data
offset_out -= 1
t_fn += dt_fn
......@@ -1532,7 +1542,7 @@ class Scan(Op):
# Verify whether the input points to the same data as
# it did before the execution of the inner function.
old_var = old_mitmot_input_storage[inp_idx]
new_var = input_storage[self.n_seqs + inp_idx].storage[0]
new_var = inner_input_storage[self.n_seqs + inp_idx].storage[0]
if old_var is new_var:
old_data = old_mitmot_input_data[inp_idx]
if self.inps_is_tensor[self.n_seqs + inp_idx]:
......@@ -1547,14 +1557,16 @@ class Scan(Op):
# nothing needs to be done. Otherwise, recover the
# and store it in `outs` as usual
if not same_data:
outs[j][0][k + pos[j]] = input_storage[
output_storage[j][0][k + pos[j]] = inner_input_storage[
self.n_seqs + inp_idx
].storage[0]
else:
# This output tap has not been preallocated, recover
# its value as usual
outs[j][0][k + pos[j]] = output_storage[offset_out].storage[0]
output_storage[j][0][k + pos[j]] = inner_output_storage[
offset_out
].storage[0]
offset_out += 1
mitmot_out_idx += 1
......@@ -1570,14 +1582,16 @@ class Scan(Op):
# Copy the output value to `outs`, if necessary
if store_steps[j] == 1 or self.vector_outs[j]:
outs[j][0][pos[j]] = output_storage[offset_out + j].storage[0]
output_storage[j][0][pos[j]] = inner_output_storage[
offset_out + j
].storage[0]
else:
# Check whether the initialization of the output storage
# map for this output has been reused.
old_var = old_output_storage[offset_out + j]
new_var = output_storage[offset_out + j].storage[0]
old_var = old_inner_output_storage[offset_out + j]
new_var = inner_output_storage[offset_out + j].storage[0]
if old_var is new_var:
old_data = old_output_data[offset_out + j]
old_data = old_inner_output_data[offset_out + j]
if old_data is None:
output_reused = False
elif self.outs_is_tensor[offset_out + j]:
......@@ -1589,9 +1603,9 @@ class Scan(Op):
if not output_reused:
try:
outs[j][0][pos[j]] = output_storage[offset_out + j].storage[
0
]
output_storage[j][0][pos[j]] = inner_output_storage[
offset_out + j
].storage[0]
except ValueError as e:
if i == 0:
# First iteration, so don't change the
......@@ -1614,26 +1628,30 @@ class Scan(Op):
if i == 0:
jout = j + offset_out
shape = (store_steps[j],) + output_storage[jout].storage[0].shape
dtype = output_storage[jout].storage[0].dtype
shape = (store_steps[j],) + inner_output_storage[jout].storage[
0
].shape
dtype = inner_output_storage[jout].storage[0].dtype
if (
outs[j][0] is None
or outs[j][0].shape[0] < store_steps[j]
or outs[j][0].shape[1:] != shape[1:]
or outs[j][0].dtype != dtype
output_storage[j][0] is None
or output_storage[j][0].shape[0] < store_steps[j]
or output_storage[j][0].shape[1:] != shape[1:]
or output_storage[j][0].dtype != dtype
):
outs[j][0] = node.outputs[j].type.value_zeros(shape)
elif outs[j][0].shape[0] != store_steps[j]:
outs[j][0] = outs[j][0][: store_steps[j]]
outs[j][0][pos[j]] = output_storage[jout].storage[0]
output_storage[j][0] = node.outputs[j].type.value_zeros(shape)
elif output_storage[j][0].shape[0] != store_steps[j]:
output_storage[j][0] = output_storage[j][0][: store_steps[j]]
output_storage[j][0][pos[j]] = inner_output_storage[jout].storage[0]
elif store_steps[j] == 1 or self.vector_outs[j]:
outs[j][0][pos[j]] = output_storage[j + offset_out].storage[0]
output_storage[j][0][pos[j]] = inner_output_storage[
j + offset_out
].storage[0]
else:
# Check whether the initialization of the output storage map
# for this output has been reused.
old_var = old_output_storage[offset_out + j]
old_data = old_output_data[offset_out + j]
new_var = output_storage[offset_out + j].storage[0]
old_var = old_inner_output_storage[offset_out + j]
old_data = old_inner_output_data[offset_out + j]
new_var = inner_output_storage[offset_out + j].storage[0]
if old_var is new_var:
if old_data is None:
output_reused = False
......@@ -1645,7 +1663,9 @@ class Scan(Op):
output_reused = False
if not output_reused:
outs[j][0][pos[j]] = output_storage[j + offset_out].storage[0]
output_storage[j][0][pos[j]] = inner_output_storage[
j + offset_out
].storage[0]
# 5.6 Copy over the values for outputs corresponding to shared
# variables
......@@ -1653,7 +1673,7 @@ class Scan(Op):
end += self.n_shared_outs
for j in range(begin, end):
jout = j + offset_out
outs[j][0] = output_storage[jout].storage[0]
output_storage[j][0] = inner_output_storage[jout].storage[0]
pos = [(idx + 1) % store for idx, store in zip(pos, store_steps)]
i = i + 1
......@@ -1672,25 +1692,29 @@ class Scan(Op):
# are read and written.
# This way, there will be no information overwritten
# before it is read (as it used to happen).
shape = (pdx,) + outs[idx][0].shape[1:]
shape = (pdx,) + output_storage[idx][0].shape[1:]
tmp = node.outputs[idx].type.value_zeros(shape)
tmp[:] = outs[idx][0][:pdx]
outs[idx][0][: store_steps[idx] - pdx] = outs[idx][0][pdx:]
outs[idx][0][store_steps[idx] - pdx :] = tmp
tmp[:] = output_storage[idx][0][:pdx]
output_storage[idx][0][: store_steps[idx] - pdx] = output_storage[
idx
][0][pdx:]
output_storage[idx][0][store_steps[idx] - pdx :] = tmp
del tmp
else:
shape = (store_steps[idx] - pdx,) + outs[idx][0].shape[1:]
shape = (store_steps[idx] - pdx,) + output_storage[idx][0].shape[1:]
tmp = node.outputs[idx].type.value_zeros(shape)
tmp[:] = outs[idx][0][pdx:]
outs[idx][0][store_steps[idx] - pdx :] = outs[idx][0][:pdx]
outs[idx][0][: store_steps[idx] - pdx] = tmp
tmp[:] = output_storage[idx][0][pdx:]
output_storage[idx][0][store_steps[idx] - pdx :] = output_storage[
idx
][0][:pdx]
output_storage[idx][0][: store_steps[idx] - pdx] = tmp
del tmp
# This would normally happen only when doing truncated
# backpropagation through time. In such a scenarion Scan is
# expected to return 0 for all entries for which the gradient is
# not actually computed
elif store_steps[idx] > i - self.mintaps[idx]:
outs[idx][0][i - self.mintaps[idx] :] = 0
output_storage[idx][0][i - self.mintaps[idx] :] = 0
# This is a fix for a bug introduced by while. If you say
# you want to loop up to a condition, you expect the output
# to have that length ( and not the maximal length possible)
......@@ -1709,13 +1733,13 @@ class Scan(Op):
# every output and then do outs[0][:i+maximal_tap],
# which implies I think more computations then this
# little trick that I used
outs[idx][0] = outs[idx][0][: -(n_steps - i)]
output_storage[idx][0] = output_storage[idx][0][: -(n_steps - i)]
# We never reuse the input or output storage of the
# inner function so we clear it.
for i_s in input_storage:
for i_s in inner_input_storage:
i_s.storage[0] = None
for o_s in output_storage:
for o_s in inner_output_storage:
o_s.storage[0] = None
t_call = time.time() - t0_call
......@@ -1735,7 +1759,6 @@ class Scan(Op):
self.t_call = t_call
self.t_fn = t_fn
# Infer Shape
def infer_shape(self, fgraph, node, input_shapes):
# input_shapes correspond to the shapes of node.inputs
for inp, inp_shp in zip(node.inputs, input_shapes):
......@@ -2085,7 +2108,6 @@ class Scan(Op):
return mappings
# GRAD FUNCTION
def L_op(self, inputs, outs, dC_douts):
if not isinstance(outs, (list, tuple)):
outs = [outs]
......
......@@ -5,7 +5,7 @@ import theano
from theano import scalar, tensor
from theano.configdefaults import config
from theano.gof.graph import Apply
from theano.gof.op import COp
from theano.gof.op import COp, _NoPythonCOp
from theano.gof.opt import PatternSub, TopoOptimizer, local_optimizer
from theano.misc.safe_asarray import _asarray
from theano.sparse import basic as sparse
......@@ -78,7 +78,7 @@ theano.compile.optdb.register(
)
class AddSD_ccode(COp):
class AddSD_ccode(_NoPythonCOp):
"""
Add a sparse and a dense matrix.
......@@ -663,7 +663,7 @@ def local_structured_dot(fgraph, node):
# register_specialize(local_structured_dot)
class UsmmCscDense(COp):
class UsmmCscDense(_NoPythonCOp):
"""
Performs the expression is `alpha` * `x` `y` + `z`.
......@@ -995,7 +995,7 @@ def local_usmm_csx(fgraph, node):
register_specialize(local_usmm_csx, "cxx_only")
class CSMGradC(COp):
class CSMGradC(_NoPythonCOp):
__props__ = ()
......@@ -1138,7 +1138,7 @@ def local_csm_grad_c(fgraph, node):
# register_specialize(local_csm_grad_c, 'cxx_only')
class MulSDCSC(COp):
class MulSDCSC(_NoPythonCOp):
"""
Multiplication of sparse matrix by a broadcasted dense vector
element wise.
......@@ -1181,9 +1181,6 @@ class MulSDCSC(COp):
def c_code_cache_version(self):
return (3,)
# def perform(self, node, (a_data, a_indices, a_indptr, b), (out,)):
# return NotImplementedError()
def c_code(self, node, name, inputs, outputs, sub):
(
......@@ -1275,7 +1272,7 @@ class MulSDCSC(COp):
mul_s_d_csc = MulSDCSC()
class MulSDCSR(COp):
class MulSDCSR(_NoPythonCOp):
"""
Multiplication of sparse matrix by a broadcasted dense vector
element wise.
......@@ -1318,9 +1315,6 @@ class MulSDCSR(COp):
def c_code_cache_version(self):
return (3,)
# def perform(self, node, (a_data, a_indices, a_indptr, b), (out,)):
# return NotImplemented()
def c_code(self, node, name, inputs, outputs, sub):
(
......@@ -1463,7 +1457,7 @@ def local_mul_s_d(fgraph, node):
register_specialize(local_mul_s_d, "cxx_only")
class MulSVCSR(COp):
class MulSVCSR(_NoPythonCOp):
"""
Multiplication of sparse matrix by a broadcasted dense vector
element wise.
......@@ -1627,7 +1621,7 @@ def local_mul_s_v(fgraph, node):
register_specialize(local_mul_s_v, "cxx_only")
class StructuredAddSVCSR(COp):
class StructuredAddSVCSR(_NoPythonCOp):
"""
Structured addition of a sparse matrix and a dense vector.
The elements of the vector are only added to the corresponding
......@@ -1806,7 +1800,7 @@ def local_structured_add_s_v(fgraph, node):
register_specialize(local_structured_add_s_v, "cxx_only")
class SamplingDotCSR(COp):
class SamplingDotCSR(_NoPythonCOp):
"""
Operand optimized for calculating the dot product dot(`x`, `y`.T) = `z`
when you only want to calculate a subset of `z`.
......
......@@ -4,7 +4,7 @@ import os
import theano
from theano.configdefaults import config
from theano.gof.graph import Apply
from theano.gof.op import OpenMPOp
from theano.gof.op import OpenMPOp, _NoPythonOp
from theano.gof.params_type import ParamsType
from theano.gof.type import EnumList
from theano.scalar import int8, int64
......@@ -18,7 +18,7 @@ from theano.tensor.type import TensorType
_logger = logging.getLogger(__name__)
class BaseCorrMM(OpenMPOp):
class BaseCorrMM(OpenMPOp, _NoPythonOp):
"""
Base class for `CorrMM`, `CorrMM_gradWeights` and
`CorrMM_gradInputs`. Cannot be used directly.
......
......@@ -4,7 +4,7 @@ import os
import theano
from theano.configdefaults import config
from theano.gof.graph import Apply
from theano.gof.op import OpenMPOp
from theano.gof.op import OpenMPOp, _NoPythonOp
from theano.gof.params_type import ParamsType
from theano.gof.type import EnumList
from theano.scalar import int64
......@@ -18,7 +18,7 @@ from theano.tensor.type import TensorType
_logger = logging.getLogger(__name__)
class BaseCorr3dMM(OpenMPOp):
class BaseCorr3dMM(OpenMPOp, _NoPythonOp):
"""
Base class for `Corr3dMM`, `Corr3dMM_gradWeights` and
`Corr3dMM_gradInputs`. Cannot be used directly.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论