提交 0366c559 authored 作者: Brandon T. Willard's avatar Brandon T. Willard 提交者: Brandon T. Willard

Make Op.perform an abstractmethod and provide Op type hints

This change makes `Op.perform` a mandatory method. Since more than a few `Op`s do not have Python implementations, they've been made to extend `_NoPython*Op` classes that provide an `Op.perform` that simply raises a `NotImplementedError`.
上级 5a1a147d
...@@ -118,6 +118,9 @@ class WeirdBrokenOp(COp): ...@@ -118,6 +118,9 @@ class WeirdBrokenOp(COp):
r = Apply(self, [a_], [a_.type()]) r = Apply(self, [a_], [a_.type()])
return r return r
def perform(self, *args, **kwargs):
    """Raise, since this op has no Python implementation.

    Fix: bind ``self`` explicitly — the original ``def perform(*args, **kwargs)``
    omitted it, relying on ``self`` being swallowed into ``*args``. This makes
    the stub consistent with every other ``perform`` stub added in this change.
    """
    raise NotImplementedError()
def dontuse_perform(self, node, inp, out_): def dontuse_perform(self, node, inp, out_):
(a,) = inp (a,) = inp
(out,) = out_ (out,) = out_
......
...@@ -41,6 +41,9 @@ class IncOneC(COp): ...@@ -41,6 +41,9 @@ class IncOneC(COp):
(z,) = outputs (z,) = outputs
return f"{z} = {x} + 1;" return f"{z} = {x} + 1;"
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this op has no Python implementation
    # (C-only — presumably, per the c_code above; TODO confirm), so the
    # mandatory override simply raises.
    raise NotImplementedError()
class TestComputeTestValue: class TestComputeTestValue:
def test_destroy_map(self): def test_destroy_map(self):
......
...@@ -85,6 +85,9 @@ class MyOp(Op): ...@@ -85,6 +85,9 @@ class MyOp(Op):
outputs = [MyVariable(self.name + "_R") for i in range(self.nout)] outputs = [MyVariable(self.name + "_R") for i in range(self.nout)]
return Apply(self, inputs, outputs) return Apply(self, inputs, outputs)
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this test op has no Python
    # implementation, so the mandatory override simply raises.
    raise NotImplementedError("No Python implementation available.")
def __str__(self): def __str__(self):
return self.name return self.name
......
...@@ -58,6 +58,9 @@ class MyOp(Op): ...@@ -58,6 +58,9 @@ class MyOp(Op):
outputs = [MyVariable(sum(input.type.thingy for input in inputs))] outputs = [MyVariable(sum(input.type.thingy for input in inputs))]
return Apply(self, list(inputs), outputs) return Apply(self, list(inputs), outputs)
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this test op has no Python
    # implementation, so the mandatory override simply raises.
    raise NotImplementedError("No Python implementation available.")
MyOp = MyOp() MyOp = MyOp()
......
...@@ -60,6 +60,9 @@ class MyOp(Op): ...@@ -60,6 +60,9 @@ class MyOp(Op):
outputs = [MyType(sum([input.type.thingy for input in inputs]))()] outputs = [MyType(sum([input.type.thingy for input in inputs]))()]
return Apply(self, inputs, outputs) return Apply(self, inputs, outputs)
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this test op has no Python
    # implementation, so the mandatory override simply raises.
    raise NotImplementedError("No Python implementation available.")
MyOp = MyOp() MyOp = MyOp()
...@@ -104,6 +107,9 @@ counter%(name)s++; ...@@ -104,6 +107,9 @@ counter%(name)s++;
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (1,)
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this op is C-only (see c_code above),
    # so the mandatory override simply raises.
    raise NotImplementedError("No Python implementation available.")
class TestOp: class TestOp:
...@@ -206,6 +212,9 @@ class TestMakeThunk: ...@@ -206,6 +212,9 @@ class TestMakeThunk:
(z,) = outputs (z,) = outputs
return f"{z} = {x} + 1;" return f"{z} = {x} + 1;"
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this op is C-only (see c_code above),
    # so the mandatory override simply raises.
    raise NotImplementedError("No Python implementation available.")
i = scalar.int32("i") i = scalar.int32("i")
o = IncOneC()(i) o = IncOneC()(i)
......
...@@ -48,6 +48,9 @@ class TestNodeFinder: ...@@ -48,6 +48,9 @@ class TestNodeFinder:
def __str__(self): def __str__(self):
return self.name return self.name
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this test op has no Python
    # implementation, so the mandatory override simply raises.
    raise NotImplementedError()
sigmoid = MyOp(1, "Sigmoid") sigmoid = MyOp(1, "Sigmoid")
add = MyOp(2, "Add") add = MyOp(2, "Add")
dot = MyOp(2, "Dot") dot = MyOp(2, "Dot")
......
...@@ -39,6 +39,9 @@ Py_INCREF(%(inp)s); ...@@ -39,6 +39,9 @@ Py_INCREF(%(inp)s);
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (0,)
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this op is C-only (see c_code above),
    # so the mandatory override simply raises.
    raise NotImplementedError()
class GetOp(COp): class GetOp(COp):
__props__ = () __props__ = ()
...@@ -65,6 +68,9 @@ Py_INCREF(%(out)s); ...@@ -65,6 +68,9 @@ Py_INCREF(%(out)s);
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (0,)
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this op is C-only (see c_code above),
    # so the mandatory override simply raises.
    raise NotImplementedError()
@pytest.mark.skipif( @pytest.mark.skipif(
not theano.config.cxx, reason="G++ not available, so we need to skip this test." not theano.config.cxx, reason="G++ not available, so we need to skip this test."
...@@ -192,6 +198,9 @@ class MyOpCEnumType(COp): ...@@ -192,6 +198,9 @@ class MyOpCEnumType(COp):
def make_node(self): def make_node(self):
return Apply(self, [], [scalar.uint32()]) return Apply(self, [], [scalar.uint32()])
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this op is C-only (it tests C enum
    # types), so the mandatory override simply raises.
    raise NotImplementedError()
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (3,)
......
...@@ -4,7 +4,6 @@ import pytest ...@@ -4,7 +4,6 @@ import pytest
import theano import theano
from theano import config, tensor from theano import config, tensor
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import Op
from theano.gof.params_type import ParamsType from theano.gof.params_type import ParamsType
from theano.gpuarray.basic_ops import CGpuKernelBase from theano.gpuarray.basic_ops import CGpuKernelBase
from theano.gpuarray.type import GpuArrayType, get_context, gpu_context_type from theano.gpuarray.type import GpuArrayType, get_context, gpu_context_type
...@@ -12,11 +11,11 @@ from theano.gradient import grad_undefined ...@@ -12,11 +11,11 @@ from theano.gradient import grad_undefined
from theano.scalar import int32 as int_t from theano.scalar import int32 as int_t
# This is an implementation to test that CGpuKernelBase works and also class GpuEye(CGpuKernelBase):
# to use as an example in the docs. It is not used for user graphs. """Eye for GPU.
class GpuEye(CGpuKernelBase, Op):
""" This is an implementation to test that `CGpuKernelBase` works and also
Eye for GPU. to use as an example in the docs. It is not used for user graphs.
""" """
...@@ -28,9 +27,7 @@ class GpuEye(CGpuKernelBase, Op): ...@@ -28,9 +27,7 @@ class GpuEye(CGpuKernelBase, Op):
dtype = config.floatX dtype = config.floatX
self.dtype = dtype self.dtype = dtype
self.context_name = context_name self.context_name = context_name
CGpuKernelBase.__init__( super().__init__(["c_code/tstgpueye.c"], "APPLY_SPECIFIC(tstgpueye)")
self, ["c_code/tstgpueye.c"], "APPLY_SPECIFIC(tstgpueye)"
)
def get_params(self, node): def get_params(self, node):
pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray") pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray")
......
...@@ -984,6 +984,9 @@ class ApplyDefaultTestOp(Op): ...@@ -984,6 +984,9 @@ class ApplyDefaultTestOp(Op):
x = tt.as_tensor_variable(x) x = tt.as_tensor_variable(x)
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this test op is never executed in
    # Python, so the mandatory override simply raises.
    raise NotImplementedError()
def test_constant(): def test_constant():
int8_vector_type = tt.TensorType(dtype="int8", broadcastable=(False,)) int8_vector_type = tt.TensorType(dtype="int8", broadcastable=(False,))
...@@ -3862,6 +3865,9 @@ class TestGrad: ...@@ -3862,6 +3865,9 @@ class TestGrad:
gz0, gz1 = grads gz0, gz1 = grads
return self.gval0, self.gval1 return self.gval0, self.gval1
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; only `grad` matters for this test op,
    # so the mandatory override simply raises.
    raise NotImplementedError()
def test_1param(self): def test_1param(self):
# grad: Test passing a single variable param # grad: Test passing a single variable param
o = TestGrad.Obj1() o = TestGrad.Obj1()
......
...@@ -38,6 +38,9 @@ class MyOp(Op): ...@@ -38,6 +38,9 @@ class MyOp(Op):
outputs = [MyType()()] outputs = [MyType()()]
return Apply(self, inputs, outputs) return Apply(self, inputs, outputs)
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this test op has no Python
    # implementation, so the mandatory override simply raises.
    raise NotImplementedError("No Python implementation available.")
def __str__(self): def __str__(self):
return self.name return self.name
......
...@@ -51,6 +51,9 @@ class TestGradSourcesInputs: ...@@ -51,6 +51,9 @@ class TestGradSourcesInputs:
(x,) = inp (x,) = inp
(gz,) = grads (gz,) = grads
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; only `grad` matters for this test op,
    # so the mandatory override simply raises.
    raise NotImplementedError()
a = retNone().make_node() a = retNone().make_node()
with pytest.raises(TypeError): with pytest.raises(TypeError):
grad_sources_inputs([(a.out, one)], None) grad_sources_inputs([(a.out, one)], None)
...@@ -68,6 +71,9 @@ class TestGradSourcesInputs: ...@@ -68,6 +71,9 @@ class TestGradSourcesInputs:
def grad(self, inputs, grads): def grad(self, inputs, grads):
return [inputs[0].zeros_like()] return [inputs[0].zeros_like()]
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; only `grad` matters for this test op,
    # so the mandatory override simply raises.
    raise NotImplementedError()
i = theano.tensor.vector() i = theano.tensor.vector()
j = theano.tensor.vector() j = theano.tensor.vector()
a1 = retOne().make_node(i) a1 = retOne().make_node(i)
...@@ -91,6 +97,9 @@ class TestGradSourcesInputs: ...@@ -91,6 +97,9 @@ class TestGradSourcesInputs:
def grad(self, inp, grads): def grad(self, inp, grads):
return (gval,) return (gval,)
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; only `grad` matters for this test op,
    # so the mandatory override simply raises.
    raise NotImplementedError()
a1 = TestOp().make_node() a1 = TestOp().make_node()
g = grad_sources_inputs([(a1.outputs[0], one)], None) g = grad_sources_inputs([(a1.outputs[0], one)], None)
assert g[a1.inputs[0]] is gval assert g[a1.inputs[0]] is gval
...@@ -112,6 +121,9 @@ class TestGradSourcesInputs: ...@@ -112,6 +121,9 @@ class TestGradSourcesInputs:
gz1, gz2 = grads gz1, gz2 = grads
return (gval,) return (gval,)
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; only `grad` matters for this test op,
    # so the mandatory override simply raises.
    raise NotImplementedError()
a1 = TestOp().make_node() a1 = TestOp().make_node()
g = grad_sources_inputs([(a1.outputs[0], one)], None) g = grad_sources_inputs([(a1.outputs[0], one)], None)
assert g[a1.inputs[0]] is gval assert g[a1.inputs[0]] is gval
...@@ -134,6 +146,9 @@ class TestGradSourcesInputs: ...@@ -134,6 +146,9 @@ class TestGradSourcesInputs:
(gz,) = grads (gz,) = grads
return (gval0, gval1) return (gval0, gval1)
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; only `grad` matters for this test op,
    # so the mandatory override simply raises.
    raise NotImplementedError()
a1 = TestOp().make_node() a1 = TestOp().make_node()
g = grad_sources_inputs([(a1.outputs[0], one)], None) g = grad_sources_inputs([(a1.outputs[0], one)], None)
assert g[a1.inputs[0]] is gval0 assert g[a1.inputs[0]] is gval0
...@@ -155,6 +170,9 @@ class TestGradSourcesInputs: ...@@ -155,6 +170,9 @@ class TestGradSourcesInputs:
def grad(self, inp, grads): def grad(self, inp, grads):
return gval0, gval1 return gval0, gval1
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; only `grad` matters for this test op,
    # so the mandatory override simply raises.
    raise NotImplementedError()
a1 = TestOp().make_node() a1 = TestOp().make_node()
g = grad_sources_inputs([(a1.outputs[0], one)], None) g = grad_sources_inputs([(a1.outputs[0], one)], None)
assert g[a1.inputs[0]] is gval0 assert g[a1.inputs[0]] is gval0
...@@ -190,6 +208,9 @@ class TestGrad: ...@@ -190,6 +208,9 @@ class TestGrad:
def grad(self, inputs, output_grads): def grad(self, inputs, output_grads):
return [theano.gradient.grad_not_implemented(self, 0, inputs[0])] return [theano.gradient.grad_not_implemented(self, 0, inputs[0])]
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; only `grad` matters for this test op,
    # so the mandatory override simply raises.
    raise NotImplementedError()
a = theano.tensor.scalar() a = theano.tensor.scalar()
b = DummyOp()(a) b = DummyOp()(a)
...@@ -208,6 +229,9 @@ class TestGrad: ...@@ -208,6 +229,9 @@ class TestGrad:
def grad(self, inputs, output_grads): def grad(self, inputs, output_grads):
return [theano.gradient.grad_undefined(self, 0, inputs[0])] return [theano.gradient.grad_undefined(self, 0, inputs[0])]
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; only `grad` matters for this test op,
    # so the mandatory override simply raises.
    raise NotImplementedError()
a = theano.tensor.scalar() a = theano.tensor.scalar()
b = DummyOp()(a) b = DummyOp()(a)
...@@ -380,6 +404,9 @@ class TestGrad: ...@@ -380,6 +404,9 @@ class TestGrad:
def grad(self, inputs, output_grads): def grad(self, inputs, output_grads):
return [inputs[0].zeros_like()] return [inputs[0].zeros_like()]
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; only `grad` matters for this test op,
    # so the mandatory override simply raises.
    raise NotImplementedError()
# Op2 has two inputs, f and g # Op2 has two inputs, f and g
# Its gradient with respect to g is not defined # Its gradient with respect to g is not defined
class Op2(Op): class Op2(Op):
...@@ -391,6 +418,9 @@ class TestGrad: ...@@ -391,6 +418,9 @@ class TestGrad:
def grad(self, inputs, output_grads): def grad(self, inputs, output_grads):
return [inputs[0].zeros_like(), NullType()()] return [inputs[0].zeros_like(), NullType()()]
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; only `grad` matters for this test op,
    # so the mandatory override simply raises.
    raise NotImplementedError()
x = theano.tensor.vector() x = theano.tensor.vector()
f, g = Op1()(x) f, g = Op1()(x)
cost = Op2()(f, g) cost = Op2()(f, g)
......
...@@ -581,6 +581,9 @@ class IfElseIfElseIf(Op): ...@@ -581,6 +581,9 @@ class IfElseIfElseIf(Op):
thunk.lazy = True thunk.lazy = True
return thunk return thunk
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this op supplies its own thunk via
    # `make_thunk` (see above), so `perform` is never called and raises.
    raise NotImplementedError()
class NotImplementedOpException(Exception): class NotImplementedOpException(Exception):
pass pass
...@@ -597,6 +600,9 @@ class NotImplementedOp(Op): ...@@ -597,6 +600,9 @@ class NotImplementedOp(Op):
thunk.lazy = False thunk.lazy = False
return thunk return thunk
def perform(self, *args, **kwargs):
    # `Op.perform` is now abstract; this op supplies its own thunk via
    # `make_thunk` (see above), so `perform` is never called and raises.
    raise NotImplementedError()
def test_ifelse(): def test_ifelse():
a = tt.scalar() a = tt.scalar()
......
差异被折叠。
...@@ -9,7 +9,7 @@ import theano ...@@ -9,7 +9,7 @@ import theano
from theano import tensor from theano import tensor
from theano.configdefaults import config from theano.configdefaults import config
from theano.gof.graph import Apply, Variable from theano.gof.graph import Apply, Variable
from theano.gof.op import COp, ExternalCOp, Op from theano.gof.op import COp, ExternalCOp, Op, _NoPythonOp
from theano.gof.opt import copy_stack_trace from theano.gof.opt import copy_stack_trace
from theano.gof.params_type import ParamsType from theano.gof.params_type import ParamsType
from theano.gof.type import CType from theano.gof.type import CType
...@@ -493,6 +493,14 @@ int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{ ...@@ -493,6 +493,14 @@ int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
return (9,) return (9,)
class GpuKernelBaseCOp(GpuKernelBase, COp):
    """Mix-in combiner: `GpuKernelBase` together with `COp`.

    Gives GPU-kernel ops (e.g. `GpuEye`, `GpuTri`) a single, explicit base
    instead of repeating the multiple-inheritance pair at each class.
    """

    pass
class GpuKernelBaseExternalCOp(GpuKernelBase, ExternalCOp):
    """Mix-in combiner: `GpuKernelBase` together with `ExternalCOp`.

    `CGpuKernelBase` derives from this instead of naming both bases itself.
    """

    pass
def forward_string_meth(name): def forward_string_meth(name):
def f(*args): def f(*args):
res = getattr(GpuKernelBase, name)(*args) res = getattr(GpuKernelBase, name)(*args)
...@@ -517,7 +525,7 @@ def get_dtype(s): ...@@ -517,7 +525,7 @@ def get_dtype(s):
return np.dtype(s) return np.dtype(s)
class CGpuKernelBase(ExternalCOp, GpuKernelBase): class CGpuKernelBase(GpuKernelBaseExternalCOp, _NoPythonOp):
""" """
Class to combine GpuKernelBase and ExternalCOp. Class to combine GpuKernelBase and ExternalCOp.
...@@ -1498,7 +1506,7 @@ class GpuJoin(HideC, Join): ...@@ -1498,7 +1506,7 @@ class GpuJoin(HideC, Join):
gpu_join = GpuJoin() gpu_join = GpuJoin()
class GpuSplit(HideC, Split): class GpuSplit(HideC, Split, _NoPythonOp):
""" """
Split for GPU. Split for GPU.
...@@ -1748,7 +1756,7 @@ def profile_printer( ...@@ -1748,7 +1756,7 @@ def profile_printer(
print("", file=file) print("", file=file)
class GpuEye(GpuKernelBase, Op): class GpuEye(GpuKernelBaseCOp, _NoPythonOp):
""" """
Eye for GPU. Eye for GPU.
...@@ -1882,7 +1890,7 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off, ...@@ -1882,7 +1890,7 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
return (10,) return (10,)
class GpuTri(GpuKernelBase, Op): class GpuTri(GpuKernelBaseCOp, _NoPythonOp):
""" """
Tri for GPU. Tri for GPU.
......
import theano import theano
from theano.compile import optdb from theano.compile import optdb
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import COp from theano.gof.op import _NoPythonCOp
from theano.gof.opt import LocalOptGroup from theano.gof.opt import LocalOptGroup
from theano.gof.params_type import ParamsType from theano.gof.params_type import ParamsType
from theano.scalar import bool as bool_t from theano.scalar import bool as bool_t
...@@ -27,7 +27,7 @@ except ImportError: ...@@ -27,7 +27,7 @@ except ImportError:
pass pass
class BlasOp(COp): class BlasOp(_NoPythonCOp):
def c_headers(self, **kwargs): def c_headers(self, **kwargs):
return ["<blas_api.h>", "<numpy_compat.h>", "<gpuarray_helper.h>"] return ["<blas_api.h>", "<numpy_compat.h>", "<gpuarray_helper.h>"]
...@@ -412,7 +412,7 @@ class GpuDot22(BlasOp): ...@@ -412,7 +412,7 @@ class GpuDot22(BlasOp):
gpu_dot22 = GpuDot22() gpu_dot22 = GpuDot22()
class GpuGemmBatch(BlasOp): class GpuGemmBatch(BlasOp, _NoPythonCOp):
params_type = ParamsType(inplace=bool_t) params_type = ParamsType(inplace=bool_t)
__props__ = ("inplace",) __props__ = ("inplace",)
_f16_ok = True _f16_ok = True
...@@ -1009,7 +1009,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -1009,7 +1009,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
) )
class GpuCorrMM(BaseGpuCorrMM): class GpuCorrMM(BaseGpuCorrMM, _NoPythonCOp):
""" """
GPU correlation implementation using Matrix Multiplication. GPU correlation implementation using Matrix Multiplication.
...@@ -1129,7 +1129,7 @@ class GpuCorrMM(BaseGpuCorrMM): ...@@ -1129,7 +1129,7 @@ class GpuCorrMM(BaseGpuCorrMM):
return d_bottom, d_weights return d_bottom, d_weights
class GpuCorrMM_gradWeights(BaseGpuCorrMM): class GpuCorrMM_gradWeights(BaseGpuCorrMM, _NoPythonCOp):
""" """
Gradient wrt. filters for `GpuCorrMM`. Gradient wrt. filters for `GpuCorrMM`.
...@@ -1235,7 +1235,7 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM): ...@@ -1235,7 +1235,7 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
return [[1], [1], [0], [0]] # no connection to height, width return [[1], [1], [0], [0]] # no connection to height, width
class GpuCorrMM_gradInputs(BaseGpuCorrMM): class GpuCorrMM_gradInputs(BaseGpuCorrMM, _NoPythonCOp):
""" """
Gradient wrt. inputs for `GpuCorrMM`. Gradient wrt. inputs for `GpuCorrMM`.
...@@ -1337,7 +1337,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -1337,7 +1337,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
return [[1], [1], [0], [0]] # no connection to height, width return [[1], [1], [0], [0]] # no connection to height, width
class BaseGpuCorr3dMM(CGpuKernelBase): class BaseGpuCorr3dMM(CGpuKernelBase, _NoPythonCOp):
""" """
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly. `GpuCorr3dMM_gradInputs`. Cannot be used directly.
...@@ -1777,7 +1777,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1777,7 +1777,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
) )
class GpuCorr3dMM(BaseGpuCorr3dMM): class GpuCorr3dMM(BaseGpuCorr3dMM, _NoPythonCOp):
""" """
GPU correlation implementation using Matrix Multiplication. GPU correlation implementation using Matrix Multiplication.
...@@ -1881,7 +1881,7 @@ class GpuCorr3dMM(BaseGpuCorr3dMM): ...@@ -1881,7 +1881,7 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
return d_bottom, d_weights return d_bottom, d_weights
class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM): class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM, _NoPythonCOp):
""" """
Gradient wrt. filters for `GpuCorr3dMM`. Gradient wrt. filters for `GpuCorr3dMM`.
...@@ -1970,7 +1970,7 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM): ...@@ -1970,7 +1970,7 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
return [[1], [1], [0], [0], [0]] # no connection to height, width, depth return [[1], [1], [0], [0], [0]] # no connection to height, width, depth
class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM): class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM, _NoPythonCOp):
""" """
Gradient wrt. inputs for `GpuCorr3dMM`. Gradient wrt. inputs for `GpuCorr3dMM`.
......
...@@ -4,7 +4,7 @@ import numpy as np ...@@ -4,7 +4,7 @@ import numpy as np
from theano import tensor from theano import tensor
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import ExternalCOp from theano.gof.op import _NoPythonExternalCOp
from theano.gof.params_type import ParamsType from theano.gof.params_type import ParamsType
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.scalar import bool as bool_t from theano.scalar import bool as bool_t
...@@ -17,7 +17,7 @@ from .type import gpu_context_type ...@@ -17,7 +17,7 @@ from .type import gpu_context_type
_logger = logging.getLogger("theano.gpuarray.blocksparse") _logger = logging.getLogger("theano.gpuarray.blocksparse")
class GpuSparseBlockGemv(ExternalCOp): class GpuSparseBlockGemv(_NoPythonExternalCOp):
""" """
GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
information. information.
...@@ -32,7 +32,7 @@ class GpuSparseBlockGemv(ExternalCOp): ...@@ -32,7 +32,7 @@ class GpuSparseBlockGemv(ExternalCOp):
# NB: DTYPE_INPUT_* is used in C code, so I think we should not set check_input to False. # NB: DTYPE_INPUT_* is used in C code, so I think we should not set check_input to False.
def __init__(self, inplace=False): def __init__(self, inplace=False):
ExternalCOp.__init__(self, "c_code/blockgemv.c", "APPLY_SPECIFIC(blockgemv)") super().__init__("c_code/blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [0]} self.destroy_map = {0: [0]}
...@@ -92,7 +92,7 @@ gpu_sparse_block_gemv = GpuSparseBlockGemv(False) ...@@ -92,7 +92,7 @@ gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True) gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class GpuSparseBlockOuter(ExternalCOp): class GpuSparseBlockOuter(_NoPythonExternalCOp):
""" """
GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
information. information.
...@@ -106,7 +106,7 @@ class GpuSparseBlockOuter(ExternalCOp): ...@@ -106,7 +106,7 @@ class GpuSparseBlockOuter(ExternalCOp):
params_type = ParamsType(inplace=bool_t, context=gpu_context_type) params_type = ParamsType(inplace=bool_t, context=gpu_context_type)
def __init__(self, inplace=False): def __init__(self, inplace=False):
ExternalCOp.__init__(self, ["c_code/blockger.c"], "APPLY_SPECIFIC(blockger)") super().__init__(["c_code/blockger.c"], "APPLY_SPECIFIC(blockger)")
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [0]} self.destroy_map = {0: [0]}
......
...@@ -4,7 +4,7 @@ import sys ...@@ -4,7 +4,7 @@ import sys
import theano.tensor as tt import theano.tensor as tt
from theano.configdefaults import config from theano.configdefaults import config
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import ExternalCOp from theano.gof.op import _NoPythonExternalCOp
from theano.gof.opt import local_optimizer from theano.gof.opt import local_optimizer
from theano.gpuarray import pygpu from theano.gpuarray import pygpu
from theano.gpuarray.basic_ops import ( from theano.gpuarray.basic_ops import (
...@@ -20,7 +20,7 @@ from theano.tensor.nnet.ctc import ctc_available ...@@ -20,7 +20,7 @@ from theano.tensor.nnet.ctc import ctc_available
from theano.tensor.opt import register_canonicalize from theano.tensor.opt import register_canonicalize
class GpuConnectionistTemporalClassification(ExternalCOp): class GpuConnectionistTemporalClassification(_NoPythonExternalCOp):
""" """
GPU wrapper for Baidu CTC loss function. GPU wrapper for Baidu CTC loss function.
......
...@@ -12,7 +12,7 @@ from theano import tensor ...@@ -12,7 +12,7 @@ from theano import tensor
from theano.compile.ops import shape_i, shape_i_op from theano.compile.ops import shape_i, shape_i_op
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME, config from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME, config
from theano.gof.graph import Apply, Variable from theano.gof.graph import Apply, Variable
from theano.gof.op import COp, ExternalCOp from theano.gof.op import ExternalCOp, _NoPythonCOp, _NoPythonExternalCOp
from theano.gof.params_type import ParamsType from theano.gof.params_type import ParamsType
from theano.gof.type import CDataType, EnumList, Generic from theano.gof.type import CDataType, EnumList, Generic
from theano.gpuarray import cudnn_defs, pygpu from theano.gpuarray import cudnn_defs, pygpu
...@@ -302,7 +302,7 @@ class MakerCDataType(CDataType): ...@@ -302,7 +302,7 @@ class MakerCDataType(CDataType):
return self._get_func()(ptr) return self._get_func()(ptr)
class CDataMaker(COp): class CDataMaker(_NoPythonCOp):
"""This is the equally lame `Op` that accompanies `MakerCDataType`.""" """This is the equally lame `Op` that accompanies `MakerCDataType`."""
__props__ = ("rtype",) __props__ = ("rtype",)
...@@ -350,7 +350,7 @@ def CUDNNDataType(name, freefunc=None): ...@@ -350,7 +350,7 @@ def CUDNNDataType(name, freefunc=None):
) )
class DnnVersion(COp): class DnnVersion(_NoPythonCOp):
__props__ = () __props__ = ()
def c_headers(self, **kwargs): def c_headers(self, **kwargs):
...@@ -460,7 +460,7 @@ def get_precision(precision, inputs, for_grad=False): ...@@ -460,7 +460,7 @@ def get_precision(precision, inputs, for_grad=False):
return precision, common_dtype return precision, common_dtype
class DnnBase(ExternalCOp): class DnnBase(_NoPythonExternalCOp):
""" """
Creates a handle for cudnn and pulls in the cudnn libraries and headers. Creates a handle for cudnn and pulls in the cudnn libraries and headers.
...@@ -496,7 +496,7 @@ class DnnBase(ExternalCOp): ...@@ -496,7 +496,7 @@ class DnnBase(ExternalCOp):
def __init__(self, files=None, c_func=None): def __init__(self, files=None, c_func=None):
if files is None: if files is None:
files = [] files = []
ExternalCOp.__init__(self, ["c_code/dnn_base.c"] + files, c_func) super().__init__(["c_code/dnn_base.c"] + files, c_func)
def c_headers(self, **kwargs): def c_headers(self, **kwargs):
return [ return [
...@@ -535,7 +535,7 @@ class DnnBase(ExternalCOp): ...@@ -535,7 +535,7 @@ class DnnBase(ExternalCOp):
return (super().c_code_cache_version(), version(), 4) return (super().c_code_cache_version(), version(), 4)
class GpuDnnConvDesc(ExternalCOp): class GpuDnnConvDesc(_NoPythonExternalCOp):
""" """
This Op builds a convolution descriptor for use in the other convolution This Op builds a convolution descriptor for use in the other convolution
...@@ -607,7 +607,7 @@ class GpuDnnConvDesc(ExternalCOp): ...@@ -607,7 +607,7 @@ class GpuDnnConvDesc(ExternalCOp):
precision="float32", precision="float32",
num_groups=1, num_groups=1,
): ):
ExternalCOp.__init__(self, ["c_code/conv_desc.c"], "APPLY_SPECIFIC(conv_desc)") super().__init__(["c_code/conv_desc.c"], "APPLY_SPECIFIC(conv_desc)")
if version() < 6000 and any([d != 1 for d in dilation]): if version() < 6000 and any([d != 1 for d in dilation]):
raise RuntimeError("Dilation > 1 not supported for cuDNN version < 6.") raise RuntimeError("Dilation > 1 not supported for cuDNN version < 6.")
...@@ -756,8 +756,7 @@ class GpuDnnConv(DnnBase): ...@@ -756,8 +756,7 @@ class GpuDnnConv(DnnBase):
) )
def __init__(self, algo=None, inplace=False, num_groups=1): def __init__(self, algo=None, inplace=False, num_groups=1):
DnnBase.__init__( super().__init__(
self,
["c_code/dnn_conv_base.c", "c_code/dnn_fwd.c"], ["c_code/dnn_conv_base.c", "c_code/dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)", "APPLY_SPECIFIC(conv_fwd)",
) )
...@@ -918,8 +917,7 @@ class GpuDnnConvGradW(DnnBase): ...@@ -918,8 +917,7 @@ class GpuDnnConvGradW(DnnBase):
) )
def __init__(self, inplace=False, algo=None, num_groups=1): def __init__(self, inplace=False, algo=None, num_groups=1):
DnnBase.__init__( super().__init__(
self,
["c_code/dnn_conv_base.c", "c_code/dnn_gw.c"], ["c_code/dnn_conv_base.c", "c_code/dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)", "APPLY_SPECIFIC(conv_gw)",
) )
...@@ -1088,8 +1086,7 @@ class GpuDnnConvGradI(DnnBase): ...@@ -1088,8 +1086,7 @@ class GpuDnnConvGradI(DnnBase):
) )
def __init__(self, inplace=False, algo=None, num_groups=1): def __init__(self, inplace=False, algo=None, num_groups=1):
DnnBase.__init__( super().__init__(
self,
["c_code/dnn_conv_base.c", "c_code/dnn_gi.c"], ["c_code/dnn_conv_base.c", "c_code/dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)", "APPLY_SPECIFIC(conv_gi)",
) )
...@@ -1767,7 +1764,7 @@ def dnn_gradinput3d( ...@@ -1767,7 +1764,7 @@ def dnn_gradinput3d(
) )
class GpuDnnPoolDesc(COp): class GpuDnnPoolDesc(_NoPythonCOp):
""" """
This Op builds a pooling descriptor for use in the other This Op builds a pooling descriptor for use in the other
pooling operations. pooling operations.
...@@ -1911,7 +1908,7 @@ class GpuDnnPoolBase(DnnBase): ...@@ -1911,7 +1908,7 @@ class GpuDnnPoolBase(DnnBase):
params_type = ParamsType(mode=cudnn.cudnnPoolingMode_t, handle=handle_type) params_type = ParamsType(mode=cudnn.cudnnPoolingMode_t, handle=handle_type)
def __init__(self, mode="max"): def __init__(self, mode="max"):
DnnBase.__init__(self, [self.c_file], self.c_function) super().__init__([self.c_file], self.c_function)
if mode == "average": if mode == "average":
mode = "average_inc_pad" mode = "average_inc_pad"
# Supported modes depend on runtime cuDNN version. # Supported modes depend on runtime cuDNN version.
...@@ -2114,7 +2111,7 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -2114,7 +2111,7 @@ class GpuDnnSoftmaxBase(DnnBase):
) )
def __init__(self, algo, mode): def __init__(self, algo, mode):
DnnBase.__init__(self, [self.file], self.c_func) super().__init__([self.file], self.c_func)
assert cudnn.cudnnSoftmaxAlgorithm_t.has_alias(algo) assert cudnn.cudnnSoftmaxAlgorithm_t.has_alias(algo)
self.algo = algo self.algo = algo
...@@ -2207,7 +2204,7 @@ class GpuDnnReduction(DnnBase): ...@@ -2207,7 +2204,7 @@ class GpuDnnReduction(DnnBase):
) )
def __init__(self, red_op, axis, acc_dtype, dtype, return_indices): def __init__(self, red_op, axis, acc_dtype, dtype, return_indices):
DnnBase.__init__(self, ["c_code/dnn_redux.c"], "APPLY_SPECIFIC(dnn_redux)") super().__init__(["c_code/dnn_redux.c"], "APPLY_SPECIFIC(dnn_redux)")
assert cudnn.cudnnReduceTensorOp_t.has_alias(red_op) assert cudnn.cudnnReduceTensorOp_t.has_alias(red_op)
self.red_op = red_op self.red_op = red_op
assert acc_dtype in ["float16", "float32", "float64"] assert acc_dtype in ["float16", "float32", "float64"]
...@@ -2328,8 +2325,7 @@ class GpuDnnBatchNorm(DnnBase): ...@@ -2328,8 +2325,7 @@ class GpuDnnBatchNorm(DnnBase):
inplace_running_var=False, inplace_running_var=False,
inplace_output=False, inplace_output=False,
): ):
DnnBase.__init__( super().__init__(
self,
["c_code/dnn_batchnorm_base.c", "c_code/dnn_batchnorm.c"], ["c_code/dnn_batchnorm_base.c", "c_code/dnn_batchnorm.c"],
"dnn_batchnorm_op", "dnn_batchnorm_op",
) )
...@@ -2460,8 +2456,7 @@ class GpuDnnBatchNormInference(DnnBase): ...@@ -2460,8 +2456,7 @@ class GpuDnnBatchNormInference(DnnBase):
) )
def __init__(self, mode="per-activation", inplace=False): def __init__(self, mode="per-activation", inplace=False):
DnnBase.__init__( super().__init__(
self,
["c_code/dnn_batchnorm_base.c", "c_code/dnn_batchnorm_inf.c"], ["c_code/dnn_batchnorm_base.c", "c_code/dnn_batchnorm_inf.c"],
"dnn_batchnorm_op", "dnn_batchnorm_op",
) )
...@@ -2546,8 +2541,7 @@ class GpuDnnBatchNormGrad(DnnBase): ...@@ -2546,8 +2541,7 @@ class GpuDnnBatchNormGrad(DnnBase):
params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t, handle=handle_type) params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t, handle=handle_type)
def __init__(self, mode="per-activation"): def __init__(self, mode="per-activation"):
DnnBase.__init__( super().__init__(
self,
["c_code/dnn_batchnorm_base.c", "c_code/dnn_batchnorm_grad.c"], ["c_code/dnn_batchnorm_base.c", "c_code/dnn_batchnorm_grad.c"],
"dnn_batchnorm_grad", "dnn_batchnorm_grad",
) )
...@@ -2585,7 +2579,7 @@ class GpuDnnDropoutOp(DnnBase): ...@@ -2585,7 +2579,7 @@ class GpuDnnDropoutOp(DnnBase):
__props__ = ("inplace",) __props__ = ("inplace",)
def __init__(self, inplace=False): def __init__(self, inplace=False):
DnnBase.__init__(self, ["c_code/dnn_dropout_fwd.c"], "dnn_dropout_fwd") super().__init__(["c_code/dnn_dropout_fwd.c"], "dnn_dropout_fwd")
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {1: [2]} self.destroy_map = {1: [2]}
...@@ -2605,7 +2599,7 @@ class _DropoutDescriptor(DnnBase): ...@@ -2605,7 +2599,7 @@ class _DropoutDescriptor(DnnBase):
__props__ = ("context_name",) __props__ = ("context_name",)
def __init__(self, context_name): def __init__(self, context_name):
DnnBase.__init__(self, ["c_code/dnn_dropout_desc.c"], "dnn_dropout_desc") super().__init__(["c_code/dnn_dropout_desc.c"], "dnn_dropout_desc")
self.context_name = context_name self.context_name = context_name
def dnn_context(self, node): def dnn_context(self, node):
...@@ -2666,7 +2660,7 @@ class _RNNDescriptor(DnnBase): ...@@ -2666,7 +2660,7 @@ class _RNNDescriptor(DnnBase):
def __init__(self, context_name): def __init__(self, context_name):
if version() < 5005: if version() < 5005:
raise RuntimeError("cudnn RNN require cudnn v5 final or higher.") raise RuntimeError("cudnn RNN require cudnn v5 final or higher.")
DnnBase.__init__(self, ["c_code/dnn_rnn_desc.c"], "dnn_rnn_desc") super().__init__(["c_code/dnn_rnn_desc.c"], "dnn_rnn_desc")
self.context_name = context_name self.context_name = context_name
def dnn_context(self, node): def dnn_context(self, node):
...@@ -2759,7 +2753,7 @@ class _RNNParamSize(DnnBase): ...@@ -2759,7 +2753,7 @@ class _RNNParamSize(DnnBase):
__props__ = ("context_name",) __props__ = ("context_name",)
def __init__(self, context_name): def __init__(self, context_name):
DnnBase.__init__(self, ["c_code/dnn_rnn_paramsize.c"], "dnn_rnn_paramsize") super().__init__(["c_code/dnn_rnn_paramsize.c"], "dnn_rnn_paramsize")
self.context_name = context_name self.context_name = context_name
def dnn_context(self, node): def dnn_context(self, node):
...@@ -2792,7 +2786,7 @@ class _RNNSplitParams(DnnBase): ...@@ -2792,7 +2786,7 @@ class _RNNSplitParams(DnnBase):
__props__ = ("rnn_mode",) __props__ = ("rnn_mode",)
def __init__(self, rnn_mode): def __init__(self, rnn_mode):
DnnBase.__init__(self) super().__init__()
self.rnn_mode = rnn_mode self.rnn_mode = rnn_mode
def make_node(self, w, desc, layer, isize, typecode): def make_node(self, w, desc, layer, isize, typecode):
...@@ -3035,7 +3029,7 @@ class GpuDnnRNNOp(DnnBase): ...@@ -3035,7 +3029,7 @@ class GpuDnnRNNOp(DnnBase):
_cop_num_outputs = 4 _cop_num_outputs = 4
def __init__(self, rnn_mode, direction_mode): def __init__(self, rnn_mode, direction_mode):
DnnBase.__init__(self, ["c_code/dnn_rnn_fwd.c"], "dnn_rnn_fwd") super().__init__(["c_code/dnn_rnn_fwd.c"], "dnn_rnn_fwd")
self.rnn_mode = rnn_mode self.rnn_mode = rnn_mode
if direction_mode == "bidirectional": if direction_mode == "bidirectional":
self.num_dirs = 2 self.num_dirs = 2
...@@ -3126,7 +3120,7 @@ class GpuDnnRNNGradInputs(DnnBase): ...@@ -3126,7 +3120,7 @@ class GpuDnnRNNGradInputs(DnnBase):
_cop_num_outputs = 4 _cop_num_outputs = 4
def __init__(self, rnn_mode, grad_h, grad_c): def __init__(self, rnn_mode, grad_h, grad_c):
DnnBase.__init__(self, ["c_code/dnn_rnn_gi.c"], "dnn_rnn_gi") super().__init__(["c_code/dnn_rnn_gi.c"], "dnn_rnn_gi")
self.rnn_mode = rnn_mode self.rnn_mode = rnn_mode
self.grad_h = grad_h self.grad_h = grad_h
self.grad_c = grad_c self.grad_c = grad_c
...@@ -3175,7 +3169,7 @@ class GpuDnnRNNGradWeights(DnnBase): ...@@ -3175,7 +3169,7 @@ class GpuDnnRNNGradWeights(DnnBase):
__props__ = () __props__ = ()
def __init__(self): def __init__(self):
DnnBase.__init__(self, ["c_code/dnn_rnn_gw.c"], "dnn_rnn_gw") super().__init__(["c_code/dnn_rnn_gw.c"], "dnn_rnn_gw")
def make_node(self, desc, x, hx, y, reserve, w): def make_node(self, desc, x, hx, y, reserve, w):
# We trust the callers here # We trust the callers here
...@@ -3579,9 +3573,7 @@ class GpuDnnTransformerGrid(DnnBase): ...@@ -3579,9 +3573,7 @@ class GpuDnnTransformerGrid(DnnBase):
check_input = False check_input = False
def __init__(self): def __init__(self):
DnnBase.__init__( super().__init__(["c_code/dnn_sptf_grid.c"], "APPLY_SPECIFIC(dnn_sptf_grid)")
self, ["c_code/dnn_sptf_grid.c"], "APPLY_SPECIFIC(dnn_sptf_grid)"
)
def make_node(self, theta, out_dims): def make_node(self, theta, out_dims):
""" """
...@@ -3640,8 +3632,8 @@ class GpuDnnTransformerSampler(DnnBase): ...@@ -3640,8 +3632,8 @@ class GpuDnnTransformerSampler(DnnBase):
check_input = False check_input = False
def __init__(self): def __init__(self):
DnnBase.__init__( super().__init__(
self, ["c_code/dnn_sptf_sampler.c"], "APPLY_SPECIFIC(dnn_sptf_sampler)" ["c_code/dnn_sptf_sampler.c"], "APPLY_SPECIFIC(dnn_sptf_sampler)"
) )
def make_node(self, img, grid): def make_node(self, img, grid):
...@@ -3704,7 +3696,7 @@ class GpuDnnTransformerGradI(DnnBase): ...@@ -3704,7 +3696,7 @@ class GpuDnnTransformerGradI(DnnBase):
check_input = False check_input = False
def __init__(self): def __init__(self):
DnnBase.__init__(self, ["c_code/dnn_sptf_gi.c"], "APPLY_SPECIFIC(dnn_sptf_gi)") super().__init__(["c_code/dnn_sptf_gi.c"], "APPLY_SPECIFIC(dnn_sptf_gi)")
def make_node(self, img, grid, dy): def make_node(self, img, grid, dy):
context_name = infer_context_name(img, grid, dy) context_name = infer_context_name(img, grid, dy)
...@@ -3742,7 +3734,7 @@ class GpuDnnTransformerGradT(DnnBase): ...@@ -3742,7 +3734,7 @@ class GpuDnnTransformerGradT(DnnBase):
check_input = False check_input = False
def __init__(self): def __init__(self):
DnnBase.__init__(self, ["c_code/dnn_sptf_gt.c"], "APPLY_SPECIFIC(dnn_sptf_gt)") super().__init__(["c_code/dnn_sptf_gt.c"], "APPLY_SPECIFIC(dnn_sptf_gt)")
def make_node(self, dgrid): def make_node(self, dgrid):
context_name = infer_context_name(dgrid) context_name = infer_context_name(dgrid)
......
...@@ -5,7 +5,7 @@ import numpy as np ...@@ -5,7 +5,7 @@ import numpy as np
from theano import scalar from theano import scalar
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import Op from theano.gof.op import _NoPythonOp
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.link.c.interface import HideC from theano.link.c.interface import HideC
from theano.scalar import Composite, Scalar from theano.scalar import Composite, Scalar
...@@ -84,7 +84,7 @@ def max_inputs_to_GpuElemwise(node_or_outputs): ...@@ -84,7 +84,7 @@ def max_inputs_to_GpuElemwise(node_or_outputs):
return max_nb_inputs return max_nb_inputs
class GpuElemwise(HideC, Elemwise): class GpuElemwise(_NoPythonOp, HideC, Elemwise):
""" """
Elemwise on the GPU. Elemwise on the GPU.
...@@ -414,9 +414,6 @@ class GpuElemwise(HideC, Elemwise): ...@@ -414,9 +414,6 @@ class GpuElemwise(HideC, Elemwise):
return str(code) return str(code)
# To disable the superclass perform.
perform = Op.perform
# Since we don't have a perform ... # Since we don't have a perform ...
def python_constant_folding(self, node): def python_constant_folding(self, node):
return False return False
...@@ -482,7 +479,7 @@ class GpuDimShuffle(DimShuffle): ...@@ -482,7 +479,7 @@ class GpuDimShuffle(DimShuffle):
storage[0] = res storage[0] = res
class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype, _NoPythonOp):
""" """
GpuCAReduceCuda is a Reduction along some dimensions by a scalar op. GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
...@@ -616,9 +613,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -616,9 +613,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
], ],
) )
def perform(self, node, inp, out, ctx):
Op.perform(self, node, inp, out, ctx)
def supports_c_code(self, inputs): def supports_c_code(self, inputs):
""" """
Returns True if the current op and reduce pattern has functioning C code. Returns True if the current op and reduce pattern has functioning C code.
......
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import Op from theano.gof.op import _NoPythonOp
from theano.tensor.extra_ops import CumOp from theano.tensor.extra_ops import CumOp
...@@ -11,7 +11,7 @@ except ImportError: ...@@ -11,7 +11,7 @@ except ImportError:
import theano.scalar as scalar import theano.scalar as scalar
from theano.gof.params_type import ParamsType from theano.gof.params_type import ParamsType
from theano.gpuarray.basic_ops import ( from theano.gpuarray.basic_ops import (
GpuKernelBase, GpuKernelBaseCOp,
GpuReshape, GpuReshape,
Kernel, Kernel,
as_gpuarray_variable, as_gpuarray_variable,
...@@ -22,7 +22,7 @@ from theano.gpuarray.opt import op_lifter, register_opt, register_opt2 ...@@ -22,7 +22,7 @@ from theano.gpuarray.opt import op_lifter, register_opt, register_opt2
from theano.gpuarray.type import gpu_context_type from theano.gpuarray.type import gpu_context_type
class GpuCumOp(GpuKernelBase, Op): class GpuCumOp(GpuKernelBaseCOp, _NoPythonOp):
""" """
Parameters Parameters
---------- ----------
...@@ -505,7 +505,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -505,7 +505,7 @@ class GpuCumOp(GpuKernelBase, Op):
# GpuCumsumOp exists only to serve backward compatibility. # GpuCumsumOp exists only to serve backward compatibility.
# Once an object is created, it will be converted to CumOp object. # Once an object is created, it will be converted to CumOp object.
class GpuCumsumOp(GpuKernelBase, Op): class GpuCumsumOp(GpuKernelBaseCOp, _NoPythonOp):
SUPPORTED_NDIMS = 3 SUPPORTED_NDIMS = 3
__props__ = ("axis",) __props__ = ("axis",)
......
...@@ -2,7 +2,7 @@ import numpy as np ...@@ -2,7 +2,7 @@ import numpy as np
import theano.tensor as tt import theano.tensor as tt
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import Op from theano.gof.op import _NoPythonOp
from theano.gpuarray.basic_ops import ( from theano.gpuarray.basic_ops import (
as_gpuarray_variable, as_gpuarray_variable,
gpu_contiguous, gpu_contiguous,
...@@ -37,7 +37,7 @@ except Exception: ...@@ -37,7 +37,7 @@ except Exception:
skcuda_available = False skcuda_available = False
class CuRFFTOp(Op): class CuRFFTOp(_NoPythonOp):
__props__ = () __props__ = ()
...@@ -168,7 +168,7 @@ class CuRFFTOp(Op): ...@@ -168,7 +168,7 @@ class CuRFFTOp(Op):
curfft_op = CuRFFTOp() curfft_op = CuRFFTOp()
class CuIRFFTOp(Op): class CuIRFFTOp(_NoPythonOp):
__props__ = () __props__ = ()
......
...@@ -11,12 +11,12 @@ except ImportError: ...@@ -11,12 +11,12 @@ except ImportError:
import theano import theano
import theano.sandbox.multinomial import theano.sandbox.multinomial
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import Op from theano.gof.op import _NoPythonOp
from theano.scalar import as_scalar from theano.scalar import as_scalar
from theano.tensor import NotScalarConstantError, get_scalar_constant_value from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from .basic_ops import ( from .basic_ops import (
GpuKernelBase, GpuKernelBaseCOp,
Kernel, Kernel,
as_gpuarray_variable, as_gpuarray_variable,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
...@@ -28,12 +28,12 @@ from .opt import op_lifter, register_opt, register_opt2 ...@@ -28,12 +28,12 @@ from .opt import op_lifter, register_opt, register_opt2
from .type import GpuArrayType from .type import GpuArrayType
class GPUAMultinomialFromUniform(GpuKernelBase, Op): class GPUAMultinomialFromUniform(GpuKernelBaseCOp, _NoPythonOp):
__props__ = ("odtype",) __props__ = ("odtype",)
_f16_ok = True _f16_ok = True
def __init__(self, odtype): def __init__(self, odtype):
Op.__init__(self) super().__init__(self)
self.odtype = odtype self.odtype = odtype
def get_params(self, node): def get_params(self, node):
...@@ -251,7 +251,7 @@ KERNEL void k_multi_warp_multinomial( ...@@ -251,7 +251,7 @@ KERNEL void k_multi_warp_multinomial(
return (7,) return (7,)
class GPUAChoiceFromUniform(GpuKernelBase, Op): class GPUAChoiceFromUniform(GpuKernelBaseCOp, _NoPythonOp):
""" """
The output is transposed compared to MultinomialWOReplacementFromUniform. The output is transposed compared to MultinomialWOReplacementFromUniform.
We must insert a Transpose op after it. We must insert a Transpose op after it.
...@@ -263,7 +263,7 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op): ...@@ -263,7 +263,7 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op):
__props__ = ("odtype", "replace") __props__ = ("odtype", "replace")
def __init__(self, odtype, replace=False): def __init__(self, odtype, replace=False):
Op.__init__(self) super().__init__(self)
self.odtype = odtype self.odtype = odtype
self.replace = replace self.replace = replace
......
import theano.tensor as tt import theano.tensor as tt
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import Op from theano.gof.op import _NoPythonOp
from theano.gof.params_type import ParamsType from theano.gof.params_type import ParamsType
from theano.tensor.nnet.neighbours import Images2Neibs from theano.tensor.nnet.neighbours import Images2Neibs
...@@ -11,7 +11,7 @@ except ImportError: ...@@ -11,7 +11,7 @@ except ImportError:
pass pass
from theano.gpuarray.basic_ops import ( from theano.gpuarray.basic_ops import (
GpuKernelBase, GpuKernelBaseCOp,
Kernel, Kernel,
as_gpuarray_variable, as_gpuarray_variable,
infer_context_name, infer_context_name,
...@@ -19,7 +19,7 @@ from theano.gpuarray.basic_ops import ( ...@@ -19,7 +19,7 @@ from theano.gpuarray.basic_ops import (
from theano.gpuarray.type import GpuArrayType, gpu_context_type from theano.gpuarray.type import GpuArrayType, gpu_context_type
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): class GpuImages2Neibs(GpuKernelBaseCOp, Images2Neibs, _NoPythonOp):
""" """
Images2Neibs for the GPU. Images2Neibs for the GPU.
...@@ -627,7 +627,3 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -627,7 +627,3 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
params=sub["params"], params=sub["params"],
fail=sub["fail"], fail=sub["fail"],
) )
def perform(self, node, inp, out, params):
# Disable the perform method from the CPU version
Op.perform(self, node, inp, out, params)
...@@ -3,7 +3,7 @@ from io import StringIO ...@@ -3,7 +3,7 @@ from io import StringIO
import numpy as np import numpy as np
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import Op from theano.gof.op import _NoPythonOp
try: try:
...@@ -12,18 +12,18 @@ try: ...@@ -12,18 +12,18 @@ try:
except ImportError: except ImportError:
pass pass
from .basic_ops import ( from theano.gpuarray.basic_ops import (
GpuKernelBase, GpuKernelBaseCOp,
Kernel, Kernel,
as_gpuarray_variable, as_gpuarray_variable,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name, infer_context_name,
) )
from .fp16_help import load_w, work_dtype, write_w from theano.gpuarray.fp16_help import load_w, work_dtype, write_w
from .type import GpuArrayType from theano.gpuarray.type import GpuArrayType
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBaseCOp, _NoPythonOp):
""" """
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu. Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
...@@ -283,7 +283,7 @@ gpu_crossentropy_softmax_argmax_1hot_with_bias = ( ...@@ -283,7 +283,7 @@ gpu_crossentropy_softmax_argmax_1hot_with_bias = (
) )
class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBaseCOp, _NoPythonOp):
""" """
Implement CrossentropySoftmax1HotWithBiasDx on the gpu. Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
...@@ -508,7 +508,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -508,7 +508,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx() gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax(GpuKernelBase, Op): class GpuSoftmax(GpuKernelBaseCOp, _NoPythonOp):
""" """
Implement Softmax on the gpu. Implement Softmax on the gpu.
...@@ -804,7 +804,7 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -804,7 +804,7 @@ class GpuSoftmax(GpuKernelBase, Op):
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias(GpuKernelBase, Op): class GpuSoftmaxWithBias(GpuKernelBaseCOp, _NoPythonOp):
""" """
Implement SoftmaxWithBias on the gpu. Implement SoftmaxWithBias on the gpu.
......
...@@ -20,7 +20,7 @@ import theano.tensor ...@@ -20,7 +20,7 @@ import theano.tensor
from theano.compile import optdb from theano.compile import optdb
from theano.configdefaults import config from theano.configdefaults import config
from theano.gof.graph import Apply, Variable, is_in_ancestors from theano.gof.graph import Apply, Variable, is_in_ancestors
from theano.gof.op import Op from theano.gof.op import _NoPythonOp
from theano.gof.opt import GlobalOptimizer, local_optimizer from theano.gof.opt import GlobalOptimizer, local_optimizer
from theano.scan.utils import clone from theano.scan.utils import clone
from theano.tensor import TensorType, opt from theano.tensor import TensorType, opt
...@@ -40,7 +40,7 @@ __contact__ = "Razvan Pascanu <r.pascanu@gmail>" ...@@ -40,7 +40,7 @@ __contact__ = "Razvan Pascanu <r.pascanu@gmail>"
_logger = logging.getLogger("theano.ifelse") _logger = logging.getLogger("theano.ifelse")
class IfElse(Op): class IfElse(_NoPythonOp):
""" """
Op that provides conditional graph evaluation if used with the CVM/VM Op that provides conditional graph evaluation if used with the CVM/VM
linkers. Note that there exist a helpful function `ifelse` that should linkers. Note that there exist a helpful function `ifelse` that should
......
差异被折叠。
...@@ -5,7 +5,7 @@ import theano ...@@ -5,7 +5,7 @@ import theano
from theano import scalar, tensor from theano import scalar, tensor
from theano.configdefaults import config from theano.configdefaults import config
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import COp from theano.gof.op import COp, _NoPythonCOp
from theano.gof.opt import PatternSub, TopoOptimizer, local_optimizer from theano.gof.opt import PatternSub, TopoOptimizer, local_optimizer
from theano.misc.safe_asarray import _asarray from theano.misc.safe_asarray import _asarray
from theano.sparse import basic as sparse from theano.sparse import basic as sparse
...@@ -78,7 +78,7 @@ theano.compile.optdb.register( ...@@ -78,7 +78,7 @@ theano.compile.optdb.register(
) )
class AddSD_ccode(COp): class AddSD_ccode(_NoPythonCOp):
""" """
Add a sparse and a dense matrix. Add a sparse and a dense matrix.
...@@ -663,7 +663,7 @@ def local_structured_dot(fgraph, node): ...@@ -663,7 +663,7 @@ def local_structured_dot(fgraph, node):
# register_specialize(local_structured_dot) # register_specialize(local_structured_dot)
class UsmmCscDense(COp): class UsmmCscDense(_NoPythonCOp):
""" """
Performs the expression is `alpha` * `x` `y` + `z`. Performs the expression is `alpha` * `x` `y` + `z`.
...@@ -995,7 +995,7 @@ def local_usmm_csx(fgraph, node): ...@@ -995,7 +995,7 @@ def local_usmm_csx(fgraph, node):
register_specialize(local_usmm_csx, "cxx_only") register_specialize(local_usmm_csx, "cxx_only")
class CSMGradC(COp): class CSMGradC(_NoPythonCOp):
__props__ = () __props__ = ()
...@@ -1138,7 +1138,7 @@ def local_csm_grad_c(fgraph, node): ...@@ -1138,7 +1138,7 @@ def local_csm_grad_c(fgraph, node):
# register_specialize(local_csm_grad_c, 'cxx_only') # register_specialize(local_csm_grad_c, 'cxx_only')
class MulSDCSC(COp): class MulSDCSC(_NoPythonCOp):
""" """
Multiplication of sparse matrix by a broadcasted dense vector Multiplication of sparse matrix by a broadcasted dense vector
element wise. element wise.
...@@ -1181,9 +1181,6 @@ class MulSDCSC(COp): ...@@ -1181,9 +1181,6 @@ class MulSDCSC(COp):
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (3,)
# def perform(self, node, (a_data, a_indices, a_indptr, b), (out,)):
# return NotImplementedError()
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
( (
...@@ -1275,7 +1272,7 @@ class MulSDCSC(COp): ...@@ -1275,7 +1272,7 @@ class MulSDCSC(COp):
mul_s_d_csc = MulSDCSC() mul_s_d_csc = MulSDCSC()
class MulSDCSR(COp): class MulSDCSR(_NoPythonCOp):
""" """
Multiplication of sparse matrix by a broadcasted dense vector Multiplication of sparse matrix by a broadcasted dense vector
element wise. element wise.
...@@ -1318,9 +1315,6 @@ class MulSDCSR(COp): ...@@ -1318,9 +1315,6 @@ class MulSDCSR(COp):
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (3,)
# def perform(self, node, (a_data, a_indices, a_indptr, b), (out,)):
# return NotImplemented()
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
( (
...@@ -1463,7 +1457,7 @@ def local_mul_s_d(fgraph, node): ...@@ -1463,7 +1457,7 @@ def local_mul_s_d(fgraph, node):
register_specialize(local_mul_s_d, "cxx_only") register_specialize(local_mul_s_d, "cxx_only")
class MulSVCSR(COp): class MulSVCSR(_NoPythonCOp):
""" """
Multiplication of sparse matrix by a broadcasted dense vector Multiplication of sparse matrix by a broadcasted dense vector
element wise. element wise.
...@@ -1627,7 +1621,7 @@ def local_mul_s_v(fgraph, node): ...@@ -1627,7 +1621,7 @@ def local_mul_s_v(fgraph, node):
register_specialize(local_mul_s_v, "cxx_only") register_specialize(local_mul_s_v, "cxx_only")
class StructuredAddSVCSR(COp): class StructuredAddSVCSR(_NoPythonCOp):
""" """
Structured addition of a sparse matrix and a dense vector. Structured addition of a sparse matrix and a dense vector.
The elements of the vector are are only added to the corresponding The elements of the vector are are only added to the corresponding
...@@ -1806,7 +1800,7 @@ def local_structured_add_s_v(fgraph, node): ...@@ -1806,7 +1800,7 @@ def local_structured_add_s_v(fgraph, node):
register_specialize(local_structured_add_s_v, "cxx_only") register_specialize(local_structured_add_s_v, "cxx_only")
class SamplingDotCSR(COp): class SamplingDotCSR(_NoPythonCOp):
""" """
Operand optimized for calculating the dot product dot(`x`, `y`.T) = `z` Operand optimized for calculating the dot product dot(`x`, `y`.T) = `z`
when you only want to calculate a subset of `z`. when you only want to calculate a subset of `z`.
......
...@@ -4,7 +4,7 @@ import os ...@@ -4,7 +4,7 @@ import os
import theano import theano
from theano.configdefaults import config from theano.configdefaults import config
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import OpenMPOp from theano.gof.op import OpenMPOp, _NoPythonOp
from theano.gof.params_type import ParamsType from theano.gof.params_type import ParamsType
from theano.gof.type import EnumList from theano.gof.type import EnumList
from theano.scalar import int8, int64 from theano.scalar import int8, int64
...@@ -18,7 +18,7 @@ from theano.tensor.type import TensorType ...@@ -18,7 +18,7 @@ from theano.tensor.type import TensorType
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)
class BaseCorrMM(OpenMPOp): class BaseCorrMM(OpenMPOp, _NoPythonOp):
""" """
Base class for `CorrMM`, `CorrMM_gradWeights` and Base class for `CorrMM`, `CorrMM_gradWeights` and
`CorrMM_gradInputs`. Cannot be used directly. `CorrMM_gradInputs`. Cannot be used directly.
......
...@@ -4,7 +4,7 @@ import os ...@@ -4,7 +4,7 @@ import os
import theano import theano
from theano.configdefaults import config from theano.configdefaults import config
from theano.gof.graph import Apply from theano.gof.graph import Apply
from theano.gof.op import OpenMPOp from theano.gof.op import OpenMPOp, _NoPythonOp
from theano.gof.params_type import ParamsType from theano.gof.params_type import ParamsType
from theano.gof.type import EnumList from theano.gof.type import EnumList
from theano.scalar import int64 from theano.scalar import int64
...@@ -18,7 +18,7 @@ from theano.tensor.type import TensorType ...@@ -18,7 +18,7 @@ from theano.tensor.type import TensorType
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)
class BaseCorr3dMM(OpenMPOp): class BaseCorr3dMM(OpenMPOp, _NoPythonOp):
""" """
Base class for `Corr3dMM`, `Corr3dMM_gradWeights` and Base class for `Corr3dMM`, `Corr3dMM_gradWeights` and
`Corr3dMM_gradInputs`. Cannot be used directly. `Corr3dMM_gradInputs`. Cannot be used directly.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论