Commit 88599bc3 authored by Brandon T. Willard

Apply isort to theano.gpuarray and remove circular references

Two new modules were made in order to remove the circular references: `theano.gpuarray.dnn_opt` (i.e. optimizations specific to `theano.gpuarray.dnn`) and `theano.gpuarray.optdb` (i.e. optimization databases for gpuarray `Op`s).
Parent b4fbaa2e
import sys
import os
import logging import logging
import os
import sys
import warnings import warnings
import theano import theano
from theano import config from theano import config
from theano.compile import optdb from theano.compile import optdb
from theano.tensor.basic import register_transfer from theano.tensor.basic import register_transfer
_logger_name = "theano.gpuarray" _logger_name = "theano.gpuarray"
_logger = logging.getLogger(_logger_name) _logger = logging.getLogger(_logger_name)
...@@ -24,19 +24,20 @@ try: ...@@ -24,19 +24,20 @@ try:
except ImportError: except ImportError:
pygpu = None pygpu = None
from . import ctc, dnn, extra_ops, fft, multinomial, opt, reduction, rng_mrg, sort
from .basic_ops import as_gpuarray_variable
# This is for documentation not to depend on the availability of pygpu # This is for documentation not to depend on the availability of pygpu
from .type import ( from .type import (
GpuArrayType, ContextNotDefined,
GpuArrayVariable,
GpuArrayConstant, GpuArrayConstant,
GpuArraySharedVariable, GpuArraySharedVariable,
GpuArrayType,
GpuArrayVariable,
get_context,
gpuarray_shared_constructor, gpuarray_shared_constructor,
reg_context, reg_context,
get_context,
ContextNotDefined,
) )
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, extra_ops, multinomial, reduction, sort, rng_mrg, ctc
def transfer(x, target): def transfer(x, target):
...@@ -292,7 +293,7 @@ if pygpu: ...@@ -292,7 +293,7 @@ if pygpu:
host_from_gpu, host_from_gpu,
) )
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
from .subtensor import GpuSubtensor, GpuIncSubtensor, GpuAdvancedIncSubtensor1 from .subtensor import GpuAdvancedIncSubtensor1, GpuIncSubtensor, GpuSubtensor
else: else:
if ( if (
......
import os
import copy import copy
import os
import re import re
import numpy as np
import theano
from collections import deque from collections import deque
import numpy as np
from six import string_types from six import string_types
from theano import Op, Apply, Type, Variable import theano
from theano import tensor, config from theano import Apply, Op, Type, Variable, config, tensor
from theano.gof import COp, HideC, ParamsType
from theano.gof.opt import copy_stack_trace
from theano.gof.utils import MethodNotDefined
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.scalar import bool as bool_t, int32 as int32_t from theano.scalar import bool as bool_t
from theano.tensor.basic import Alloc, AllocEmpty, alloc_validate_shape, Join, Split from theano.scalar import int32 as int32_t
from theano.tensor.basic import Alloc, AllocEmpty, Join, Split, alloc_validate_shape
from theano.gof import HideC, COp, ParamsType
from theano.gof.utils import MethodNotDefined
from theano.gof.opt import copy_stack_trace
try: try:
import pygpu import pygpu
...@@ -26,15 +23,15 @@ try: ...@@ -26,15 +23,15 @@ try:
except ImportError: except ImportError:
pass pass
from .fp16_help import write_w
from .type import ( from .type import (
GpuArrayType, EQ_MAP,
ContextNotDefined,
GpuArrayConstant, GpuArrayConstant,
gpu_context_type, GpuArrayType,
get_context, get_context,
ContextNotDefined, gpu_context_type,
EQ_MAP,
) )
from .fp16_help import write_w
def as_gpuarray_variable(x, context_name): def as_gpuarray_variable(x, context_name):
...@@ -1000,7 +997,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -1000,7 +997,7 @@ class GpuAlloc(HideC, Alloc):
return (4,) return (4,)
def do_constant_folding(self, node): def do_constant_folding(self, node):
from . import subtensor, blas from . import blas, subtensor
for client in node.outputs[0].clients: for client in node.outputs[0].clients:
if client[0] == "output": if client[0] == "output":
......
...@@ -2,7 +2,6 @@ from six import integer_types ...@@ -2,7 +2,6 @@ from six import integer_types
import theano import theano
from theano import Apply, Op from theano import Apply, Op
from theano.compile import optdb from theano.compile import optdb
from theano.gof import LocalOptGroup, ParamsType from theano.gof import LocalOptGroup, ParamsType
from theano.scalar import bool as bool_t from theano.scalar import bool as bool_t
...@@ -10,15 +9,16 @@ from theano.tensor.basic import as_tensor_variable ...@@ -10,15 +9,16 @@ from theano.tensor.basic import as_tensor_variable
from theano.tensor.opt import in2out from theano.tensor.opt import in2out
from .basic_ops import ( from .basic_ops import (
GpuArrayType,
CGpuKernelBase, CGpuKernelBase,
GpuArrayType,
as_gpuarray_variable, as_gpuarray_variable,
gpu_contiguous, gpu_contiguous,
infer_context_name,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name,
) )
from .opt_util import inplace_allocempty from .opt_util import inplace_allocempty
try: try:
import pygpu import pygpu
from pygpu import blas from pygpu import blas
......
import logging import logging
import numpy as np import numpy as np
from theano import Apply, tensor from theano import Apply, tensor
from theano.gof import COp, ParamsType from theano.gof import COp, ParamsType
from theano.tensor import discrete_dtypes, as_tensor_variable
from theano.scalar import bool as bool_t
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.scalar import bool as bool_t
from theano.tensor import as_tensor_variable, discrete_dtypes
from .basic_ops import as_gpuarray_variable, gpuarray_helper_inc_dir, infer_context_name
from .type import gpu_context_type from .type import gpu_context_type
from .basic_ops import as_gpuarray_variable, infer_context_name, gpuarray_helper_inc_dir
_logger = logging.getLogger("theano.gpuarray.blocksparse") _logger = logging.getLogger("theano.gpuarray.blocksparse")
......
...@@ -4,22 +4,20 @@ import sys ...@@ -4,22 +4,20 @@ import sys
import theano import theano
import theano.tensor as tt import theano.tensor as tt
import theano.tensor.nnet.ctc import theano.tensor.nnet.ctc
from theano import config, gof from theano import config, gof
from theano.gof import local_optimizer
from theano.gpuarray import pygpu
from theano.gpuarray.basic_ops import ( from theano.gpuarray.basic_ops import (
gpu_contiguous,
as_gpuarray_variable, as_gpuarray_variable,
infer_context_name, gpu_contiguous,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name,
) )
from theano.gpuarray.type import GpuArrayType, gpu_context_type
from theano.gpuarray.elemwise import GpuDimShuffle from theano.gpuarray.elemwise import GpuDimShuffle
from theano.gpuarray.type import GpuArrayType, gpu_context_type
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.gof import local_optimizer
from theano.tensor.opt import register_canonicalize
from theano.tensor.nnet.ctc import ctc_available from theano.tensor.nnet.ctc import ctc_available
from theano.tensor.opt import register_canonicalize
from theano.gpuarray import pygpu
class GpuConnectionistTemporalClassification(gof.COp): class GpuConnectionistTemporalClassification(gof.COp):
......
...@@ -18,6 +18,7 @@ Currently supported cuDNN APIs: ...@@ -18,6 +18,7 @@ Currently supported cuDNN APIs:
from theano.gof import CEnumType from theano.gof import CEnumType
HALF, FLOAT, DOUBLE = ("float16", "float32", "float64") HALF, FLOAT, DOUBLE = ("float16", "float32", "float64")
TRUE_HALF_CONFIG = (HALF, HALF) TRUE_HALF_CONFIG = (HALF, HALF)
PSEUDO_HALF_CONFIG = (HALF, FLOAT) PSEUDO_HALF_CONFIG = (HALF, FLOAT)
......
...@@ -2,85 +2,51 @@ import ctypes ...@@ -2,85 +2,51 @@ import ctypes
import os import os
import sys import sys
import warnings import warnings
from functools import reduce
import numpy as np import numpy as np
from six import integer_types
import theano import theano
import theano.pathparse import theano.pathparse
from theano import Apply, Op, Variable, config, tensor
from functools import reduce from theano.compile.ops import shape_i, shape_i_op
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
from six import integer_types from theano.gof import COp, EnumList, ParamsType
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import (
as_scalar,
constant,
Log,
get_scalar_type,
int32 as int_t,
bool as bool_t,
uint32 as uint32_t,
)
from theano.tensor import as_tensor_variable, Argmax
from theano.tensor.extra_ops import cpu_contiguous
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.gof.opt import inherit_stack_trace from theano.gpuarray import cudnn_defs, pygpu
from theano.tensor.opt import Assert from theano.gpuarray.basic_ops import (
from theano.compile import optdb GpuAllocEmpty,
from theano.compile.ops import shape_i, shape_i_op GpuArrayType,
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad HostFromGpu,
as_gpuarray_variable,
empty_like,
gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
)
from theano.gpuarray.type import GpuArraySharedVariable, get_context, gpu_context_type
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.scalar import as_scalar
from theano.scalar import bool as bool_t
from theano.scalar import constant, get_scalar_type
from theano.scalar import int32 as int_t
from theano.scalar import uint32 as uint32_t
from theano.tensor.basic import as_tensor_variable
from theano.tensor.extra_ops import cpu_contiguous
from theano.tensor.nnet.abstract_conv import ( from theano.tensor.nnet.abstract_conv import (
AbstractConv2d, AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs, AbstractConv2d_gradInputs,
AbstractConv2d_gradWeights,
AbstractConv3d, AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs, AbstractConv3d_gradInputs,
get_conv_output_shape, AbstractConv3d_gradWeights,
assert_conv_shape, assert_conv_shape,
get_conv_output_shape,
) )
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad from theano.tensor.opt import Assert
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
from . import pygpu, cudnn_defs
from .type import get_context, gpu_context_type, list_contexts, GpuArraySharedVariable
from .basic_ops import (
as_gpuarray_variable,
infer_context_name,
gpuarray_helper_inc_dir,
gpu_contiguous,
GpuAllocEmpty,
empty_like,
GpuArrayType,
HostFromGpu,
)
from .elemwise import GpuElemwise, GpuCAReduceCuda
from .reduction import GpuMaxAndArgmax
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax
from .opt import (
gpu_seqopt,
register_opt,
pool_db,
pool_db2,
op_lifter,
register_opt2,
register_inplace,
)
from .opt_util import (
alpha_merge,
output_merge,
inplace_allocempty,
pad_dims,
unpad_dims,
)
DNN_CONV_ALGO_CHOOSE_ONCE = ["guess_once", "time_once"] DNN_CONV_ALGO_CHOOSE_ONCE = ["guess_once", "time_once"]
DNN_CONV_ALGO_CHOOSE_TIME = ["time_once", "time_on_shape_change"] DNN_CONV_ALGO_CHOOSE_TIME = ["time_once", "time_on_shape_change"]
...@@ -3923,779 +3889,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs): ...@@ -3923,779 +3889,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
return [rval] return [rval]
@local_optimizer([AbstractConv2d, AbstractConv3d])
def local_abstractconv_cudnn(node):
    """Lift an abstract 2d/3d convolution on GPU inputs to its cuDNN graph.

    Bails out (returns None) for non-GPU inputs, unshared convolutions,
    and asymmetric tuple padding, none of which are handled here.
    """
    context = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    op = node.op
    if op.unshared:
        return None
    border = op.border_mode
    if isinstance(border, tuple):
        # Asymmetric padding not yet supported
        if any(isinstance(p, tuple) for p in border):
            return None
    if isinstance(op, AbstractConv2d):
        builder = local_abstractconv_cudnn_graph
    elif isinstance(op, AbstractConv3d):
        builder = local_abstractconv3d_cudnn_graph
    else:
        return
    with inherit_stack_trace(node.outputs):
        return builder(op, context, node.inputs, node.outputs)
@local_optimizer(
    [AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs]
)
def local_abstractconv_cudnn_alt(node):
    """Alternative cuDNN lifting for abstract 2d convolution ops.

    Rather than mapping each abstract op to its direct cuDNN counterpart,
    this rewrite realizes the forward pass through a backprop direction
    hint, and the gradients through a forward conv on transposed tensors,
    for the shapes/modes where that is valid.  Returns None (no
    replacement) whenever the configuration is unsupported.
    """
    if not isinstance(
        node.op, (AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs)
    ):
        return
    # Dilated convolutions need cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
        isinstance(p, tuple) for p in node.op.border_mode
    ):
        # Asymmetric padding not yet supported
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"
    if isinstance(op, AbstractConv2d):
        # Forward conv expressed through a backprop kernel: only valid for
        # "full" (-> bprop inputs) or undilated "valid" (-> bprop weights).
        if border_mode == "half" or subsample != (1, 1) or num_groups != 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )
    elif isinstance(op, AbstractConv2d_gradWeights):
        if (
            border_mode == "valid"
            and subsample == (1, 1)
            and filter_dilation == (1, 1)
            and num_groups == 1
        ):
            # Weight gradient computed as a forward "cross" conv between
            # the transposed image and the transposed output gradient.
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            # num_groups is guaranteed to be 1 by the guard above, so the
            # descriptor does not need it (unlike the 3d variant).
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the kernel flip implied by "conv" mode.
                conv = conv[:, :, ::-1, ::-1]
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv2d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1) and num_groups == 1:
            # Input gradient computed as a "full" forward conv with
            # swapped kernel axes and the conv mode flipped.
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
                topgrad, kerns, out, desc
            )
        else:
            return None
    return [rval]
@local_optimizer(
    [AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs]
)
def local_abstractconv3d_cudnn_alt(node):
    """3d counterpart of ``local_abstractconv_cudnn_alt``.

    Realizes each abstract 3d conv op through an alternative cuDNN kernel
    (forward via a backprop direction hint, gradients via a forward conv
    on transposed tensors).  Returns None when unsupported.
    """
    if not isinstance(
        node.op, (AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs)
    ):
        return
    # Dilation requires cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1, 1):
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"
    if isinstance(op, AbstractConv3d):
        # Forward conv through a backprop kernel: only "full" or
        # undilated "valid" border modes apply.
        if border_mode == "half" or subsample != (1, 1, 1) or num_groups > 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        rval = dnn_conv3d(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
        )
    elif isinstance(op, AbstractConv3d_gradWeights):
        if (
            border_mode == "valid"
            and subsample == (1, 1, 1)
            and filter_dilation == (1, 1, 1)
            and num_groups == 1
        ):
            # Weight gradient as a forward "cross" conv on transposed
            # image / output-gradient tensors.
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                num_groups=num_groups,
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the kernel flip implied by "conv" mode.
                conv = conv[:, :, ::-1, ::-1, ::-1]
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv3d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1, 1) and num_groups == 1:
            # Input gradient as a "full" forward conv with swapped kernel
            # axes and the conv mode flipped.
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3, 4))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                num_groups=num_groups,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
                topgrad, kerns, out, desc
            )
        else:
            return None
    return [rval]
@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
def local_abstractconv_gw_cudnn(node):
    """Lift an abstract conv weight-gradient on GPU inputs to its cuDNN graph.

    Skips non-GPU inputs, unshared convolutions, and asymmetric padding.
    """
    context = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    op = node.op
    if op.unshared:
        return None
    border = op.border_mode
    if isinstance(border, tuple):
        # Asymmetric padding not yet supported
        if any(isinstance(p, tuple) for p in border):
            return None
    if isinstance(op, AbstractConv2d_gradWeights):
        builder = local_abstractconv_cudnn_graph
    elif isinstance(op, AbstractConv3d_gradWeights):
        builder = local_abstractconv3d_cudnn_graph
    else:
        return
    with inherit_stack_trace(node.outputs):
        return builder(op, context, node.inputs, node.outputs)
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
def local_abstractconv_gi_cudnn(node):
    """Lift an abstract conv input-gradient on GPU inputs to its cuDNN graph.

    Skips non-GPU inputs, unshared convolutions, and asymmetric padding.
    """
    context = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    op = node.op
    if op.unshared:
        return None
    border = op.border_mode
    if isinstance(border, tuple):
        # Asymmetric padding not yet supported
        if any(isinstance(p, tuple) for p in border):
            return None
    if isinstance(op, AbstractConv2d_gradInputs):
        builder = local_abstractconv_cudnn_graph
    elif isinstance(op, AbstractConv3d_gradInputs):
        builder = local_abstractconv3d_cudnn_graph
    else:
        return
    with inherit_stack_trace(node.outputs):
        return builder(op, context, node.inputs, node.outputs)
@inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs):
    # Rebuild the convolution as an in-place op over the AllocEmpty output.
    op = node.op
    inplace_op = GpuDnnConv(algo=op.algo, inplace=True, num_groups=op.num_groups)
    return [inplace_op(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
    # Rebuild the weight gradient as an in-place op over the AllocEmpty output.
    op = node.op
    inplace_op = GpuDnnConvGradW(algo=op.algo, inplace=True, num_groups=op.num_groups)
    return [inplace_op(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    # Rebuild the input gradient as an in-place op over the AllocEmpty output.
    op = node.op
    inplace_op = GpuDnnConvGradI(algo=op.algo, inplace=True, num_groups=op.num_groups)
    return [inplace_op(*inputs)]
# Register the three in-place convolution rewrites as one in2out pass,
# late in the pipeline (position 70.0) and only under the listed tags.
optdb.register(
    "local_dnna_conv_inplace",
    tensor.opt.in2out(
        local_dnn_conv_inplace,
        local_dnn_convgw_inplace,
        local_dnn_convgi_inplace,
        name="local_dnna_conv_inplace",
    ),
    70.0,
    "fast_run",
    "inplace",
    "gpuarray",
    "cudnn",
)
@register_opt("cudnn")
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    # Re-emit the conv with the merged alpha/beta scaling inputs.
    op = node.op
    return [GpuDnnConv(algo=op.algo, num_groups=op.num_groups)(*inputs)]
@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
    # Re-emit the weight gradient with the merged alpha/beta inputs.
    op = node.op
    return [GpuDnnConvGradW(algo=op.algo, num_groups=op.num_groups)(*inputs)]
@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs):
    # Re-emit the input gradient with the merged alpha/beta inputs.
    op = node.op
    return [GpuDnnConvGradI(algo=op.algo, num_groups=op.num_groups)(*inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs):
    # The merged output buffer must be made contiguous for cuDNN.
    merged = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    op = node.op
    return [GpuDnnConv(algo=op.algo, num_groups=op.num_groups)(*merged)]
@register_opt("cudnn")
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs):
    # The merged output buffer must be made contiguous for cuDNN.
    merged = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    op = node.op
    return [GpuDnnConvGradW(algo=op.algo, num_groups=op.num_groups)(*merged)]
@register_opt("cudnn")
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs):
    # The merged output buffer must be made contiguous for cuDNN.
    merged = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    op = node.op
    return [GpuDnnConvGradI(algo=op.algo, num_groups=op.num_groups)(*merged)]
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
    """Replace a Pool op by cuDNN pooling (2d/3d, ignore_border only)."""
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    mode = op.mode
    # dnn_pool expects exactly 2 non-pooling dimensions
    if img.ndim == nd + 2:
        return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
    # Reshape to 4D/5D with 2 non-pooling dimensions, pool, then restore.
    padded = pad_dims(img, 2, nd)
    pooled = dnn_pool(padded, ws, stride=stride, pad=pad, mode=mode)
    return unpad_dims(pooled, img, 2, nd)
# Register the cuDNN pooling rewrite in both pooling databases (pool_db
# wraps it in op_lifter, pool_db2 as a plain local optimizer) with
# position=0 so it is tried first.
pool_db.register(
    "local_gpua_pool_dnn_alternative",
    op_lifter([Pool])(local_gpua_pool_dnn_alternative),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_pool_dnn_alternative",
    local_optimizer([Pool])(local_gpua_pool_dnn_alternative),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    """Replace a max-pool gradient by GpuDnnPoolGrad (2d/3d, ignore_border)."""
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return

    def _prep(v):
        # Move the variable to this context and make it contiguous.
        return gpu_contiguous(as_gpuarray_variable(v, ctx_name))

    inp = _prep(inp)
    out = _prep(out)
    out_grad = _prep(out_grad)
    grad_op = GpuDnnPoolGrad(mode=op.mode)
    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        return grad_op(inp, out, out_grad, ws, stride, pad)
    # Reshape to 4D/5D with 2 non-pooling dimensions, then undo.
    result = grad_op(
        pad_dims(inp, 2, nd),
        pad_dims(out, 2, nd),
        pad_dims(out_grad, 2, nd),
        ws,
        stride,
        pad,
    )
    return unpad_dims(result, inp, 2, nd)
# Register the max-pool-gradient rewrite in both pooling databases,
# first in line (position=0), under the same tags as the forward rewrite.
pool_db.register(
    "local_gpua_pool_dnn_grad_stride",
    op_lifter([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_pool_dnn_grad_stride",
    local_optimizer([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    """Replace an average-pool gradient by GpuDnnPoolGrad (2d/3d, ignore_border)."""
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return

    def _prep(v):
        # Move the variable to this context and make it contiguous.
        return gpu_contiguous(as_gpuarray_variable(v, ctx_name))

    inp = _prep(inp)
    out_grad = _prep(out_grad)
    grad_op = GpuDnnPoolGrad(mode=op.mode)
    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        # We reuse out_grad because cuDNN does not use the value of the `out`
        # argument but still checks its shape for average pooling. This
        # has been observed in v2 and v3 as far as I know.
        return grad_op(inp, out_grad, out_grad, ws, stride, pad)
    # Reshape to 4D/5D with 2 non-pooling dimensions, then undo.
    inp_padded = pad_dims(inp, 2, nd)
    grad_padded = pad_dims(out_grad, 2, nd)
    result = grad_op(inp_padded, grad_padded, grad_padded, ws, stride, pad)
    return unpad_dims(result, inp, 2, nd)
# Register the average-pool-gradient rewrite in both pooling databases,
# first in line (position=0), under the same tags as the other pool rewrites.
pool_db.register(
    "local_gpua_avg_pool_dnn_grad_stride",
    op_lifter([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_avg_pool_dnn_grad_stride",
    local_optimizer([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
@register_opt("cudnn", "fast_compile")
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(node):
    """Swap GpuSoftmax for cuDNN's softmax via a temporary 4d view."""
    if not isinstance(node.op, GpuSoftmax):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    # GpuDnnSoftmax works on 4d input; append two broadcastable dims.
    expanded = gpu_contiguous(node.inputs[0].dimshuffle(0, 1, "x", "x"))
    result = GpuDnnSoftmax("accurate", "channel")(expanded)
    result = as_gpuarray_variable(result.dimshuffle(0, 1), result.type.context_name)
    return [result]
@register_opt("cudnn", "stabilize")
@local_optimizer([GpuElemwise])
def local_log_softmax_dnn(node):
    """Fuse log(GpuDnnSoftmax(x)) into a single log-mode GpuDnnSoftmax."""
    # Matching a GpuDnnSoftmax producer already guarantees cuDNN is present.
    if not (isinstance(node.op, GpuElemwise) and isinstance(node.op.scalar_op, Log)):
        return
    producer = node.inputs[0].owner
    if producer is None or not isinstance(producer.op, GpuDnnSoftmax):
        return
    # Only fuse when the softmax output feeds nothing but this log.
    if len(node.inputs[0].clients) != 1:
        return
    log_softmax = GpuDnnSoftmax("log", producer.op.mode)
    return [log_softmax(producer.inputs[0])]
@register_opt("cudnn", "fast_compile")
@op_lifter([LogSoftmax])
@register_opt2([LogSoftmax], "fast_compile", "cudnn")
def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
    """Lift a 2d LogSoftmax onto the GPU as cuDNN's log-mode softmax."""
    x = inputs[0]
    if x.ndim != 2:
        return
    if not dnn_available(ctx_name):
        return
    # GpuDnnSoftmax wants a 4d input; append two broadcastable axes.
    x = x.dimshuffle(0, 1, "x", "x")
    x.tag.context_name = ctx_name
    result = GpuDnnSoftmax("log", "channel")(gpu_contiguous(x))
    return [result.dimshuffle(0, 1)]
@register_opt("cudnn", "fast_compile")
@op_lifter([SoftmaxGrad])
@register_opt2([SoftmaxGrad], "cudnn", "fast_compile")
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    """Lift SoftmaxGrad to cuDNN's softmax gradient (2d inputs only)."""
    if not dnn_available(ctx_name):
        return
    reshaped = []
    for inp in inputs:
        var = as_gpuarray_variable(inp, ctx_name)
        if var.ndim != 2:
            return
        # GpuDnnSoftmaxGrad wants 4d input in "instance" layout.
        reshaped.append(var.dimshuffle(0, "x", 1, "x"))
    grad = GpuDnnSoftmaxGrad("accurate", "instance")(
        gpu_contiguous(reshaped[0]), gpu_contiguous(reshaped[1])
    )
    return [grad.dimshuffle(0, 2)]
@register_opt("cudnn")
@local_optimizer([GpuCAReduceCuda])
def local_dnn_reduction(node):
    """Replace a GpuCAReduceCuda by cuDNN's tensor reduction when legal.

    Requires cuDNN >= v6, at most 8 input dimensions, float dtypes, an
    accumulator dtype cuDNN can provide, and a reduction operator that
    maps onto cudnnReduceTensorOp_t.
    """
    if not isinstance(node.op, GpuCAReduceCuda):
        return
    if not dnn_available(node.inputs[0].type.context_name):
        return
    if version(raises=False) < 6000:
        return
    if node.inputs[0].ndim > 8:
        return
    acc_dtype = node.op._acc_dtype(node.inputs[0].dtype)
    if node.inputs[0].dtype != node.outputs[0].dtype:
        # We can mix float16 and float32, but not float64.
        if node.inputs[0].dtype == "float64" or node.outputs[0].dtype == "float64":
            return
        if acc_dtype != "float32":
            return
    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return
    # Reject accumulator dtypes that do not match what cuDNN uses for
    # each input dtype.
    if node.inputs[0].dtype == "float64" and acc_dtype != "float64":
        return
    if node.inputs[0].dtype == "float32" and acc_dtype != "float32":
        return
    if node.inputs[0].dtype == "float16" and acc_dtype == "float64":
        return

    def _identity(a):
        return a

    def _square(a):
        return GpuElemwise(theano.scalar.basic.sqr)(a)

    scal = node.op.scalar_op.name
    post = _identity

    # A pre-scalar op means the reduction is really a norm:
    # sum(sqr(x)) -> norm2 (re-square the result afterwards),
    # sum(abs(x)) -> norm1, max(abs(x)) -> absmax.
    if node.op.pre_scalar_op is not None:
        if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
            if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
                scal = "norm2"
                post = _square
            elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
                scal = "norm1"
            else:
                return
        elif isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and isinstance(
            node.op.pre_scalar_op, theano.scalar.basic.Abs
        ):
            scal = "absmax"
        else:
            return

    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
        return

    with inherit_stack_trace(node.outputs):
        ret = GpuDnnReduction(scal, node.op.axis, acc_dtype, node.op.dtype, False)(
            node.inputs[0]
        )
        return [post(ret)]
@register_opt("cudnn")
@local_optimizer([GpuMaxAndArgmax])
def local_cudnn_maxandargmax(node):
    """Compute GpuMaxAndArgmax with cuDNN's indexed reduction.

    Applies only with cuDNN >= v6, at most 8 dimensions, matching float
    input/output dtypes, and axes given in sorted order (the axis order
    changes the meaning of the returned indices).
    """
    if not isinstance(node.op, GpuMaxAndArgmax):
        return
    if not dnn_available(node.inputs[0].type.context_name):
        return
    if version(raises=False) < 6000:
        return
    if node.inputs[0].ndim > 8:
        return
    if node.inputs[0].dtype != node.outputs[0].dtype:
        return
    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return
    # order of the axes influences the output indices
    if node.op.axis is not None and tuple(sorted(node.op.axis)) != node.op.axis:
        return
    max, arg = GpuDnnReduction(
        "maximum", node.op.axis, node.outputs[0].dtype, node.outputs[0].dtype, True
    )(node.inputs[0])
    # cudnn can only return int32 indices; widen to the int64 the
    # original op produces.
    return (
        max,
        as_gpuarray_variable(arg.astype("int64"), node.outputs[1].type.context_name),
    )
@register_opt("cudnn", "fast_compile")
@op_lifter([Argmax])
@register_opt2([Argmax], "fast_compile", "cudnn")
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    """Lift Argmax to cuDNN's indexed reduction (cuDNN >= v6 only).

    Requires at most 8 dimensions, a float dtype, and axes in sorted
    order.  Only the index output of the reduction is used.
    """
    if not dnn_available(ctx_name):
        return
    if version(raises=False) < 6000:
        return
    if inputs[0].ndim > 8:
        return
    if inputs[0].dtype not in ["float16", "float32", "float64"]:
        return
    # order of the axes influences the output indices
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return
    max, arg = GpuDnnReduction(
        "maximum", op.axis, inputs[0].dtype, inputs[0].dtype, True
    )(*inputs)
    # cuDNN returns int32 indices; convert to int64 for the Argmax contract.
    return [as_gpuarray_variable(arg.astype("int64"), ctx_name)]
class NoCuDNNRaise(Optimizer):
    """Global optimizer that fails loudly when cuDNN is unavailable."""

    def apply(self, fgraph):
        """
        Raise a error if cudnn can't be used.
        """
        for ctx in list_contexts():
            if dnn_available(ctx):
                continue
            # Raise immediately so dnn_available.msg still describes this
            # failing context; we want Theano to fail, not just skip
            # this optimization.
            raise AssertionError(
                "cuDNN optimization was enabled, but Theano was not able "
                "to use it for context "
                + str(ctx)
                + ". We got this error: \n"
                + dnn_available.msg
            )
# Registered at position 0 under the "cudnn" tag so the availability
# check runs before the other GPU optimizations in the sequence.
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, "cudnn")
def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs): def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, epsilon, running_average_factor = inputs[:5] x, scale, bias, epsilon, running_average_factor = inputs[:5]
running_mean = inputs[5] if len(inputs) > 5 else None running_mean = inputs[5] if len(inputs) > 5 else None
...@@ -4740,60 +3933,6 @@ def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs): ...@@ -4740,60 +3933,6 @@ def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
return results return results
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_output(node):
    """Rebuild a GpuDnnBatchNorm so its output overwrites the input buffer."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm) or op.inplace_output:
        return
    inplace_op = GpuDnnBatchNorm(
        mode=op.mode,
        running_averages=op.running_averages,
        inplace_running_mean=op.inplace_running_mean,
        inplace_running_var=op.inplace_running_var,
        inplace_output=True,
    )
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_mean(node):
    """Rebuild a GpuDnnBatchNorm so the running mean is updated in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm):
        return
    if not op.running_averages or op.inplace_running_mean:
        return
    inplace_op = GpuDnnBatchNorm(
        mode=op.mode,
        running_averages=op.running_averages,
        inplace_running_mean=True,
        inplace_running_var=op.inplace_running_var,
        inplace_output=op.inplace_output,
    )
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_var(node):
    """Rebuild a GpuDnnBatchNorm so the running variance is updated in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm):
        return
    if not op.running_averages or op.inplace_running_var:
        return
    inplace_op = GpuDnnBatchNorm(
        mode=op.mode,
        running_averages=op.running_averages,
        inplace_running_mean=op.inplace_running_mean,
        inplace_running_var=True,
        inplace_output=op.inplace_output,
    )
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_batch_norm_inference_inplace(node):
    """Make the cuDNN batch-norm inference op write its output in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNormInference) or op.inplace:
        return
    return [GpuDnnBatchNormInference(mode=op.mode, inplace=True)(*node.inputs)]
def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs): def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
x, dy, scale, x_mean, x_invstd, epsilon = inputs x, dy, scale, x_mean, x_invstd, epsilon = inputs
......
import theano
from theano.compile import optdb
from theano.compile.ops import shape_i_op
from theano.gof.opt import Optimizer, inherit_stack_trace, local_optimizer
from theano.gpuarray.basic_ops import (
GpuAllocEmpty,
GpuArrayType,
as_gpuarray_variable,
gpu_contiguous,
infer_context_name,
)
from theano.gpuarray.dnn import (
GpuDnnBatchNorm,
GpuDnnBatchNormInference,
GpuDnnConv,
GpuDnnConvDesc,
GpuDnnConvGradI,
GpuDnnConvGradW,
GpuDnnPoolGrad,
GpuDnnReduction,
GpuDnnSoftmax,
GpuDnnSoftmaxGrad,
cudnn,
dnn_available,
dnn_conv,
dnn_conv3d,
dnn_pool,
get_precision,
local_abstractconv3d_cudnn_graph,
local_abstractconv_cudnn_graph,
version,
)
from theano.gpuarray.elemwise import GpuCAReduceCuda, GpuElemwise
from theano.gpuarray.nnet import GpuSoftmax
from theano.gpuarray.opt_util import (
alpha_merge,
inplace_allocempty,
op_lifter,
output_merge,
pad_dims,
unpad_dims,
)
from theano.gpuarray.optdb import (
gpu_seqopt,
pool_db,
pool_db2,
register_inplace,
register_opt,
register_opt2,
)
from theano.gpuarray.reduction import GpuMaxAndArgmax
from theano.gpuarray.type import list_contexts
from theano.scalar import Log
from theano.tensor import Argmax
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
from theano.tensor.nnet.abstract_conv import (
AbstractConv2d,
AbstractConv2d_gradInputs,
AbstractConv2d_gradWeights,
AbstractConv3d,
AbstractConv3d_gradInputs,
AbstractConv3d_gradWeights,
assert_conv_shape,
get_conv_output_shape,
)
from theano.tensor.signal.pool import AveragePoolGrad, MaxPoolGrad, Pool
@local_optimizer([AbstractConv2d, AbstractConv3d])
def local_abstractconv_cudnn(node):
    """Replace an abstract 2d/3d convolution on GPU data with its cuDNN graph."""
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    # Unshared (locally connected) convolutions have no cuDNN implementation.
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
        isinstance(p, tuple) for p in node.op.border_mode
    ):
        # Asymmetric padding not yet supported
        return None
    if isinstance(node.op, AbstractConv2d):
        with inherit_stack_trace(node.outputs):
            return local_abstractconv_cudnn_graph(
                node.op, ctx, node.inputs, node.outputs
            )
    elif isinstance(node.op, AbstractConv3d):
        with inherit_stack_trace(node.outputs):
            return local_abstractconv3d_cudnn_graph(
                node.op, ctx, node.inputs, node.outputs
            )
@local_optimizer(
    [AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs]
)
def local_abstractconv_cudnn_alt(node):
    """Alternative cuDNN lowering for 2d convolutions.

    Expresses a forward convolution as a cuDNN backprop (and vice versa)
    when a direction hint suggests that formulation is faster.  Returns a
    one-element list with the replacement variable, or None to skip.
    """
    if not isinstance(
        node.op, (AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs)
    ):
        return
    # Dilated convolutions need cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
        isinstance(p, tuple) for p in node.op.border_mode
    ):
        # Asymmetric padding not yet supported
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    # filter_flip selects true convolution vs. cross-correlation.
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"
    if isinstance(op, AbstractConv2d):
        if border_mode == "half" or subsample != (1, 1) or num_groups != 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )
    elif isinstance(op, AbstractConv2d_gradWeights):
        if (
            border_mode == "valid"
            and subsample == (1, 1)
            and filter_dilation == (1, 1)
            and num_groups == 1
        ):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            # Swap batch and channel axes so the weight gradient can be
            # computed as a forward cross-correlation.
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the implicit filter flip of true convolution.
                conv = conv[:, :, ::-1, ::-1]
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv2d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            # The input gradient is a full-mode convolution with the
            # opposite conv/cross mode.
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
                topgrad, kerns, out, desc
            )
        else:
            return None
    return [rval]
@local_optimizer(
    [AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs]
)
def local_abstractconv3d_cudnn_alt(node):
    """3d counterpart of ``local_abstractconv_cudnn_alt``.

    Re-expresses a 3d convolution (or one of its gradients) as a cuDNN call
    in the direction hinted to be faster.  Returns a one-element list with
    the replacement variable, or None to skip.
    """
    if not isinstance(
        node.op, (AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs)
    ):
        return
    # Dilated convolutions need cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1, 1):
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    # filter_flip selects true convolution vs. cross-correlation.
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"
    if isinstance(op, AbstractConv3d):
        if border_mode == "half" or subsample != (1, 1, 1) or num_groups > 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        rval = dnn_conv3d(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
        )
    elif isinstance(op, AbstractConv3d_gradWeights):
        if (
            border_mode == "valid"
            and subsample == (1, 1, 1)
            and filter_dilation == (1, 1, 1)
            and num_groups == 1
        ):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            # Swap batch and channel axes so the weight gradient can be
            # computed as a forward cross-correlation.
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                num_groups=num_groups,
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the implicit filter flip of true convolution.
                conv = conv[:, :, ::-1, ::-1, ::-1]
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv3d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3, 4))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            # The input gradient is a full-mode convolution with the
            # opposite conv/cross mode.
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                num_groups=num_groups,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
                topgrad, kerns, out, desc
            )
        else:
            return None
    return [rval]
@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
def local_abstractconv_gw_cudnn(node):
    """Lower an abstract gradient-of-weights convolution to its cuDNN graph."""
    op = node.op
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    if op.unshared:
        return None
    # A per-border tuple inside border_mode means asymmetric padding,
    # which cuDNN does not support yet.
    if isinstance(op.border_mode, tuple) and any(
        isinstance(pad, tuple) for pad in op.border_mode
    ):
        return None
    if isinstance(op, AbstractConv2d_gradWeights):
        build_graph = local_abstractconv_cudnn_graph
    elif isinstance(op, AbstractConv3d_gradWeights):
        build_graph = local_abstractconv3d_cudnn_graph
    else:
        return
    with inherit_stack_trace(node.outputs):
        return build_graph(op, ctx, node.inputs, node.outputs)
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
def local_abstractconv_gi_cudnn(node):
    """Lower an abstract gradient-of-inputs convolution to its cuDNN graph."""
    op = node.op
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    if op.unshared:
        return None
    # A per-border tuple inside border_mode means asymmetric padding,
    # which cuDNN does not support yet.
    if isinstance(op.border_mode, tuple) and any(
        isinstance(pad, tuple) for pad in op.border_mode
    ):
        return None
    if isinstance(op, AbstractConv2d_gradInputs):
        build_graph = local_abstractconv_cudnn_graph
    elif isinstance(op, AbstractConv3d_gradInputs):
        build_graph = local_abstractconv3d_cudnn_graph
    else:
        return
    with inherit_stack_trace(node.outputs):
        return build_graph(op, ctx, node.inputs, node.outputs)
@inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs):
    """Rebuild GpuDnnConv with inplace=True so it reuses its output buffer."""
    new_op = GpuDnnConv(
        algo=node.op.algo, inplace=True, num_groups=node.op.num_groups
    )
    return [new_op(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
    """Rebuild GpuDnnConvGradW with inplace=True so it reuses its output buffer."""
    new_op = GpuDnnConvGradW(
        algo=node.op.algo, inplace=True, num_groups=node.op.num_groups
    )
    return [new_op(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    """Rebuild GpuDnnConvGradI with inplace=True so it reuses its output buffer."""
    new_op = GpuDnnConvGradI(
        algo=node.op.algo, inplace=True, num_groups=node.op.num_groups
    )
    return [new_op(*inputs)]
# Register the in-place conv rewrites late (position 70) so they run after
# the graph has been fully lowered to cuDNN ops.
optdb.register(
    "local_dnna_conv_inplace",
    theano.tensor.opt.in2out(
        local_dnn_conv_inplace,
        local_dnn_convgw_inplace,
        local_dnn_convgi_inplace,
        name="local_dnna_conv_inplace",
    ),
    70.0,
    "fast_run",
    "inplace",
    "gpuarray",
    "cudnn",
)
@register_opt("cudnn")
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    # Fold surrounding scalar alpha/beta multipliers into the cuDNN conv call.
    return [GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
    # Fold surrounding scalar alpha/beta multipliers into the weight-grad call.
    return [GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs):
    # Fold surrounding scalar alpha/beta multipliers into the input-grad call.
    return [GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs):
    # Merge an addition into the conv's output buffer; cuDNN needs it contiguous.
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs):
    # Merge an addition into the weight-grad output buffer (made contiguous).
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs):
    # Merge an addition into the input-grad output buffer (made contiguous).
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
    return [GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
    """Lift a Pool op to cuDNN pooling (2d or 3d, ignore_border only)."""
    if not dnn_available(ctx_name):
        return
    # cuDNN pooling always ignores partial windows at the border.
    if not op.ignore_border:
        return
    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    mode = op.mode
    # dnn_pool expects exactly 2 non-pooling dimensions
    if img.ndim == nd + 2:
        return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        img_padded = pad_dims(img, 2, nd)
        ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
        return unpad_dims(ret_padded, img, 2, nd)
# Register at position 0 so the cuDNN pooling lifter is tried before the
# generic GPU pooling optimizations (both lifter and local-opt forms).
pool_db.register(
    "local_gpua_pool_dnn_alternative",
    op_lifter([Pool])(local_gpua_pool_dnn_alternative),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_pool_dnn_alternative",
    local_optimizer([Pool])(local_gpua_pool_dnn_alternative),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    """Lift a MaxPoolGrad op to GpuDnnPoolGrad (2d or 3d, ignore_border only)."""
    if not dnn_available(ctx_name):
        return
    # cuDNN pooling always ignores partial windows at the border.
    if not op.ignore_border:
        return
    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    mode = op.mode
    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        return GpuDnnPoolGrad(mode=mode)(inp, out, out_grad, ws, stride, pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_padded = pad_dims(out, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = GpuDnnPoolGrad(mode=mode)(
            inp_padded, out_padded, out_grad_padded, ws, stride, pad
        )
        return unpad_dims(ret_padded, inp, 2, nd)
# Register at position 0 so the cuDNN max-pool gradient lifter is tried
# before the generic GPU pooling optimizations.
pool_db.register(
    "local_gpua_pool_dnn_grad_stride",
    op_lifter([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_pool_dnn_grad_stride",
    local_optimizer([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    """Lift an AveragePoolGrad op to GpuDnnPoolGrad (2d or 3d, ignore_border only)."""
    if not dnn_available(ctx_name):
        return
    # cuDNN pooling always ignores partial windows at the border.
    if not op.ignore_border:
        return
    inp, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    mode = op.mode
    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        # We reuse out_grad because cuDNN does not use the value of the `out`
        # argument but still checks its shape for average pooling. This
        # has been observed in v2 and v3 as far as I know.
        return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride, pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = GpuDnnPoolGrad(mode=mode)(
            inp_padded, out_grad_padded, out_grad_padded, ws, stride, pad
        )
        return unpad_dims(ret_padded, inp, 2, nd)
# Register at position 0 so the cuDNN average-pool gradient lifter is tried
# before the generic GPU pooling optimizations.
pool_db.register(
    "local_gpua_avg_pool_dnn_grad_stride",
    op_lifter([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_avg_pool_dnn_grad_stride",
    local_optimizer([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
@register_opt("cudnn", "fast_compile")
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(node):
    """Swap a GpuSoftmax for the cuDNN softmax when cuDNN is usable."""
    if not isinstance(node.op, GpuSoftmax):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    # cuDNN softmax operates on 4D tensors; append two broadcastable dims.
    x = gpu_contiguous(node.inputs[0].dimshuffle(0, 1, "x", "x"))
    softmaxed = GpuDnnSoftmax("accurate", "channel")(x)
    result = as_gpuarray_variable(
        softmaxed.dimshuffle(0, 1), softmaxed.type.context_name
    )
    return [result]
@register_opt("cudnn", "stabilize")
@local_optimizer([GpuElemwise])
def local_log_softmax_dnn(node):
    """Fuse log(GpuDnnSoftmax(x)) into a single numerically-stable log-softmax."""
    # This looks for GpuDnnSoftmax so we know that we have cudnn.
    if (
        isinstance(node.op, GpuElemwise)
        and isinstance(node.op.scalar_op, Log)
        and node.inputs[0].owner
        and isinstance(node.inputs[0].owner.op, GpuDnnSoftmax)
        # Only fuse when the softmax output has no other consumers.
        and len(node.inputs[0].clients) == 1
    ):
        softmax_node = node.inputs[0].owner
        new_softmax = GpuDnnSoftmax("log", softmax_node.op.mode)
        return [new_softmax(softmax_node.inputs[0])]
@register_opt("cudnn", "fast_compile")
@op_lifter([LogSoftmax])
@register_opt2([LogSoftmax], "fast_compile", "cudnn")
def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
    """Lift a CPU LogSoftmax on a 2D input to cuDNN's log-softmax."""
    # Transform the input in the format expected by GpuDnnSoftmax
    inp = inputs[0]
    if inp.ndim != 2:
        return
    if not dnn_available(ctx_name):
        return
    # cuDNN softmax operates on 4D tensors; append two broadcastable dims.
    inp = inp.dimshuffle(0, 1, "x", "x")
    inp.tag.context_name = ctx_name
    # Apply GpuDnnSoftmax and return the result
    out = GpuDnnSoftmax("log", "channel")(gpu_contiguous(inp))
    return [out.dimshuffle(0, 1)]
@register_opt("cudnn", "fast_compile")
@op_lifter([SoftmaxGrad])
@register_opt2([SoftmaxGrad], "cudnn", "fast_compile")
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    """Lift a CPU SoftmaxGrad on 2D inputs to GpuDnnSoftmaxGrad."""
    if not dnn_available(ctx_name):
        return
    reshaped = []
    for inp in inputs:
        var = as_gpuarray_variable(inp, ctx_name)
        if var.ndim != 2:
            return
        # cuDNN expects 4D tensors; insert broadcastable dims.
        reshaped.append(var.dimshuffle(0, "x", 1, "x"))
    grad = GpuDnnSoftmaxGrad("accurate", "instance")(
        gpu_contiguous(reshaped[0]), gpu_contiguous(reshaped[1])
    )
    return [grad.dimshuffle(0, 2)]
@register_opt("cudnn")
@local_optimizer([GpuCAReduceCuda])
def local_dnn_reduction(node):
    """Replace a GpuCAReduceCuda with a cuDNN tensor reduction when supported.

    Maps Theano scalar reduce ops (and their pre-scalar ops) onto cuDNN's
    reduce-tensor operations, e.g. sum-of-squares -> norm2, sum-of-abs ->
    norm1, max-of-abs -> absmax.
    """
    if not isinstance(node.op, GpuCAReduceCuda):
        return
    if not dnn_available(node.inputs[0].type.context_name):
        return
    # cuDNN tensor reductions require v6+.
    if version(raises=False) < 6000:
        return
    # cuDNN tensor descriptors support at most 8 dimensions.
    if node.inputs[0].ndim > 8:
        return
    acc_dtype = node.op._acc_dtype(node.inputs[0].dtype)
    if node.inputs[0].dtype != node.outputs[0].dtype:
        # We can mix float16 and float32, but not float64.
        if node.inputs[0].dtype == "float64" or node.outputs[0].dtype == "float64":
            return
        if acc_dtype != "float32":
            return
    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return
    # cuDNN constrains which accumulator dtype goes with each input dtype.
    if node.inputs[0].dtype == "float64" and acc_dtype != "float64":
        return
    if node.inputs[0].dtype == "float32" and acc_dtype != "float32":
        return
    if node.inputs[0].dtype == "float16" and acc_dtype == "float64":
        return
    def _identity(a):
        return a
    def _square(a):
        return GpuElemwise(theano.scalar.basic.sqr)(a)
    scal = node.op.scalar_op.name
    post = _identity
    if node.op.pre_scalar_op is not None:
        if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
            if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
                # sum(sqr(x)) == norm2(x)**2, so square the result afterwards.
                scal = "norm2"
                post = _square
            elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
                scal = "norm1"
            else:
                return
        elif isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and isinstance(
            node.op.pre_scalar_op, theano.scalar.basic.Abs
        ):
            scal = "absmax"
        else:
            return
    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
        return
    with inherit_stack_trace(node.outputs):
        ret = GpuDnnReduction(scal, node.op.axis, acc_dtype, node.op.dtype, False)(
            node.inputs[0]
        )
        return [post(ret)]
@register_opt("cudnn")
@local_optimizer([GpuMaxAndArgmax])
def local_cudnn_maxandargmax(node):
    """Replace GpuMaxAndArgmax with a cuDNN max-reduction returning indices."""
    if not isinstance(node.op, GpuMaxAndArgmax):
        return
    if not dnn_available(node.inputs[0].type.context_name):
        return
    # GpuDnnReduction with indices requires cuDNN v6 or newer.
    if version(raises=False) < 6000:
        return
    # cuDNN tensor descriptors support at most 8 dimensions.
    if node.inputs[0].ndim > 8:
        return
    if node.inputs[0].dtype != node.outputs[0].dtype:
        return
    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return
    # order of the axes influences the output indices
    if node.op.axis is not None and tuple(sorted(node.op.axis)) != node.op.axis:
        return
    max, arg = GpuDnnReduction(
        "maximum", node.op.axis, node.outputs[0].dtype, node.outputs[0].dtype, True
    )(node.inputs[0])
    # cudnn can only return int32 indices
    return (
        max,
        as_gpuarray_variable(arg.astype("int64"), node.outputs[1].type.context_name),
    )
@register_opt("cudnn", "fast_compile")
@op_lifter([Argmax])
@register_opt2([Argmax], "fast_compile", "cudnn")
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    """Lift a CPU ``Argmax`` to a cuDNN max-reduction, keeping only the indices."""
    if not dnn_available(ctx_name):
        return
    # GpuDnnReduction with indices requires cuDNN v6 or newer.
    if version(raises=False) < 6000:
        return
    # cuDNN tensor descriptors support at most 8 dimensions.
    if inputs[0].ndim > 8:
        return
    if inputs[0].dtype not in ["float16", "float32", "float64"]:
        return
    # order of the axes influences the output indices
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return
    max, arg = GpuDnnReduction(
        "maximum", op.axis, inputs[0].dtype, inputs[0].dtype, True
    )(*inputs)
    # cuDNN emits int32 indices; cast to int64 to match Argmax's output type.
    return [as_gpuarray_variable(arg.astype("int64"), ctx_name)]
class NoCuDNNRaise(Optimizer):
    """Global optimizer that fails loudly when cuDNN was requested but unusable."""

    def apply(self, fgraph):
        """
        Raise an error if cuDNN can't be used.
        """
        for c in list_contexts():
            if not dnn_available(c):
                # Make an assert error as we want Theano to fail, not
                # just skip this optimization.
                raise AssertionError(
                    "cuDNN optimization was enabled, but Theano was not able "
                    "to use it for context "
                    + str(c)
                    + ". We got this error: \n"
                    + dnn_available.msg
                )
# Position 0 so the availability check runs before any other GPU optimization.
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, "cudnn")
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_output(node):
    # Rebuild the op with inplace_output=True so the normalized output
    # reuses its buffer instead of allocating a new one.
    if isinstance(node.op, GpuDnnBatchNorm) and not node.op.inplace_output:
        return GpuDnnBatchNorm(
            mode=node.op.mode,
            running_averages=node.op.running_averages,
            inplace_running_mean=node.op.inplace_running_mean,
            inplace_running_var=node.op.inplace_running_var,
            inplace_output=True,
        )(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_mean(node):
    # Rebuild the op with inplace_running_mean=True so the running mean
    # buffer is updated in place (only when running averages are tracked).
    if (
        isinstance(node.op, GpuDnnBatchNorm)
        and node.op.running_averages
        and not node.op.inplace_running_mean
    ):
        return GpuDnnBatchNorm(
            mode=node.op.mode,
            running_averages=node.op.running_averages,
            inplace_running_mean=True,
            inplace_running_var=node.op.inplace_running_var,
            inplace_output=node.op.inplace_output,
        )(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_var(node):
    # Rebuild the op with inplace_running_var=True so the running variance
    # buffer is updated in place (only when running averages are tracked).
    if (
        isinstance(node.op, GpuDnnBatchNorm)
        and node.op.running_averages
        and not node.op.inplace_running_var
    ):
        return GpuDnnBatchNorm(
            mode=node.op.mode,
            running_averages=node.op.running_averages,
            inplace_running_mean=node.op.inplace_running_mean,
            inplace_running_var=True,
            inplace_output=node.op.inplace_output,
        )(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_batch_norm_inference_inplace(node):
    # Rebuild the inference op with inplace=True so it reuses its output buffer.
    if isinstance(node.op, GpuDnnBatchNormInference) and not node.op.inplace:
        return [GpuDnnBatchNormInference(mode=node.op.mode, inplace=True)(*node.inputs)]
import copy import copy
import numpy as np import numpy as np
import theano
from six.moves import StringIO from six.moves import StringIO
from theano import Apply, scalar, Op import theano
from theano import Apply, Op, scalar
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.scalar import Scalar, Composite from theano.scalar import Composite, Scalar
from theano.tensor.elemwise import Elemwise, DimShuffle, CAReduceDtype from theano.scalar.basic import complex_types, upgrade_to_float_no_complex
from theano.scalar.basic_scipy import Erfinv, Erfcinv from theano.scalar.basic_scipy import Erfcinv, Erfinv
from theano.scalar.basic import upgrade_to_float_no_complex, complex_types from theano.tensor.elemwise import CAReduceDtype, DimShuffle, Elemwise
try: try:
import pygpu import pygpu
from pygpu import gpuarray from pygpu import gpuarray
from pygpu.tools import ArrayArg
from pygpu.reduction import ReductionKernel
from pygpu.gpuarray import dtype_to_typecode from pygpu.gpuarray import dtype_to_typecode
from pygpu.reduction import ReductionKernel
from pygpu.tools import ArrayArg
except ImportError: except ImportError:
pass pass
from .basic_ops import ( from .basic_ops import (
as_gpuarray_variable,
HideC,
GpuKernelBase, GpuKernelBase,
HideC,
Kernel, Kernel,
as_gpuarray_variable,
infer_context_name, infer_context_name,
) )
from .type import GpuArrayType, gpu_context_type
from .fp16_help import load_w, write_w from .fp16_help import load_w, write_w
from .type import GpuArrayType, gpu_context_type
def make_argument(v, name): def make_argument(v, name):
......
from theano import Apply, Op from theano import Apply, Op
from theano.tensor.extra_ops import CumOp from theano.tensor.extra_ops import CumOp
try: try:
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
import theano.scalar as scalar
from theano.gof import ParamsType
from .basic_ops import ( from .basic_ops import (
as_gpuarray_variable,
GpuKernelBase, GpuKernelBase,
Kernel,
GpuReshape, GpuReshape,
infer_context_name, Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name,
) )
from .opt import register_opt, op_lifter, register_opt2 from .opt import op_lifter, register_opt, register_opt2
from .type import gpu_context_type from .type import gpu_context_type
from theano.gof import ParamsType
import theano.scalar as scalar
class GpuCumOp(GpuKernelBase, Op): class GpuCumOp(GpuKernelBase, Op):
......
...@@ -2,18 +2,17 @@ import numpy as np ...@@ -2,18 +2,17 @@ import numpy as np
import theano import theano
import theano.tensor as tt import theano.tensor as tt
from theano import Op from theano import Op
from theano.gradient import DisconnectedType
from theano.gpuarray.basic_ops import ( from theano.gpuarray.basic_ops import (
gpu_contiguous,
as_gpuarray_variable, as_gpuarray_variable,
gpu_contiguous,
infer_context_name, infer_context_name,
) )
from theano.gpuarray.opt import op_lifter, register_opt, register_opt2
from theano.gpuarray.type import GpuArrayType from theano.gpuarray.type import GpuArrayType
from theano.gradient import DisconnectedType
from theano.tensor.fft import IRFFTOp from theano.tensor.fft import IRFFTOp
from theano.gpuarray.opt import register_opt, op_lifter, register_opt2
try: try:
import pygpu import pygpu
......
import warnings import warnings
import pkg_resources
import numpy as np import numpy as np
import pkg_resources
from numpy.linalg.linalg import LinAlgError from numpy.linalg.linalg import LinAlgError
import theano import theano
from theano import Op, config, tensor from theano import Op, config, tensor
from theano.scalar import bool as bool_t
from theano.gof import COp, ParamsType from theano.gof import COp, ParamsType
from theano.gpuarray import GpuArrayType from theano.gpuarray.basic_ops import (
from .basic_ops import (
CGpuKernelBase, CGpuKernelBase,
as_gpuarray_variable, as_gpuarray_variable,
gpu_contiguous, gpu_contiguous,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name, infer_context_name,
) )
from .type import gpu_context_type from theano.gpuarray.type import GpuArrayType, gpu_context_type
from theano.scalar import bool as bool_t
try: try:
import pygpu import pygpu
from pygpu.basic import triu, tril from pygpu.basic import tril, triu
pygpu_available = True pygpu_available = True
except ImportError: except ImportError:
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
import warnings import warnings
try: try:
import pygpu import pygpu
except ImportError: except ImportError:
...@@ -11,20 +12,20 @@ import theano ...@@ -11,20 +12,20 @@ import theano
import theano.sandbox.multinomial import theano.sandbox.multinomial
from theano import Apply from theano import Apply
from theano.gof import Op from theano.gof import Op
from theano.scalar import as_scalar
from theano.tensor import NotScalarConstantError, get_scalar_constant_value from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from .basic_ops import ( from .basic_ops import (
as_gpuarray_variable,
infer_context_name,
GpuKernelBase, GpuKernelBase,
Kernel, Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name,
) )
from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType
from .elemwise import GpuDimShuffle from .elemwise import GpuDimShuffle
from theano.scalar import as_scalar from .fp16_help import load_w, work_dtype, write_w
from .fp16_help import write_w, load_w, work_dtype from .opt import op_lifter, register_opt, register_opt2
from .type import GpuArrayType
class GPUAMultinomialFromUniform(GpuKernelBase, Op): class GPUAMultinomialFromUniform(GpuKernelBase, Op):
......
import theano.tensor as tt import theano.tensor as tt
from theano import Apply, Op
from theano import Op, Apply
from theano.gof import ParamsType from theano.gof import ParamsType
from theano.tensor.nnet.neighbours import Images2Neibs from theano.tensor.nnet.neighbours import Images2Neibs
try: try:
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
from theano.gpuarray.basic_ops import ( from theano.gpuarray.basic_ops import (
as_gpuarray_variable,
GpuKernelBase, GpuKernelBase,
Kernel, Kernel,
as_gpuarray_variable,
infer_context_name, infer_context_name,
) )
from theano.gpuarray.type import GpuArrayType, gpu_context_type from theano.gpuarray.type import GpuArrayType, gpu_context_type
......
import numpy as np import numpy as np
from theano import Op, Apply
from six import StringIO from six import StringIO
from theano import Apply, Op
try: try:
import pygpu import pygpu
from pygpu import gpuarray from pygpu import gpuarray
...@@ -10,14 +11,14 @@ except ImportError: ...@@ -10,14 +11,14 @@ except ImportError:
pass pass
from .basic_ops import ( from .basic_ops import (
as_gpuarray_variable,
GpuKernelBase, GpuKernelBase,
Kernel, Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name, infer_context_name,
) )
from .fp16_help import load_w, work_dtype, write_w
from .type import GpuArrayType from .type import GpuArrayType
from .fp16_help import work_dtype, load_w, write_w
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
......
import copy import copy
import logging import logging
import pdb import pdb
import time
import sys import sys
import time
from collections import Counter
import numpy as np import numpy as np
import theano import theano
from theano import tensor, scalar, gof, config
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (
local_optimizer,
EquilibriumDB,
TopoOptimizer,
LocalGroupDB,
SequenceDB,
Optimizer,
DB,
toolbox,
graph,
)
from theano.gof.opt import LocalMetaOptimizer, copy_stack_trace, inherit_stack_trace
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
from theano.scalar.basic import Scalar, Pow, Cast
from theano.scalar.basic import log, neg, true_div
from theano.scalar.basic_scipy import Erfinv, Erfcinv
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet import bn, conv3d2d
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (
BaseAbstractConv,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs,
get_conv_output_shape,
)
from theano.tensor.nnet.neighbours import Images2Neibs
from theano.tensor.nnet.ctc import ConnectionistTemporalClassification
import theano.tensor.nlinalg as nlinalg import theano.tensor.nlinalg as nlinalg
import theano.tensor.signal.pool as pool import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg import theano.tensor.slinalg as slinalg
from collections import Counter from theano import config, gof, scalar, tensor
from theano.breakpoint import PdbBreakpoint from theano.breakpoint import PdbBreakpoint
from theano.compile import optdb
from .type import ( from theano.compile.ops import shape_i
GpuArrayType, from theano.gof import Optimizer, graph, local_optimizer, toolbox
GpuArrayConstant, from theano.gof.opt import LocalMetaOptimizer, copy_stack_trace, inherit_stack_trace
get_context, from theano.gpuarray.basic_ops import (
ContextNotDefined,
move_to_gpu,
)
from .basic_ops import (
as_gpuarray_variable,
infer_context_name,
host_from_gpu,
GpuToGpu,
HostFromGpu,
GpuFromHost,
GpuSplit,
GpuContiguous,
gpu_contiguous,
GpuAlloc, GpuAlloc,
GpuAllocEmpty, GpuAllocEmpty,
GpuReshape, GpuContiguous,
GpuEye, GpuEye,
GpuFromHost,
GpuJoin,
GpuReshape,
GpuSplit,
GpuToGpu,
GpuTri, GpuTri,
HostFromGpu,
as_gpuarray_variable,
gpu_contiguous,
gpu_join, gpu_join,
GpuJoin, host_from_gpu,
infer_context_name,
) )
from .blas import ( from theano.gpuarray.blas import (
gpu_dot22, GpuCorr3dMM,
GpuCorr3dMM_gradInputs,
GpuCorr3dMM_gradWeights,
GpuCorrMM,
GpuCorrMM_gradInputs,
GpuCorrMM_gradWeights,
GpuGemm, GpuGemm,
GpuGer,
GpuGemmBatch, GpuGemmBatch,
gpugemm_no_inplace, GpuGer,
gpu_dot22,
gpugemm_inplace, gpugemm_inplace,
gpugemm_no_inplace,
gpugemmbatch_no_inplace, gpugemmbatch_no_inplace,
gpugemv_no_inplace,
gpugemv_inplace, gpugemv_inplace,
GpuCorrMM, gpugemv_no_inplace,
GpuCorrMM_gradInputs,
GpuCorrMM_gradWeights,
GpuCorr3dMM,
GpuCorr3dMM_gradInputs,
GpuCorr3dMM_gradWeights,
)
from .pool import (
GpuPool,
GpuMaxPoolGrad,
GpuAveragePoolGrad,
GpuMaxPoolRop,
GpuDownsampleFactorMaxGradGrad,
) )
from .blocksparse import ( from theano.gpuarray.blocksparse import (
GpuSparseBlockGemv, GpuSparseBlockGemv,
GpuSparseBlockOuter, GpuSparseBlockOuter,
gpu_sparse_block_outer,
gpu_sparse_block_outer_inplace,
gpu_sparse_block_gemv, gpu_sparse_block_gemv,
gpu_sparse_block_gemv_inplace, gpu_sparse_block_gemv_inplace,
gpu_sparse_block_outer,
gpu_sparse_block_outer_inplace,
) )
from .nnet import ( from theano.gpuarray.ctc import GpuConnectionistTemporalClassification
gpu_crossentropy_softmax_1hot_with_bias_dx, from theano.gpuarray.dnn_opt import (
gpu_crossentropy_softmax_argmax_1hot_with_bias, local_abstractconv3d_cudnn_alt,
gpu_softmax_with_bias, local_abstractconv_cudnn,
gpu_softmax, local_abstractconv_cudnn_alt,
local_abstractconv_gi_cudnn,
local_abstractconv_gw_cudnn,
) )
from .elemwise import ( from theano.gpuarray.elemwise import (
GpuElemwise,
GpuDimShuffle,
GpuCAReduceCuda,
GpuCAReduceCPY, GpuCAReduceCPY,
gpu_erfinv, GpuCAReduceCuda,
GpuDimShuffle,
GpuElemwise,
gpu_erfcinv, gpu_erfcinv,
gpu_erfinv,
max_inputs_to_GpuElemwise, max_inputs_to_GpuElemwise,
) )
from .subtensor import ( from theano.gpuarray.linalg import (
GpuIncSubtensor, MATRIX_STRUCTURES_SOLVE,
GpuSubtensor, GpuCholesky,
GpuAdvancedSubtensor, GpuCublasTriangularSolve,
GpuAdvancedSubtensor1, GpuCusolverSolve,
GpuMagmaCholesky,
GpuMagmaEigh,
GpuMagmaMatrixInverse,
cublas_available,
cusolver_available,
gpu_qr,
gpu_svd,
)
from theano.gpuarray.neighbours import GpuImages2Neibs
from theano.gpuarray.nnet import (
gpu_crossentropy_softmax_1hot_with_bias_dx,
gpu_crossentropy_softmax_argmax_1hot_with_bias,
gpu_softmax,
gpu_softmax_with_bias,
)
from theano.gpuarray.opt_util import (
alpha_merge,
op_lifter,
output_merge,
pad_dims,
safe_to_cpu,
safe_to_gpu,
unpad_dims,
)
from theano.gpuarray.optdb import (
GraphToGPUDB,
abstract_batch_norm_db,
abstract_batch_norm_db2,
abstract_batch_norm_groupopt,
abstractconv_groupopt,
gpu_cut_copies,
gpu_optimizer,
gpu_seqopt,
matrix_ops_db,
matrix_ops_db2,
pool_db,
pool_db2,
register_inplace,
register_opt,
register_opt2,
)
from theano.gpuarray.pool import (
GpuAveragePoolGrad,
GpuDownsampleFactorMaxGradGrad,
GpuMaxPoolGrad,
GpuMaxPoolRop,
GpuPool,
)
from theano.gpuarray.reduction import GpuMaxAndArgmax
from theano.gpuarray.subtensor import (
GpuAdvancedIncSubtensor, GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1_dev20,
GpuAdvancedSubtensor,
GpuAdvancedSubtensor1,
GpuAllocDiag, GpuAllocDiag,
GpuExtractDiag, GpuExtractDiag,
GpuIncSubtensor,
GpuSubtensor,
) )
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from theano.gpuarray.type import (
from .reduction import GpuMaxAndArgmax ContextNotDefined,
from .linalg import ( GpuArrayConstant,
GpuCusolverSolve, GpuArrayType,
MATRIX_STRUCTURES_SOLVE, get_context,
GpuCholesky, move_to_gpu,
cusolver_available,
GpuMagmaMatrixInverse,
gpu_svd,
GpuMagmaCholesky,
gpu_qr,
GpuMagmaEigh,
GpuCublasTriangularSolve,
cublas_available,
) )
from .neighbours import GpuImages2Neibs from theano.ifelse import IfElse
from .ctc import GpuConnectionistTemporalClassification from theano.misc.ordered_set import OrderedSet
from theano.scalar.basic import Cast, Pow, Scalar, log, neg, true_div
_logger = logging.getLogger("theano.gpuarray.opt") from theano.scalar.basic_scipy import Erfcinv, Erfinv
from theano.scan_module import scan_op, scan_opt, scan_utils
from theano.tensor.nnet import bn, conv3d2d
gpu_optimizer = EquilibriumDB() from theano.tensor.nnet.abstract_conv import (
gpu_cut_copies = EquilibriumDB() AbstractConv2d,
AbstractConv2d_gradInputs,
# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB. AbstractConv2d_gradWeights,
gpu_optimizer2 = EquilibriumDB() AbstractConv3d,
AbstractConv3d_gradInputs,
AbstractConv3d_gradWeights,
class GraphToGPUDB(DB): BaseAbstractConv,
""" get_conv_output_shape,
Retrieves the list local optimizers based on the optimizer flag's value )
from EquilibriumOptimizer by calling the method query. from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.conv import ConvOp
""" from theano.tensor.nnet.ctc import ConnectionistTemporalClassification
from theano.tensor.nnet.neighbours import Images2Neibs
    def query(self, *tags, **kwtags):
        """Build a ``GraphToGPU`` optimizer from the local optimizers that
        ``gpu_optimizer2`` returns for the given tags."""
        opt = gpu_optimizer2.query(*tags, **kwtags)
        return GraphToGPU(opt.local_optimizers_all, opt.local_optimizers_map)
_logger = logging.getLogger("theano.gpuarray.opt")
gpu_seqopt = SequenceDB()
gpu_seqopt.register( gpu_seqopt.register(
"gpuarray_graph_optimization", "gpuarray_graph_optimization",
...@@ -200,68 +196,6 @@ gpu_seqopt.register( ...@@ -200,68 +196,6 @@ gpu_seqopt.register(
"gpuarray_cut_transfers", gpu_cut_copies, 2, "fast_compile", "fast_run", "gpuarray" "gpuarray_cut_transfers", gpu_cut_copies, 2, "fast_compile", "fast_run", "gpuarray"
) )
# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register(
"gpuarray_opt",
gpu_seqopt,
optdb.__position__.get("add_destroy_handler", 49.5) - 1,
"gpuarray",
)
def register_opt(*tags, **kwargs):
    """Decorator that registers a local optimizer in ``gpu_optimizer``.

    Parameters
    ----------
    tags : str
        Extra optimization tags to register under (in addition to
        ``"fast_run"`` and ``"gpuarray"``).
    name : str, optional
        Registration name; defaults to the optimizer's ``__name__``.
    """

    def f(local_opt):
        # Use pop() with a default: the previous
        # `(kwargs and kwargs.pop("name"))` form raised KeyError whenever
        # kwargs was non-empty but did not contain "name".
        name = kwargs.pop("name", None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return f
def register_opt2(tracks, *tags, **kwargs):
    """
    Decorator for the new GraphToGPU optimizer.
    Takes an extra parameter(Op) compared to register_opt decorator.

    Parameters
    ----------
    tracks : List of Op class Or Op instance or None
        The Node's Op to which optimization is being applied.
    tags : String
        The optimization tag to which the optimizer will be registered.
    name : str, optional
        Registration name; defaults to the optimizer's ``__name__``.

    """

    def f(local_opt):
        # pop() with a default avoids a KeyError when kwargs holds other
        # keys but no "name" (the old `kwargs and kwargs.pop("name")` did not).
        name = kwargs.pop("name", None) or local_opt.__name__
        if isinstance(local_opt, theano.gof.DB):
            # A whole optimizer database can be registered as-is.
            opt = local_opt
        else:
            opt = theano.gof.local_optimizer(tracks)(local_opt)
        gpu_optimizer2.register(name, opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return f
def register_inplace(*tags, **kwargs):
    """Decorator that registers an in-place local optimizer in the global
    ``optdb`` (position 60, with a warning callback on failed replacements).

    Parameters
    ----------
    tags : str
        Extra optimization tags to register under.
    name : str, optional
        Registration name; defaults to the optimizer's ``__name__``.
    """

    def f(local_opt):
        # pop() with a default: the previous `kwargs and kwargs.pop("name")`
        # raised KeyError for non-empty kwargs without a "name" key.
        name = kwargs.pop("name", None) or local_opt.__name__
        optdb.register(
            name,
            TopoOptimizer(local_opt, failure_callback=TopoOptimizer.warn_inplace),
            60,
            "fast_run",
            "inplace",
            "gpuarray",
            *tags,
        )
        return local_opt

    return f
register_opt("fast_compile")(theano.tensor.opt.local_track_shape_i) register_opt("fast_compile")(theano.tensor.opt.local_track_shape_i)
register_opt(final_opt=True, name="gpua_constant_folding")(tensor.opt.constant_folding) register_opt(final_opt=True, name="gpua_constant_folding")(tensor.opt.constant_folding)
gpu_optimizer.register( gpu_optimizer.register(
...@@ -269,108 +203,11 @@ gpu_optimizer.register( ...@@ -269,108 +203,11 @@ gpu_optimizer.register(
) )
# Define a few operations to use in optimizations,
# in order to avoid introducin new CPU Ops, or useless ones.
def safe_to_gpu(x, ctx_name):
    """Move `x` to the GPU context `ctx_name` when it is a CPU tensor;
    anything else is returned unchanged."""
    if not isinstance(x.type, tensor.TensorType):
        # Not a host tensor (e.g. already a GPU variable): leave as-is.
        return x
    return GpuFromHost(ctx_name)(x)
def safe_to_cpu(x):
    """Transfer `x` back to the host when it lives on a GPU; otherwise
    return it unchanged."""
    return x.transfer("cpu") if isinstance(x.type, GpuArrayType) else x
gpu_log = GpuElemwise(log) gpu_log = GpuElemwise(log)
gpu_neg = GpuElemwise(neg) gpu_neg = GpuElemwise(neg)
gpu_true_div = GpuElemwise(true_div) gpu_true_div = GpuElemwise(true_div)
def op_lifter(OP, cuda_only=False):
    """
    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))

    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)

    Parameters
    ----------
    OP : list of Op classes
        The CPU Op types this lifter matches (checked with ``type(node.op) in OP``).
    cuda_only : bool
        If True, only lift when the target context is a CUDA context.

    Returns
    -------
    callable
        A decorator turning a ``maker(op, context_name, inputs, outputs)``
        function into a registered local optimizer.
    """

    def f(maker):
        def local_opt(node):
            if type(node.op) in OP:

                # Either one of our inputs is on the gpu or
                # all of our clients are on the gpu
                replace = False
                # TODO: Maybe set context_name with infer_context_name()?
                context_name = None
                # We replace if any input is a host_from_gpu
                for i in node.inputs:
                    if i.owner and i.owner.op == host_from_gpu and move_to_gpu(i):
                        context_name = i.owner.inputs[0].type.context_name
                        replace = True
                        break

                if not replace:
                    # We replace if *all* clients are on the GPU
                    clients = [c for o in node.outputs for c in o.clients]
                    replace = len(clients) != 0
                    for c, idx in clients:
                        if c == "output" or not isinstance(c.op, GpuFromHost):
                            replace = False
                    # TODO: check that the clients want the same context?
                    if replace:
                        # All clients are GpuFromHost and we have at least one
                        context_name = clients[0][0].op.context_name

                # Check if we should replace
                if (
                    not replace
                    or (cuda_only and get_context(context_name).kind != b"cuda")
                    or any(["complex" in getattr(i, "dtype", "") for i in node.inputs])
                ):
                    return False

                # tag the inputs with the context in case
                # the context was derived from the outputs
                for i in node.inputs:
                    i.tag.context_name = context_name

                new_op = maker(node.op, context_name, node.inputs, node.outputs)

                # This is needed as sometimes new_op inherits from OP.
                if new_op and new_op != node.op:
                    if isinstance(new_op, theano.Op):
                        new_outputs = new_op(*node.inputs, return_list=True)
                        to_cpu_fn = safe_to_cpu
                    elif isinstance(new_op, (tuple, list)):
                        new_outputs = new_op
                        to_cpu_fn = safe_to_cpu
                    else:  # suppose it is a variable on the GPU
                        new_outputs = [new_op]

                        def to_cpu_fn(x):
                            return x.transfer("cpu")

                    # copy stack traces onto gpu outputs
                    # also copy the stack traces onto HostFromGpu outputs
                    on_cpu = []
                    for old_output, new_output in zip(node.outputs, new_outputs):
                        copy_stack_trace(old_output, new_output)
                        cpu = to_cpu_fn(new_output)
                        on_cpu.append(cpu)
                        copy_stack_trace(old_output, cpu)
                    return on_cpu
            return False

        local_opt.__name__ = maker.__name__
        return local_optimizer(OP)(local_opt)

    return f
class InputToGpuOptimizer(Optimizer): class InputToGpuOptimizer(Optimizer):
""" """
Transfer the input to the gpu to start the rolling wave. Transfer the input to the gpu to start the rolling wave.
...@@ -409,7 +246,6 @@ class InputToGpuOptimizer(Optimizer): ...@@ -409,7 +246,6 @@ class InputToGpuOptimizer(Optimizer):
raise raise
# If there is no context tag and no default context # If there is no context tag and no default context
# then it stays on the CPU # then it stays on the CPU
pass
gpu_seqopt.register( gpu_seqopt.register(
...@@ -2507,9 +2343,6 @@ def local_gpu_pool(op, ctx_name, inputs, outputs): ...@@ -2507,9 +2343,6 @@ def local_gpu_pool(op, ctx_name, inputs, outputs):
return unpad_dims(ret_padded, inp, 2, nd) return unpad_dims(ret_padded, inp, 2, nd)
pool_db = LocalGroupDB()
pool_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
pool_db2.__name__ = "pool_db2"
lifter = op_lifter([pool.Pool])(local_gpu_pool) lifter = op_lifter([pool.Pool])(local_gpu_pool)
pool_db.register( pool_db.register(
"local_gpu_pool", lifter, "gpuarray", "fast_compile", "fast_run", position=1 "local_gpu_pool", lifter, "gpuarray", "fast_compile", "fast_run", position=1
...@@ -2903,10 +2736,6 @@ def local_gpu_cholesky(op, context_name, inputs, outputs): ...@@ -2903,10 +2736,6 @@ def local_gpu_cholesky(op, context_name, inputs, outputs):
return op return op
matrix_ops_db = LocalGroupDB()
matrix_ops_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
matrix_ops_db2.__name__ = "matrix_ops_db2"
# For Cholesky decomposition, magma 2.2 is slower than cusolver 8 (tested for # For Cholesky decomposition, magma 2.2 is slower than cusolver 8 (tested for
# matrices of size 1000). Thus, cusolver is prioritized during graph # matrices of size 1000). Thus, cusolver is prioritized during graph
# optimizations. To explicitly use magma, you should disable cusolver using # optimizations. To explicitly use magma, you should disable cusolver using
...@@ -3094,24 +2923,6 @@ optdb.register( ...@@ -3094,24 +2923,6 @@ optdb.register(
"scan", "scan",
) )
# Register GPU convolution implementation
# They are tried in a specific order so we can control
# which ones take precedence over others.
abstractconv_groupopt = theano.gof.optdb.LocalGroupDB()
abstractconv_groupopt.__name__ = "gpuarray_abstractconv_opts"
register_opt("fast_compile")(abstractconv_groupopt)
# We import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn
from .dnn import ( # noqa: E402
local_abstractconv_cudnn,
local_abstractconv_gw_cudnn,
local_abstractconv_gi_cudnn,
local_abstractconv_cudnn_alt,
local_abstractconv3d_cudnn_alt,
)
abstractconv_groupopt.register( abstractconv_groupopt.register(
"local_abstractconv_dnn", "local_abstractconv_dnn",
local_abstractconv_cudnn, local_abstractconv_cudnn,
...@@ -3240,19 +3051,15 @@ abstractconv_groupopt.register("conv_metaopt", conv_metaopt, "conv_meta", positi ...@@ -3240,19 +3051,15 @@ abstractconv_groupopt.register("conv_metaopt", conv_metaopt, "conv_meta", positi
# We import these opts here instead of at the top of this file # We import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn # to avoid a circular dependency problem with dnn
from .dnn import ( # noqa: E402 from theano.gpuarray.dnn import ( # noqa: E402
local_abstract_batch_norm_inference_cudnn,
local_abstract_batch_norm_train_cudnn, local_abstract_batch_norm_train_cudnn,
local_abstract_batch_norm_train_grad_cudnn, local_abstract_batch_norm_train_grad_cudnn,
local_abstract_batch_norm_inference_cudnn,
) )
abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
register_opt("fast_compile")(abstract_batch_norm_groupopt) register_opt("fast_compile")(abstract_batch_norm_groupopt)
abstract_batch_norm_db = LocalGroupDB()
abstract_batch_norm_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
abstract_batch_norm_db2.__name__ = "abstract_batch_norm_db2"
register_opt("fast_compile", name="abstract_batch_norm_db")(abstract_batch_norm_db) register_opt("fast_compile", name="abstract_batch_norm_db")(abstract_batch_norm_db)
register_opt2( register_opt2(
[ [
......
...@@ -2,13 +2,38 @@ from functools import wraps ...@@ -2,13 +2,38 @@ from functools import wraps
import numpy as np import numpy as np
from theano import tensor, scalar as scal, Constant from theano import Constant
from theano import scalar as scal
from theano import tensor
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.gof.opt import inherit_stack_trace from theano.gof.op import Op
from theano.tensor import DimShuffle, get_scalar_constant_value, NotScalarConstantError from theano.gof.opt import copy_stack_trace, inherit_stack_trace
from theano.gpuarray.basic_ops import (
GpuAllocEmpty,
GpuFromHost,
GpuReshape,
HostFromGpu,
host_from_gpu,
)
from theano.gpuarray.elemwise import GpuDimShuffle, GpuElemwise
from theano.gpuarray.type import GpuArrayType, get_context, move_to_gpu
from theano.tensor import DimShuffle, NotScalarConstantError, get_scalar_constant_value
# Define a few operations to use in optimizations,
# in order to avoid introducin new CPU Ops, or useless ones.
def safe_to_gpu(x, ctx_name):
if isinstance(x.type, tensor.TensorType):
return GpuFromHost(ctx_name)(x)
else:
return x
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, GpuReshape def safe_to_cpu(x):
from .elemwise import GpuDimShuffle, GpuElemwise if isinstance(x.type, GpuArrayType):
return x.transfer("cpu")
else:
return x
def grab_cpu_scalar(v, nd): def grab_cpu_scalar(v, nd):
...@@ -420,3 +445,84 @@ def unpad_dims(output, input, leftdims, rightdims): ...@@ -420,3 +445,84 @@ def unpad_dims(output, input, leftdims, rightdims):
# restore the output to the original shape # restore the output to the original shape
outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:]) outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
return GpuReshape(input.ndim)(output, outshp) return GpuReshape(input.ndim)(output, outshp)
def op_lifter(OP, cuda_only=False):
    """
    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))

    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)

    Parameters
    ----------
    OP : list of Op classes
        The CPU Op types this lifter matches (checked with ``type(node.op) in OP``).
    cuda_only : bool
        If True, only lift when the target context is a CUDA context.

    Returns
    -------
    callable
        A decorator turning a ``maker(op, context_name, inputs, outputs)``
        function into a registered local optimizer.
    """

    def f(maker):
        def local_opt(node):
            if type(node.op) in OP:

                # Either one of our inputs is on the gpu or
                # all of our clients are on the gpu
                replace = False
                # TODO: Maybe set context_name with infer_context_name()?
                context_name = None
                # We replace if any input is a host_from_gpu
                for i in node.inputs:
                    if i.owner and i.owner.op == host_from_gpu and move_to_gpu(i):
                        context_name = i.owner.inputs[0].type.context_name
                        replace = True
                        break

                if not replace:
                    # We replace if *all* clients are on the GPU
                    clients = [c for o in node.outputs for c in o.clients]
                    replace = len(clients) != 0
                    for c, idx in clients:
                        if c == "output" or not isinstance(c.op, GpuFromHost):
                            replace = False
                    # TODO: check that the clients want the same context?
                    if replace:
                        # All clients are GpuFromHost and we have at least one
                        context_name = clients[0][0].op.context_name

                # Check if we should replace
                if (
                    not replace
                    or (cuda_only and get_context(context_name).kind != b"cuda")
                    or any(["complex" in getattr(i, "dtype", "") for i in node.inputs])
                ):
                    return False

                # tag the inputs with the context in case
                # the context was derived from the outputs
                for i in node.inputs:
                    i.tag.context_name = context_name

                new_op = maker(node.op, context_name, node.inputs, node.outputs)

                # This is needed as sometimes new_op inherits from OP.
                if new_op and new_op != node.op:
                    if isinstance(new_op, Op):
                        new_outputs = new_op(*node.inputs, return_list=True)
                        to_cpu_fn = safe_to_cpu
                    elif isinstance(new_op, (tuple, list)):
                        new_outputs = new_op
                        to_cpu_fn = safe_to_cpu
                    else:  # suppose it is a variable on the GPU
                        new_outputs = [new_op]

                        def to_cpu_fn(x):
                            return x.transfer("cpu")

                    # copy stack traces onto gpu outputs
                    # also copy the stack traces onto HostFromGpu outputs
                    on_cpu = []
                    for old_output, new_output in zip(node.outputs, new_outputs):
                        copy_stack_trace(old_output, new_output)
                        cpu = to_cpu_fn(new_output)
                        on_cpu.append(cpu)
                        copy_stack_trace(old_output, cpu)
                    return on_cpu
            return False

        local_opt.__name__ = maker.__name__
        return local_optimizer(OP)(local_opt)

    return f
from theano.compile import optdb
from theano.gof.opt import GraphToGPULocalOptGroup, TopoOptimizer, local_optimizer
from theano.gof.optdb import DB, EquilibriumDB, LocalGroupDB, SequenceDB
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB.
gpu_optimizer2 = EquilibriumDB()
gpu_seqopt = SequenceDB()
# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register(
"gpuarray_opt",
gpu_seqopt,
optdb.__position__.get("add_destroy_handler", 49.5) - 1,
"gpuarray",
)
pool_db = LocalGroupDB()
pool_db2 = LocalGroupDB(local_opt=GraphToGPULocalOptGroup)
pool_db2.__name__ = "pool_db2"
matrix_ops_db = LocalGroupDB()
matrix_ops_db2 = LocalGroupDB(local_opt=GraphToGPULocalOptGroup)
matrix_ops_db2.__name__ = "matrix_ops_db2"
abstract_batch_norm_db = LocalGroupDB()
abstract_batch_norm_db2 = LocalGroupDB(local_opt=GraphToGPULocalOptGroup)
abstract_batch_norm_db2.__name__ = "abstract_batch_norm_db2"
abstract_batch_norm_groupopt = LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
def register_opt(*tags, **kwargs):
    """Decorator that registers a local optimizer in ``gpu_optimizer``.

    Parameters
    ----------
    tags : str
        Extra optimization tags to register under (in addition to
        ``"fast_run"`` and ``"gpuarray"``).
    name : str, optional
        Registration name; defaults to the optimizer's ``__name__``.
    """

    def f(local_opt):
        # pop() with a default: `kwargs and kwargs.pop("name")` raised
        # KeyError when kwargs was non-empty but had no "name" key.
        name = kwargs.pop("name", None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return f
def register_opt2(tracks, *tags, **kwargs):
    """
    Decorator for the new GraphToGPU optimizer.
    Takes an extra parameter(Op) compared to register_opt decorator.

    Parameters
    ----------
    tracks : List of Op class Or Op instance or None
        The Node's Op to which optimization is being applied.
    tags : String
        The optimization tag to which the optimizer will be registered.
    name : str, optional
        Registration name; defaults to the optimizer's ``__name__``.

    """

    def f(local_opt):
        # pop() with a default avoids a KeyError when kwargs holds other
        # keys but no "name" (the old `kwargs and kwargs.pop("name")` did not).
        name = kwargs.pop("name", None) or local_opt.__name__
        if isinstance(local_opt, DB):
            # A whole optimizer database can be registered as-is.
            opt = local_opt
        else:
            opt = local_optimizer(tracks)(local_opt)
        gpu_optimizer2.register(name, opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return f
def register_inplace(*tags, **kwargs):
    """Decorator that registers an in-place local optimizer in the global
    ``optdb`` (position 60, with a warning callback on failed replacements).

    Parameters
    ----------
    tags : str
        Extra optimization tags to register under.
    name : str, optional
        Registration name; defaults to the optimizer's ``__name__``.
    """

    def f(local_opt):
        # pop() with a default: the previous `kwargs and kwargs.pop("name")`
        # raised KeyError for non-empty kwargs without a "name" key.
        name = kwargs.pop("name", None) or local_opt.__name__
        optdb.register(
            name,
            TopoOptimizer(local_opt, failure_callback=TopoOptimizer.warn_inplace),
            60,
            "fast_run",
            "inplace",
            "gpuarray",
            *tags,
        )
        return local_opt

    return f
# Register GPU convolution implementation
# They are tried in a specific order so we can control
# which ones take precedence over others.
abstractconv_groupopt = LocalGroupDB()
abstractconv_groupopt.__name__ = "gpuarray_abstractconv_opts"
register_opt("fast_compile")(abstractconv_groupopt)
class GraphToGPUDB(DB):
    """
    Retrieves the list local optimizers based on the optimizer flag's value
    from EquilibriumOptimizer by calling the method query.

    """

    def query(self, *tags, **kwtags):
        """Return a ``GraphToGPU`` optimizer built from the local optimizers
        that ``gpu_optimizer2`` yields for the given tags."""
        # Imported here to avoid a circular dependency between
        # theano.gpuarray.optdb and theano.gpuarray.opt.
        from theano.gpuarray.opt import GraphToGPU

        opt = gpu_optimizer2.query(*tags, **kwtags)
        return GraphToGPU(opt.local_optimizers_all, opt.local_optimizers_map)
...@@ -5,14 +5,15 @@ from theano.scalar import bool as bool_t ...@@ -5,14 +5,15 @@ from theano.scalar import bool as bool_t
from theano.tensor.basic import as_tensor_variable from theano.tensor.basic import as_tensor_variable
from theano.tensor.signal.pool import Pool, PoolingMode_t from theano.tensor.signal.pool import Pool, PoolingMode_t
from .type import gpu_context_type
from .basic_ops import ( from .basic_ops import (
CGpuKernelBase, CGpuKernelBase,
infer_context_name,
gpuarray_helper_inc_dir,
as_gpuarray_variable, as_gpuarray_variable,
gpu_contiguous, gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
) )
from .type import gpu_context_type
try: try:
import pygpu import pygpu
......
from theano.gof import Op, Apply from theano.gof import Apply, Op
from theano.gof.type import Generic from theano.gof.type import Generic
from .basic_ops import infer_context_name, as_gpuarray_variable, gpuarray_helper_inc_dir from .basic_ops import as_gpuarray_variable, gpuarray_helper_inc_dir, infer_context_name
from .type import GpuArrayType from .type import GpuArrayType
try: try:
import pygpu import pygpu
except ImportError: except ImportError:
......
...@@ -9,21 +9,21 @@ http://www.iro.umontreal.ca/~simardr/ssj/indexe.html ...@@ -9,21 +9,21 @@ http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
from theano import Apply, tensor from theano import Apply, tensor
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.sandbox.rng_mrg import mrg_uniform_base, mrg_uniform from theano.sandbox.rng_mrg import mrg_uniform, mrg_uniform_base
from theano.tensor import as_tensor_variable, get_vector_length
from theano.scalar import int32 as int_t from theano.scalar import int32 as int_t
from theano.tensor import as_tensor_variable, get_vector_length
from .basic_ops import ( from .basic_ops import (
GpuFromHost,
GpuKernelBase, GpuKernelBase,
Kernel, Kernel,
infer_context_name,
GpuFromHost,
host_from_gpu,
as_gpuarray_variable, as_gpuarray_variable,
host_from_gpu,
infer_context_name,
) )
from .type import GpuArrayType, gpu_context_type
from .fp16_help import write_w from .fp16_help import write_w
from .opt import register_opt, register_opt2 from .opt import register_opt, register_opt2
from .type import GpuArrayType, gpu_context_type
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
......
...@@ -11,13 +11,14 @@ from theano.tensor.sort import TopKOp ...@@ -11,13 +11,14 @@ from theano.tensor.sort import TopKOp
from .basic_ops import ( from .basic_ops import (
GpuKernelBase, GpuKernelBase,
Kernel, Kernel,
infer_context_name,
as_gpuarray_variable, as_gpuarray_variable,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name,
) )
from .opt import register_opt, op_lifter, register_opt2 from .opt import op_lifter, register_opt, register_opt2
from .type import GpuArrayType from .type import GpuArrayType
try: try:
import pygpu import pygpu
import pygpu.gpuarray as ga import pygpu.gpuarray as ga
......
import numpy as np import numpy as np
import theano.tensor as tt
from six import integer_types from six import integer_types
from six.moves import StringIO from six.moves import StringIO
from theano import gof, Op import theano.tensor as tt
from theano import Op, gof
from theano.gof import ParamsType from theano.gof import ParamsType
from theano.gradient import grad_not_implemented from theano.gradient import grad_not_implemented
from theano.scalar import bool as bool_t
from theano.scalar import int32 as int_t
from theano.scalar import uint32 as size_t
from theano.tensor import AllocDiag from theano.tensor import AllocDiag
from theano.tensor.subtensor import ( from theano.tensor.subtensor import (
IncSubtensor,
AdvancedSubtensor,
Subtensor,
AdvancedIncSubtensor, AdvancedIncSubtensor,
AdvancedSubtensor,
AdvancedSubtensor1, AdvancedSubtensor1,
IncSubtensor,
Subtensor,
get_idx_list, get_idx_list,
) )
from theano.scalar import bool as bool_t, int32 as int_t, uint32 as size_t
try: try:
import pygpu import pygpu
...@@ -25,16 +26,17 @@ try: ...@@ -25,16 +26,17 @@ try:
except ImportError: except ImportError:
pass pass
from theano.gpuarray.type import GpuArrayType, gpu_context_type
from theano.gpuarray.basic_ops import ( from theano.gpuarray.basic_ops import (
as_gpuarray_variable,
HideC,
GpuKernelBase, GpuKernelBase,
HideC,
Kernel, Kernel,
as_gpuarray_variable,
gpu_contiguous,
gpuarray_helper_inc_dir, gpuarray_helper_inc_dir,
infer_context_name, infer_context_name,
gpu_contiguous,
) )
from theano.gpuarray.type import GpuArrayType, gpu_context_type
iadd_reg = {} iadd_reg = {}
......
import sys
import os import os
import sys
import warnings import warnings
import six.moves.copyreg as copyreg
import numpy as np import numpy as np
import six.moves.copyreg as copyreg
import theano import theano
from theano import Constant, Type, Variable, config, scalar, tensor
from theano.compile import SharedVariable
from theano.tensor.type import TensorType from theano.tensor.type import TensorType
from theano.tensor.var import _tensor_py_operators from theano.tensor.var import _tensor_py_operators
from theano import Type, Variable, Constant, tensor, config, scalar
from theano.compile import SharedVariable
# Make sure this is importable even if pygpu is absent # Make sure this is importable even if pygpu is absent
# (it will not work though) # (it will not work though)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论