Commit 3e86efec authored by Frédéric Bastien, committed by GitHub

Merge pull request #6143 from affanv14/metaopt

Implement Meta-optimizer on new backend
......@@ -1116,3 +1116,26 @@ import theano and print the config variable, as in:
The number of user stack levels to keep for variables during Theano
compilation. If higher than 0, the Theano internal stack trace is
also kept.
.. attribute:: config.metaopt.verbose
Int value, default: 0
The verbosity level of the meta-optimizer: 0 for silent,
1 to warn only when an op cannot be meta-optimized,
2 for full output of the separate timings and the selected implementation.
.. attribute:: config.metaopt.optimizer_excluding
Default: ``""``
A list of optimizer tags that we don't want included in the Meta-optimizer.
If there are multiple tags, separate them with ':'.
.. attribute:: config.metaopt.optimizer_including
Default: ``""``
A list of optimizer tags that we want included in the Meta-optimizer.
If there are multiple tags, separate them with ':'.
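For example, to get full meta-optimizer output while skipping the
conv3d2d-based candidates (an illustrative sketch; pick the values that
match your use case):

    import theano
    theano.config.metaopt.verbose = 2
    theano.config.metaopt.optimizer_excluding = 'conv3d2d'
    # Equivalently, from the environment:
    # THEANO_FLAGS='metaopt.verbose=2,metaopt.optimizer_excluding=conv3d2d'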
......@@ -1456,10 +1456,23 @@ AddConfigVar('blas.ldflags',
AddConfigVar(
'metaopt.verbose',
"Enable verbose output for meta optimizers",
theano.configparser.BoolParam(False),
"0 for silent, 1 for only warnings, 2 for full output with"
"timings and selected implementation",
theano.configparser.IntParam(0),
in_c_key=False)
AddConfigVar('metaopt.optimizer_excluding',
("exclude optimizers with these tags. "
"Separate tags with ':'."),
StrParam(""),
in_c_key=False)
AddConfigVar('metaopt.optimizer_including',
("include optimizers with these tags. "
"Separate tags with ':'."),
StrParam(""),
in_c_key=False)
AddConfigVar('profile',
"If VM should collect profile information",
BoolParam(False),
......
......@@ -1131,13 +1131,20 @@ class LocalMetaOptimizer(LocalOptimizer):
"""
-def __init__(self, tracks=None, optimizers=()):
-    self._tracks = tracks
-    self.optimizers = list(optimizers)
+def __init__(self):
+    self.verbose = config.metaopt.verbose
+    self.track_dict = defaultdict(lambda: [])
+    self.tag_dict = defaultdict(lambda: [])
+    self._tracks = []
+    self.optimizers = []
-def register(self, optimizer):
+def register(self, optimizer, tag_list):
self.optimizers.append(optimizer)
for c in optimizer.tracks():
self.track_dict[c].append(optimizer)
self._tracks.append(c)
for tag in tag_list:
self.tag_dict[tag].append(optimizer)
def tracks(self):
return self._tracks
......@@ -1167,39 +1174,40 @@ class LocalMetaOptimizer(LocalOptimizer):
missing.difference_update(givens.keys())
# ensure we have data for all input variables that need it
if missing:
-if self.verbose:
+if self.verbose > 0:
print(("%s cannot meta-optimize %s, "
"%d of %d input shapes unknown" %
(self.__class__.__name__, node, len(missing), node.nin)))
return
# now we can apply the different optimizations in turn,
# compile the resulting subgraphs and time their execution
-if self.verbose:
+if self.verbose > 1:
print(("%s meta-optimizing %s (%d choices):" %
-(self.__class__.__name__, node, len(self.optimizers))))
+(self.__class__.__name__, node, len(self.get_opts(node)))))
timings = []
-for opt in self.optimizers:
+for opt in self.get_opts(node):
outputs = opt.transform(node)
if outputs:
try:
fn = theano.function([], outputs, givens=givens,
on_unused_input='ignore')
-timing = min(self.time_call(fn) for _ in range(3))
+fn.trust_input = True
+timing = min(self.time_call(fn) for _ in range(2))
except Exception as e:
-if self.verbose:
+if self.verbose > 0:
print("* %s: exception" % opt, e)
continue
else:
-if self.verbose:
+if self.verbose > 1:
print("* %s: %.5g sec" % (opt, timing))
timings.append((timing, outputs, opt))
else:
-if self.verbose:
+if self.verbose > 0:
print("* %s: not applicable" % opt)
# finally, we choose the fastest one
if timings:
timings.sort()
-if self.verbose:
+if self.verbose > 1:
print("= %s" % timings[0][2])
return timings[0][1]
return
......@@ -1213,6 +1221,12 @@ class LocalMetaOptimizer(LocalOptimizer):
"""
raise NotImplementedError()
def get_opts(self, node):
"""
Can be overrided to change the way opts are selected
"""
return self.track_dict[type(node.op)]
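For illustration, a minimal sketch of how registration feeds get_opts; the
optimizer names here are hypothetical placeholders, not part of this patch:

    # register() indexes an optimizer both by the op types it tracks
    # and by every tag it is registered under.
    meta = SomeMetaOptimizer()  # any LocalMetaOptimizer subclass
    meta.register(local_opt_a, ['default', 'conv_gemm'])
    meta.register(local_opt_b, ['default', 'alternative'])
    # For a node whose op type both optimizers track:
    #   meta.track_dict[type(node.op)] -> [local_opt_a, local_opt_b]
    #   meta.tag_dict['alternative']   -> [local_opt_b]
    # get_opts(node) returns track_dict[type(node.op)] unless overridden.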
def time_call(self, fn):
start = time.time()
fn()
......@@ -2313,7 +2327,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.final_optimizers = []
self.cleanup_optimizers = []
self.tracks_on_change_inputs = tracks_on_change_inputs
for opt in optimizers:
if isinstance(opt, LocalOptimizer):
if opt.tracks() is None:
......
......@@ -1014,7 +1014,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
-elif (border_mode == 'full' and subsample == (1, 1) and dilation == (1, 1) and
+elif (border_mode == 'full' and subsample == (1, 1) and
direction_hint != 'forward!' and num_groups == 1):
# Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution.
......@@ -1024,11 +1024,11 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
out_shp = (shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
-shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1,
-shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1)
+shape_i(img, 2, fgraph) + (shape_i(kerns, 2, fgraph) - 1) * dilation[0],
+shape_i(img, 3, fgraph) + (shape_i(kerns, 3, fgraph) - 1) * dilation[1])
out_shp = assert_conv_shape(out_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
-desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=(1, 1),
+desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=dilation,
conv_mode=conv_mode, precision=precision)(kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc)
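As a sanity check on the new shape arithmetic (a standalone sketch, not
part of the patch): a kernel of length k with dilation d covers
(k - 1) * d + 1 input positions, so the 'full' output length grows by
(k - 1) * d:

    def full_conv_len(in_len, k_len, dil):
        # 'full' mode adds (effective kernel length - 1) outputs, where
        # the effective (dilated) kernel length is (k_len - 1) * dil + 1.
        return in_len + (k_len - 1) * dil

    assert full_conv_len(5, 3, 1) == 7  # matches the old '+ k - 1' formula
    assert full_conv_len(5, 3, 2) == 9  # dilation stretches the kernel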
......@@ -1133,7 +1133,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
-elif (border_mode == 'full' and subsample == (1, 1, 1) and dilation == (1, 1, 1) and
+elif (border_mode == 'full' and subsample == (1, 1, 1) and
direction_hint != 'forward!'):
# Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution.
......@@ -1143,12 +1143,12 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
out_shp = (shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
-shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1,
-shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1,
-shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1)
+shape_i(img, 2, fgraph) + (shape_i(kerns, 2, fgraph) - 1) * dilation[0],
+shape_i(img, 3, fgraph) + (shape_i(kerns, 3, fgraph) - 1) * dilation[1],
+shape_i(img, 4, fgraph) + (shape_i(kerns, 4, fgraph) - 1) * dilation[2])
out_shp = assert_conv_shape(out_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
-desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1),
+desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), dilation=dilation,
conv_mode=conv_mode, precision=precision)(kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc)
......@@ -2888,6 +2888,215 @@ def local_abstractconv_cudnn(node):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
def local_abstractconv_cudnn_alt(node):
if(not isinstance(node.op, (AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))):
return
if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if not dnn_available(inp1.type.context_name):
return
op = node.op
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
precision = get_precision(None, [inp1, inp2])
if node.op.filter_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
if isinstance(op, AbstractConv2d):
if border_mode == 'half' or subsample != (1, 1) or num_groups != 1:
return None
if border_mode == 'full':
direction_hint = 'bprop inputs'
elif border_mode == 'valid' and filter_dilation == (1, 1):
direction_hint = 'bprop weights'
else:
return None
rval = dnn_conv(inp1, inp2,
border_mode=border_mode,
subsample=subsample,
dilation=filter_dilation,
direction_hint=direction_hint,
conv_mode=conv_mode,
num_groups=num_groups)
elif isinstance(op, AbstractConv2d_gradWeights):
if(border_mode == 'valid' and subsample == (1, 1) and
filter_dilation == (1, 1) and num_groups == 1):
img = gpu_contiguous(inp1)
topgrad = gpu_contiguous(inp2)
ctx_name = infer_context_name(img, topgrad)
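# Swap the batch and channel axes: the weight gradient of a valid
# convolution equals a forward cross-correlation between the transposed
# image and the transposed topgrad, which GpuDnnConv computes directly.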
img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
out_shp = get_conv_output_shape(ishape,
tshape,
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
out_shp = assert_conv_shape(out_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
desc = GpuDnnConvDesc(border_mode=border_mode,
subsample=subsample,
dilation=filter_dilation,
conv_mode='cross',
precision=precision)(out.shape)
conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
if conv_mode == 'conv':
conv = conv[:, :, ::-1, ::-1]
rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
else:
return None
elif isinstance(op, AbstractConv2d_gradInputs):
if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1:
kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
topgrad = gpu_contiguous(inp2)
ctx_name = infer_context_name(kerns, topgrad)
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
desc = GpuDnnConvDesc(border_mode='full',
subsample=subsample,
dilation=filter_dilation,
conv_mode=conv_mode,
precision=precision)(kerns.shape)
tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
shape = get_conv_output_shape(tshape,
kshape,
border_mode='full',
subsample=subsample,
filter_dilation=filter_dilation)
shape = assert_conv_shape(shape)
out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
rval = GpuDnnConv(algo=None, num_groups=num_groups)(topgrad, kerns, out, desc)
else:
return None
return [rval]
@local_optimizer([AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs])
def local_abstractconv3d_cudnn_alt(node):
if(not isinstance(node.op, (AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs))):
return
if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1, 1):
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if not dnn_available(inp1.type.context_name):
return
op = node.op
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
precision = get_precision(None, [inp1, inp2])
if node.op.filter_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
if isinstance(op, AbstractConv3d):
if border_mode == 'half' or subsample != (1, 1, 1):
return None
if border_mode == 'full':
direction_hint = 'bprop inputs'
elif border_mode == 'valid' and filter_dilation == (1, 1, 1):
direction_hint = 'bprop weights'
else:
return None
rval = dnn_conv3d(inp1, inp2,
border_mode=border_mode,
subsample=subsample,
dilation=filter_dilation,
direction_hint=direction_hint,
conv_mode=conv_mode)
elif isinstance(op, AbstractConv3d_gradWeights):
if(border_mode == 'valid' and subsample == (1, 1, 1) and
filter_dilation == (1, 1, 1)):
img = gpu_contiguous(inp1)
topgrad = gpu_contiguous(inp2)
ctx_name = infer_context_name(img, topgrad)
img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4))
ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
out_shp = get_conv_output_shape(ishape,
tshape,
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
out_shp = assert_conv_shape(out_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
desc = GpuDnnConvDesc(border_mode=border_mode,
subsample=subsample,
dilation=filter_dilation,
conv_mode='cross',
precision=precision)(out.shape)
conv = GpuDnnConv(algo=None)(img, topgrad, out, desc)
if conv_mode == 'conv':
conv = conv[:, :, ::-1, ::-1, ::-1]
rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
else:
return None
elif isinstance(op, AbstractConv3d_gradInputs):
if border_mode == 'valid' and subsample == (1, 1, 1):
kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3, 4))
topgrad = gpu_contiguous(inp2)
ctx_name = infer_context_name(kerns, topgrad)
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
desc = GpuDnnConvDesc(border_mode='full',
subsample=subsample,
dilation=filter_dilation,
conv_mode=conv_mode,
precision=precision)(kerns.shape)
tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
shape = get_conv_output_shape(tshape,
kshape,
border_mode='full',
subsample=subsample,
filter_dilation=filter_dilation)
shape = assert_conv_shape(shape)
out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
rval = GpuDnnConv(algo=None)(topgrad, kerns, out, desc)
else:
return None
return [rval]
@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
def local_abstractconv_gw_cudnn(node):
ctx = infer_context_name(*node.inputs)
......
......@@ -15,6 +15,7 @@ from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
LocalGroupDB,
SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import LocalMetaOptimizer
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
......@@ -23,7 +24,7 @@ from theano.scalar.basic import log, neg, true_div
from theano.scalar.basic_scipy import Erfinv, Erfcinv
from theano.scan_module import scan_utils, scan_op, scan_opt
-from theano.tensor.nnet import bn
+from theano.tensor.nnet import bn, conv3d2d
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
......@@ -32,12 +33,14 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
-AbstractConv3d_gradInputs)
+AbstractConv3d_gradInputs,
+get_conv_output_shape)
from theano.tensor.nnet.neighbours import Images2Neibs
from theano.tensor.nnet.ctc import ConnectionistTemporalClassification
import theano.tensor.nlinalg as nlinalg
import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg
from collections import Counter
from theano.tests.breakpoint import PdbBreakpoint
......@@ -1625,6 +1628,49 @@ def local_abstractconv_gemm(node):
return [rval]
@local_optimizer([AbstractConv2d])
def local_abstractconv_gemm_alt(node):
if not isinstance(node.op, AbstractConv2d):
return None
img, kern = node.inputs
if (not isinstance(img.type, GpuArrayType) or
not isinstance(kern.type, GpuArrayType)):
return None
ctx = infer_context_name(img, kern)
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
if border_mode == 'full' and subsample == (1, 1) and num_groups == 1:
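# A full convolution is the inputs-gradient of a valid convolution,
# so it can be dispatched directly to GpuCorrMM_gradInputs.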
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
kern = kern.dimshuffle(1, 0, 2, 3)
rval = GpuCorrMM_gradInputs('valid',
subsample,
filter_dilation)(
gpu_contiguous(kern), gpu_contiguous(img))
elif (border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
num_groups == 1):
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
rval = GpuCorrMM_gradWeights(border_mode,
subsample,
filter_dilation)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
rval = as_gpuarray_variable(rval.dimshuffle(1, 0, 2, 3),
context_name=ctx)
else:
return None
return [rval]
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_gemm(node):
if not isinstance(node.op, AbstractConv3d):
......@@ -1694,6 +1740,74 @@ def local_abstractconv3d_gemm(node):
return [rval]
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_alt(node):
if not isinstance(node.op, AbstractConv3d):
return None
img, kern = node.inputs
if (not isinstance(img.type, GpuArrayType) or
not isinstance(kern.type, GpuArrayType)):
return None
ctx = infer_context_name(img, kern)
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
if ((border_mode == 'full') and (subsample == (1, 1, 1))):
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
kern = kern.dimshuffle(1, 0, 2, 3, 4)
rval = GpuCorr3dMM_gradInputs('valid',
subsample,
filter_dilation)(
gpu_contiguous(kern), gpu_contiguous(img))
elif(subsample == (1, 1, 1) and filter_dilation == (1, 1, 1) and
border_mode == 'valid'):
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
rval = GpuCorr3dMM_gradWeights(border_mode,
subsample,
filter_dilation)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4)))
rval = as_gpuarray_variable(rval.dimshuffle(1, 0, 2, 3, 4),
context_name=ctx)
else:
return None
return [rval]
@local_optimizer([AbstractConv3d])
def local_abstractconv3d2d(node):
if not isinstance(node.op, AbstractConv3d):
return None
img, kern = node.inputs
if (not isinstance(img.type, GpuArrayType) or
not isinstance(kern.type, GpuArrayType)):
return None
ctx = infer_context_name(img, kern)
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
if subsample == (1, 1, 1) and filter_dilation == (1, 1, 1):
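# conv3d2d.conv3d expects signals laid out as (batch, depth, channels,
# rows, cols) while the abstract interface uses (batch, channels, depth,
# rows, cols); reorder_array swaps the channel and depth axes on the
# way in and again on the way out.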
reorder_array = [0, 2, 1, 3, 4]
rval = conv3d2d.conv3d(gpu_contiguous(img.dimshuffle(*reorder_array)),
gpu_contiguous(kern.dimshuffle(*reorder_array)),
[node.op.imshp[i] for i in reorder_array],
[node.op.kshp[i] for i in reorder_array],
border_mode=border_mode)
rval = as_gpuarray_variable(rval.dimshuffle(*reorder_array),
context_name=ctx)
return [rval]
else:
return None
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweights_gemm(node):
if not isinstance(node.op, AbstractConv2d_gradWeights):
......@@ -1716,6 +1830,70 @@ def local_abstractconv_gradweights_gemm(node):
return [rval]
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gemm_gradweights_alt(node):
if not isinstance(node.op, AbstractConv2d_gradWeights):
return None
img, topgrad, shape = node.inputs
if not isinstance(img.type, GpuArrayType) or \
not isinstance(topgrad.type, GpuArrayType):
return None
ctx = infer_context_name(img, topgrad)
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
if(border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
num_groups == 1):
rval = GpuCorrMM(border_mode,
subsample,
filter_dilation)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3)))
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1]
rval = rval.dimshuffle(1, 0, 2, 3)
rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
rval = as_gpuarray_variable(rval, context_name=ctx)
return [rval]
else:
return None
@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gemm_gradweights_alt(node):
if not isinstance(node.op, AbstractConv3d_gradWeights):
return None
img, topgrad, shape = node.inputs
if not isinstance(img.type, GpuArrayType) or \
not isinstance(topgrad.type, GpuArrayType):
return None
ctx = infer_context_name(img, topgrad)
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
if border_mode == 'valid' and subsample == (1, 1, 1) and filter_dilation == (1, 1, 1):
rval = GpuCorr3dMM(border_mode,
subsample,
filter_dilation)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4)))
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1, ::-1]
rval = rval.dimshuffle(1, 0, 2, 3, 4)
rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
rval = as_gpuarray_variable(rval, context_name=ctx)
return [rval]
else:
return None
@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweights_gemm(node):
if not isinstance(node.op, AbstractConv3d_gradWeights):
......@@ -1757,6 +1935,33 @@ def local_abstractconv_gradinputs_gemm(node):
return [rval]
@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm_alt(node):
if not isinstance(node.op, AbstractConv2d_gradInputs):
return None
kern, topgrad, shape = node.inputs
if not isinstance(kern.type, GpuArrayType) or \
not isinstance(topgrad.type, GpuArrayType):
return None
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1:
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
rval = GpuCorrMM(border_mode='full',
subsample=subsample,
filter_dilation=filter_dilation)(
gpu_contiguous(topgrad),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
return [rval]
else:
return None
@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(node):
if not isinstance(node.op, AbstractConv3d_gradInputs):
......@@ -1776,6 +1981,111 @@ def local_abstractconv3d_gradinputs_gemm(node):
return [rval]
@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm_alt(node):
if not isinstance(node.op, AbstractConv3d_gradInputs):
return None
kern, topgrad, shape = node.inputs
if not isinstance(kern.type, GpuArrayType) or \
not isinstance(topgrad.type, GpuArrayType):
return None
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
if border_mode == 'valid' and subsample == (1, 1, 1):
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
rval = GpuCorr3dMM(border_mode='full',
subsample=subsample,
filter_dilation=filter_dilation)(
gpu_contiguous(topgrad),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4)))
return [rval]
else:
return None
class ConvMetaOptimizer(LocalMetaOptimizer):
def __init__(self):
super(ConvMetaOptimizer, self).__init__()
def time_call(self, fn):
start = time.time()
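# GPU kernels are launched asynchronously; sync() blocks until the
# output is actually computed, so the elapsed wall-clock time measures
# the real execution.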
fn()[0].sync()
return time.time() - start
def provide_inputs(self, node, inputs):
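# Create shared variables holding random data that matches the node's
# static shapes, so that candidate implementations can be compiled and
# benchmarked; return an empty dict if any needed shape is unknown.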
result = {}
shapes = (node.op.imshp, node.op.kshp)
if(node.op.imshp is None or node.op.kshp is None or
any([s is None for shape in shapes for s in shape])):
return result
if type(node.op) in [AbstractConv2d, AbstractConv3d]:
img, kern = node.inputs
for(var, shape) in zip((img, kern), shapes):
result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
var.name,
broadcastable=var.broadcastable,
borrow=True)
if type(node.op) in [AbstractConv2d_gradWeights, AbstractConv3d_gradWeights]:
img, top, kshape = node.inputs
tshp = get_conv_output_shape(node.op.imshp,
node.op.kshp,
node.op.border_mode,
node.op.subsample,
node.op.filter_dilation)
result[kshape] = theano.tensor.as_tensor_variable(node.op.kshp[2:])
for(var, shape) in zip((img, top), (node.op.imshp, tshp)):
result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
var.name,
broadcastable=var.broadcastable,
borrow=True)
if type(node.op) in [AbstractConv2d_gradInputs, AbstractConv3d_gradInputs]:
kern, top, ishape = node.inputs
tshp = get_conv_output_shape(node.op.imshp,
node.op.kshp,
node.op.border_mode,
node.op.subsample,
node.op.filter_dilation)
result[ishape] = theano.tensor.as_tensor_variable(node.op.imshp[2:])
for(var, shape) in zip((kern, top), (node.op.kshp, tshp)):
result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
var.name,
broadcastable=var.broadcastable,
borrow=True)
return result
def get_opts(self, node):
opts = Counter([opt for opt in self.track_dict[type(node.op)]
if opt in self.tag_dict['default']])
include_tags = config.metaopt.optimizer_including.split(':')
exclude_tags = config.metaopt.optimizer_excluding.split(':')
for in_opt in include_tags:
opts.update([opt for opt in self.track_dict[type(node.op)]
if opt in self.tag_dict[in_opt]])
for ex_opt in exclude_tags:
opts.subtract([opt for opt in self.track_dict[type(node.op)]
if opt in self.tag_dict[ex_opt]])
opts = list(opts + Counter())
return opts
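The Counter arithmetic above treats tags as votes; a standalone sketch of
the idiom, with placeholder optimizer names:

    from collections import Counter

    opts = Counter(['opt_a', 'opt_b'])  # registered under 'default'
    opts.update(['opt_c'])              # an including tag adds a vote
    opts.subtract(['opt_b'])            # an excluding tag removes one
    # Adding an empty Counter keeps only entries with a positive count.
    assert sorted(opts + Counter()) == ['opt_a', 'opt_c']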
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@op_lifter([AbstractConv2d,
......@@ -2354,8 +2664,12 @@ register_opt('fast_compile')(abstractconv_groupopt)
# We import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn
-from .dnn import (local_abstractconv_cudnn, local_abstractconv_gw_cudnn,
-                  local_abstractconv_gi_cudnn)  # noqa: 402
+from .dnn import (local_abstractconv_cudnn,
+                  local_abstractconv_gw_cudnn,
+                  local_abstractconv_gi_cudnn,  # noqa: 402
+                  local_abstractconv_cudnn_alt,
+                  local_abstractconv3d_cudnn_alt)
abstractconv_groupopt.register('local_abstractconv_dnn',
local_abstractconv_cudnn, 20,
'conv_dnn',
......@@ -2393,6 +2707,46 @@ abstractconv_groupopt.register('local_abstractconv3d_gradinputs',
'conv_gemm',
'gpuarray', 'fast_compile', 'fast_run')
conv_metaopt = ConvMetaOptimizer()
conv_metaopt.register(local_abstractconv_cudnn,
['default', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv_gw_cudnn,
['default', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv_gi_cudnn,
['default', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv_gemm,
['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gemm,
['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gradweights_gemm,
['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gradweights_gemm,
['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gradinputs_gemm,
['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gradinputs_gemm,
['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gemm_alt,
['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gemm_gradweights_alt,
['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gradinputs_gemm_alt,
['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_cudnn_alt,
['default', 'alternative', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv3d_cudnn_alt,
['default', 'alternative', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv3d_alt,
['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gemm_gradweights_alt,
['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gradinputs_gemm_alt,
['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d2d,
['alternative', 'conv3d2d'])
abstractconv_groupopt.register('conv_metaopt', conv_metaopt, 'conv_meta', position=0)
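With everything registered, the meta-optimizer is opted into per
compilation through the 'conv_meta' tag. A usage sketch (assumes a
GPU-enabled Theano; the shapes are illustrative, and static shapes are
needed so provide_inputs() can build benchmark data):

    import theano
    import theano.tensor as T
    from theano.tensor.nnet import conv2d

    theano.config.metaopt.verbose = 1
    x = T.tensor4('x')
    w = T.tensor4('w')
    y = conv2d(x, w, input_shape=(8, 3, 32, 32), filter_shape=(16, 3, 5, 5))
    mode = theano.compile.get_default_mode().including('conv_meta')
    f = theano.function([x, w], y, mode=mode)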
# Register cuDNN batch normalization implementation
......
......@@ -22,6 +22,9 @@ from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, SkipTest
import unittest
from theano.tensor.nnet import abstract_conv
from theano.gpuarray import dnn, blas
def test_local_assert():
......@@ -699,3 +702,200 @@ def test_crossentropycategorical1hot_lifter():
for n in f.maker.fgraph.apply_nodes)
f(rng.uniform(0.1, 0.9, (13, 5)).astype(theano.config.floatX),
rng.randint(5, size=(13,)))
class Conv_opt_test(unittest.TestCase):
def optimizer_2d(self, input_shapes, direction, include_tags, exclude_tags,
op, border_mode='valid', subsample=(1, 1), filter_dilation=(1, 1)):
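# Build a conv graph for the requested direction (0: forward,
# 1: gradient wrt weights, 2: gradient wrt inputs), compile it with and
# without 'conv_meta', then check that the expected op was selected and
# that both functions agree numerically.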
inp1 = theano.shared(np.random.random(input_shapes[0]).astype(theano.config.floatX))
inp2 = theano.shared(np.random.random(input_shapes[1]).astype(theano.config.floatX))
if(direction == 0):
conv_op = abstract_conv.conv2d(inp1,
inp2,
input_shapes[0],
input_shapes[1],
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
if(direction == 1):
conv_op = abstract_conv.conv2d_grad_wrt_weights(inp1,
inp2,
input_shapes[2],
input_shapes[0],
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
if(direction == 2):
conv_op = abstract_conv.conv2d_grad_wrt_inputs(inp1,
inp2,
input_shapes[2],
input_shapes[1],
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
theano.config.metaopt.optimizer_including = include_tags
theano.config.metaopt.optimizer_excluding = exclude_tags
mode = mode_with_gpu.including('conv_meta')
ref_func = theano.function([], conv_op, mode=mode_with_gpu)
conv_func = theano.function([], conv_op, mode=mode)
assert any([isinstance(node.op, op)
for node in conv_func.maker.fgraph.toposort()])
utt.assert_allclose(conv_func(), ref_func())
def optimizer_3d(self, input_shapes, direction, include_tags, exclude_tags,
op, border_mode='valid', subsample=(1, 1, 1),
filter_dilation=(1, 1, 1)):
inp1 = theano.shared(np.random.random(input_shapes[0]).astype(theano.config.floatX))
inp2 = theano.shared(np.random.random(input_shapes[1]).astype(theano.config.floatX))
if(direction == 0):
conv_op = abstract_conv.conv3d(inp1,
inp2,
input_shapes[0],
input_shapes[1],
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
if(direction == 1):
conv_op = abstract_conv.conv3d_grad_wrt_weights(inp1,
inp2,
input_shapes[2],
input_shapes[0],
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
if(direction == 2):
conv_op = abstract_conv.conv3d_grad_wrt_inputs(inp1,
inp2,
input_shapes[2],
input_shapes[1],
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
theano.config.metaopt.optimizer_including = include_tags
theano.config.metaopt.optimizer_excluding = exclude_tags
mode = mode_with_gpu.including('conv_meta')
ref_func = theano.function([], conv_op, mode=mode_with_gpu)
conv_func = theano.function([], conv_op, mode=mode)
if op is not None:
assert any([isinstance(node.op, op)
for node in conv_func.maker.fgraph.toposort()])
utt.assert_allclose(conv_func(), ref_func())
def test_optimizers(self):
imshp2d = [(2, 3, 5, 5), (2, 2, 5, 7), (2, 1, 3, 3)]
kshp2d = [(4, 3, 3, 3), (3, 2, 3, 5), (4, 1, 1, 1)]
tshp2d = [(2, 4, 3, 3), (2, 3, 3, 3), (2, 4, 3, 3)]
for imshp, kshp, tshp in zip(imshp2d, kshp2d, tshp2d):
# forward passes
self.optimizer_2d([imshp, kshp, tshp], 0,
'alternative',
'conv_dnn:default',
blas.GpuCorrMM_gradWeights)
self.optimizer_2d([imshp, kshp, tshp], 0,
'alternative',
'conv_gemm:default',
dnn.GpuDnnConvGradW)
# backwards wrt weights
self.optimizer_2d([imshp, tshp, kshp], 1,
'alternative',
'conv_dnn:default',
blas.GpuCorrMM)
self.optimizer_2d([imshp, tshp, kshp], 1,
'alternative',
'conv_gemm:default',
dnn.GpuDnnConv)
# backwards wrt to inputs
self.optimizer_2d([tshp, kshp, imshp], 2,
'alternative',
'conv_dnn:default',
blas.GpuCorrMM)
self.optimizer_2d([tshp, kshp, imshp], 2,
'alternative',
'conv_gemm:default',
dnn.GpuDnnConv)
imshp3d = [(2, 3, 5, 5, 5), (2, 2, 5, 7, 5), (2, 1, 3, 3, 3)]
kshp3d = [(4, 3, 3, 3, 3), (3, 2, 3, 5, 3), (4, 1, 1, 1, 1)]
tshp3d = [(2, 4, 3, 3, 3), (2, 3, 3, 3, 3), (2, 4, 3, 3, 3)]
for imshp, kshp, tshp in zip(imshp3d, kshp3d, tshp3d):
# forwards passes
self.optimizer_3d([imshp, kshp, tshp], 0,
'alternative',
'conv_dnn:default:conv3d2d',
blas.GpuCorr3dMM_gradWeights)
self.optimizer_3d([imshp, kshp, tshp], 0,
'conv3d2d',
'default',
None)
self.optimizer_3d([imshp, kshp, tshp], 0,
'alternative',
'conv_gemm:default:conv3d2d',
dnn.GpuDnnConvGradW)
# backward pass wrt weight
self.optimizer_3d([imshp, tshp, kshp], 1,
'alternative',
'conv_dnn:default',
blas.GpuCorr3dMM)
self.optimizer_3d([imshp, tshp, kshp], 1,
'alternative',
'conv_gemm:default',
dnn.GpuDnnConv)
# backward pass wrt inputs
self.optimizer_3d([tshp, kshp, imshp], 2,
'alternative',
'conv_dnn:default',
blas.GpuCorr3dMM)
self.optimizer_3d([tshp, kshp, imshp], 2,
'alternative',
'conv_gemm:default',
dnn.GpuDnnConv)
# conv2d forward pass with non-default border_mode and filter_dilation
imshp2d = [(2, 3, 5, 5), (4, 2, 5, 5)]
kshp2d = [(4, 3, 3, 3), (3, 2, 3, 3)]
filter_dilation = [(1, 1), (2, 2)]
for imshp, kshp, fdil in zip(imshp2d, kshp2d, filter_dilation):
self.optimizer_2d([imshp, kshp], 0,
'alternative',
'conv_dnn:default',
blas.GpuCorrMM_gradInputs,
border_mode='full',
filter_dilation=fdil)
# dilation > (1, 1) works only for cuDNN >= 6.0
self.optimizer_2d([imshp, kshp], 0,
'alternative',
'conv_gemm:default',
dnn.GpuDnnConvGradI,
border_mode='full',
filter_dilation=fdil)
# conv3d forward pass with non-default border_mode and filter_dilation
imshp3d = [(2, 3, 5, 5, 5), (4, 2, 5, 5, 5)]
kshp3d = [(4, 3, 3, 3, 3), (3, 2, 3, 3, 3)]
filter_dilation = [(1, 1, 1), (2, 2, 2)]
for imshp, kshp, fdil in zip(imshp3d, kshp3d, filter_dilation):
self.optimizer_3d([imshp, kshp], 0,
'alternative',
'conv_dnn:default:conv3d2d',
blas.GpuCorr3dMM_gradInputs,
border_mode='full',
filter_dilation=fdil)
# dilation > (1, 1, 1) works only for cuDNN >= 6.0
self.optimizer_3d([imshp, kshp], 0,
'alternative',
'conv_gemm:default:conv3d2d',
dnn.GpuDnnConvGradI,
border_mode='full',
filter_dilation=fdil)