testgroup / pytensor · Commits · b0b3fb8d

Commit b0b3fb8d, authored Nov 24, 2014 by Frédéric Bastien

Merge pull request #2266 from f0k/metaopt

conv2d meta-optimizer

Parents: de11ad82 58c9da99
Showing 3 changed files with 181 additions and 1 deletion (+181 −1)
theano/gof/opt.py           +89 −0
theano/sandbox/cuda/dnn.py  +31 −1
theano/sandbox/cuda/opt.py  +61 −0
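The core idea of the change is brute-force empirical selection: for each convolution node, every registered candidate rewrite is compiled with dummy inputs, timed, and the fastest one wins. A minimal standalone sketch of that selection loop (illustration only; the real logic is LocalMetaOptimizer.transform() in the first file below):

import time


def time_call(fn):
    # wall-clock a single call of a compiled function
    start = time.time()
    fn()
    return time.time() - start


def pick_fastest(candidate_builders, repeats=3):
    # each builder compiles one candidate; keep the best of `repeats` runs
    # per candidate and return the overall winner (None if none compiled)
    timings = []
    for build in candidate_builders:
        fn = build()
        timings.append((min(time_call(fn) for _ in range(repeats)), fn))
    if timings:
        return min(timings, key=lambda t: t[0])[1]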
theano/gof/opt.py
@@ -823,6 +823,95 @@ class LocalOptimizer(object):
            (' ' * level), self.__class__.__name__, id(self))


class LocalMetaOptimizer(LocalOptimizer):
    """Base class for meta-optimizers that try a set of LocalOptimizers
    to replace a node and choose the one that executes the fastest"""

    def __init__(self, tracks=None, optimizers=()):
        self._tracks = tracks
        self.optimizers = list(optimizers)
        self.verbose = False

    def register(self, optimizer):
        self.optimizers.append(optimizer)

    def tracks(self):
        return self._tracks

    def transform(self, node):
        # safety check: depending on registration, tracks may have been ignored
        if self._tracks is not None:
            if not isinstance(node.op, tuple(self._tracks)):
                return
        # first, we need to provide dummy values for all inputs
        # to the node that are not shared variables anyway
        givens = {}
        missing = set()
        for input in node.inputs:
            if isinstance(input, theano.compile.SharedVariable):
                pass
            elif hasattr(input.tag, 'test_value'):
                givens[input] = theano.shared(
                    input.type.filter(input.tag.test_value),
                    input.name,
                    borrow=True)
            else:
                missing.add(input)
        if missing:
            givens.update(self.provide_inputs(node, missing))
            missing.difference_update(givens.keys())
        # ensure we have data for all input variables that need it
        if missing:
            if self.verbose:
                print("%s cannot meta-optimize %s, "
                      "%d of %d input shapes unknown" %
                      (self.__class__.__name__, node,
                       len(missing), node.nin))
            return
        # now we can apply the different optimizations in turn,
        # compile the resulting subgraphs and time their execution
        if self.verbose:
            print("%s meta-optimizing %s (%d choices):" %
                  (self.__class__.__name__, node, len(self.optimizers)))
        timings = []
        for opt in self.optimizers:
            outputs = opt.transform(node)
            if outputs:
                try:
                    fn = theano.function(
                        [], [theano.Out(output, borrow=True)
                             for output in outputs],
                        givens=givens)
                    timing = min(self.time_call(fn) for _ in range(3))
                except Exception as e:
                    if self.verbose:
                        print "* %s: exception" % opt, e
                    continue
                else:
                    if self.verbose:
                        print "* %s: %.5g sec" % (opt, timing)
                    timings.append((timing, outputs, opt))
            else:
                if self.verbose:
                    print "* %s: not applicable" % opt
        # finally, we choose the fastest one
        if timings:
            timings.sort()
            if self.verbose:
                print "= %s" % timings[0][2]
            return timings[0][1]
        return

    def provide_inputs(self, node, inputs):
        """If implemented, returns a dictionary mapping all symbolic variables
        in ``inputs`` to SharedVariable instances of suitable dummy values. The
        ``node`` can be inspected to infer required input shapes."""
        raise NotImplementedError()

    def time_call(self, fn):
        start = time.time()
        fn()
        return time.time() - start


class FromFunctionLocalOptimizer(LocalOptimizer):
    """WRITEME"""
    def __init__(self, fn, tracks=None, requirements=()):
...
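For downstream use, a subclass only has to say which Ops it tracks, register candidate optimizers, and implement provide_inputs() so the candidate graphs can be compiled with concrete dummy data. A hypothetical minimal subclass (sketch only; `input_shapes` is an invented convention here, while GpuConv's bsize/imshp/nkern/kshp play this role in ConvMetaOptimizer below):

import numpy
import theano
from theano.gof.opt import LocalMetaOptimizer


class ShapedMetaOptimizer(LocalMetaOptimizer):
    """Hypothetical subclass: assumes each tracked Op records the shapes
    of its inputs in an `input_shapes` attribute."""

    def provide_inputs(self, node, inputs):
        # back every missing symbolic input with random dummy data of the
        # recorded shape, so each candidate graph can be compiled and timed
        result = {}
        for var, shape in zip(node.inputs, node.op.input_shapes):
            if var in inputs and shape is not None:
                data = numpy.random.randn(*shape).astype(var.dtype)
                result[var] = theano.shared(data, var.name, borrow=True)
        return result

# would be used as (SomeOp, opt_a, opt_b, opt_c being placeholders):
# metaopt = ShapedMetaOptimizer(tracks=[SomeOp], optimizers=[opt_a, opt_b])
# metaopt.register(opt_c)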
theano/sandbox/cuda/dnn.py
@@ -1207,7 +1207,37 @@ if True:
                        border_mode=border_mode, subsample=subsample,
                        direction_hint=direction_hint)]


# This optimizer is registered in opt.py as part of the meta-optimizer.
# It tries exactly the opposite code path of what local_conv_dnn() uses,
# because for some input/kernel shape configurations, this is faster.
@local_optimizer([GpuConv])
def local_conv_dnn_alternative(node):
    if not dnn_available():
        return
    if isinstance(node.op, GpuConv):
        border_mode = node.op.border_mode
        subsample = node.op.subsample
        if border_mode not in ['full', 'valid'] or subsample != (1, 1):
            return
        img, kern = node.inputs
        direction_hint = node.op.direction_hint
        if border_mode == 'full':
            # for a full convolution, try using the forward pass instead
            # of the backward pass wrt. inputs
            direction_hint = 'forward!'
        elif border_mode == 'valid':
            # for a valid convolution, try using the backward pass wrt.
            # weights instead of the forward pass and vice versa
            if direction_hint == 'bprop weights':
                direction_hint = 'forward'
            else:
                direction_hint = 'bprop weights'
        return [dnn_conv(img, kern,
                         border_mode=border_mode, subsample=subsample,
                         direction_hint=direction_hint)]


# DISABLED as there is problems in the handling of borders
# @register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMax])
def local_pool_dnn(node):
    if not dnn_available():
...

(The file's single deletion is the formerly active @register_opt('cudnn') decorator on local_pool_dnn, which survives only as the commented-out line above it.)
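As the comments in local_conv_dnn_alternative() indicate, dnn_conv's direction_hint selects which cuDNN code path computes the convolution (forward pass, gradient wrt. inputs, or gradient wrt. weights), and the alternative optimizer deliberately requests the opposite path from local_conv_dnn() so the meta-optimizer can time both. A pure-Python restatement of the hint flip (illustration only, not code from the commit):

def alternative_hint(border_mode, direction_hint):
    # mirrors the branching in local_conv_dnn_alternative() above
    if border_mode == 'full':
        # full convolution: force the forward pass instead of bprop inputs
        return 'forward!'
    # valid convolution: swap forward pass and bprop wrt. weights
    if direction_hint == 'bprop weights':
        return 'forward'
    return 'bprop weights'


assert alternative_hint('full', 'bprop inputs') == 'forward!'
assert alternative_hint('valid', 'bprop weights') == 'forward'
assert alternative_hint('valid', 'forward') == 'bprop weights'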
theano/sandbox/cuda/opt.py
@@ -3,6 +3,7 @@ _logger = logging.getLogger('theano.sandbox.cuda.opt')
import copy
import sys
import time
import warnings
import numpy
@@ -15,6 +16,7 @@ import theano.ifelse
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
                        Optimizer, toolbox)
from theano.gof.opt import LocalMetaOptimizer
from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import (
    gpu_eye, gpu_contiguous,
@@ -1345,6 +1347,65 @@ conv_groupopt.register('local_conv_gemm', local_conv_gemm, 30,
                       'fast_compile', 'fast_run')


class LocalCudaMetaOptimizer(LocalMetaOptimizer):
    """Base class for CUDA-based LocalMetaOptimizers"""

    def time_call(self, fn):
        # Override time_call() to do device synchronization
        theano.sandbox.cuda.synchronize()
        start = time.time()
        fn()
        theano.sandbox.cuda.synchronize()
        return time.time() - start


# Convolution Meta-optimizer

theano.configparser.AddConfigVar(
    'conv_meta.verbose',
    "Enable verbose output for conv_meta optimizer",
    theano.configparser.BoolParam(False),
    in_c_key=False)


class ConvMetaOptimizer(LocalCudaMetaOptimizer):
    def __init__(self, optimizers):
        super(ConvMetaOptimizer, self).__init__([GpuConv], optimizers)
        self.verbose = config.conv_meta.verbose

    def provide_inputs(self, node, inputs):
        # We need to provide dummy data for the given inputs.
        # We can make use of the fact that GpuConv often knows its shapes.
        result = {}
        img, kern = node.inputs
        # provide dummy image and filters if needed
        vars = (img, kern)
        if node.op.imshp is not None and len(node.op.imshp) == 3:
            nchannels = node.op.imshp[0]
        else:
            nchannels = None
        shapes = ((node.op.bsize,) + node.op.imshp,
                  (node.op.nkern, nchannels) + node.op.kshp)
        for (var, shape) in zip(vars, shapes):
            if ((var in inputs) and (shape is not None) and
                    not any(s is None for s in shape)):
                result[var] = theano.shared(
                    # TODO: Use var.type.filter when cuda_ndarray.filter
                    # supports non-strict casts
                    # var.type.filter(numpy.random.randn(*shape),
                    #                 allow_downcast=True),
                    numpy.require(numpy.random.randn(*shape),
                                  dtype=var.dtype),
                    var.name,
                    borrow=True)
        # return mapping
        return result

# We just register all optimizers from conv_groupopt with the metaoptimizer
conv_metaopt = ConvMetaOptimizer(
    conv_groupopt.query(*['+' + name
                          for name in conv_groupopt._names]).opts)
# Then we add some optimizers that try less obvious options
conv_metaopt.register(dnn.local_conv_dnn_alternative)
# Finally, we register the metaoptimizer as the first optimizer in conv_groupopt
conv_groupopt.register('conv_meta', conv_metaopt, 0)


@local_optimizer([Conv3D])
def local_conv3d_fft(node):
    if not isinstance(node.op, Conv3D):
...
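Because conv_groupopt.register('conv_meta', conv_metaopt, 0) puts the meta-optimizer at position 0, it gets the first shot at every GpuConv node, ahead of local_conv_dnn, local_conv_gemm and friends. A sketch of how one might exercise it (assumptions: a CUDA-enabled Theano of this era running with device=gpu; the shapes are arbitrary, and the static image_shape/filter_shape arguments are what let ConvMetaOptimizer.provide_inputs() build dummy data):

import theano
import theano.tensor as T
from theano.tensor.nnet import conv2d

x = T.tensor4('x')
w = T.tensor4('w')
# static shapes let the meta-optimizer construct dummy inputs for timing
y = conv2d(x, w, image_shape=(64, 3, 32, 32), filter_shape=(16, 3, 5, 5))
# candidate conv implementations are compiled and timed during optimization;
# set the new flag, e.g. THEANO_FLAGS='device=gpu,conv_meta.verbose=True',
# to see which one is chosen
f = theano.function([x, w], y)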