提交 ab6c786e authored 作者: f0k's avatar f0k

Add a meta-optimizer for GpuConv

上级 36437aca
......@@ -823,6 +823,86 @@ class LocalOptimizer(object):
(' ' * level), self.__class__.__name__, id(self))
class LocalMetaOptimizer(LocalOptimizer):
    """Base class for meta-optimizers that try a set of LocalOptimizers
    to replace a node and choose the one that executes the fastest"""

    def __init__(self, tracks=None, optimizers=()):
        # tracks: optional list of Op classes this optimizer applies to;
        # checked again in transform() as a safety net.
        self._tracks = tracks
        # candidate LocalOptimizers to benchmark against each other
        self.optimizers = list(optimizers)
        # when True, transform() prints skip reasons, errors and timings
        self.verbose = False

    def register(self, optimizer):
        """Add another candidate LocalOptimizer to be benchmarked."""
        self.optimizers.append(optimizer)

    def tracks(self):
        """Return the list of Op classes this optimizer applies to."""
        return self._tracks

    def transform(self, node):
        """Try all registered optimizers on ``node``, compile and time each
        resulting subgraph, and return the replacement outputs of the
        fastest one (or None if no candidate applied or could be timed)."""
        # safety check: not sure if needed, but all optimizers do it
        if self._tracks is not None:
            if not isinstance(node.op, tuple(self._tracks)):
                return
        # first, we need to provide dummy values for all inputs
        # to the node that are not shared variables anyway
        givens = {}
        missing = set()
        for input in node.inputs:
            if isinstance(input, theano.compile.SharedVariable):
                # shared variables already carry their own data
                pass
            elif hasattr(input.tag, 'test_value'):
                # wrap the test value in a shared variable so the timed
                # function does not need any runtime arguments
                givens[input] = theano.shared(
                    numpy.require(input.tag.test_value,
                                  dtype=input.dtype),
                    input.name, borrow=True)
            else:
                missing.add(input)
        if missing:
            # ask the subclass for dummy data for the remaining inputs
            givens.update(self.provide_inputs(node, missing))
        # ensure we have data for all input variables that need it
        if any(var not in givens for var in missing):
            if self.verbose:
                print ("%s skipping %s (cannot create test inputs)" %
                       (self.__class__.__name__, node))
            return
        # now we can apply the different optimizations in turn,
        # compile the resulting subgraphs and time their execution
        timings = []
        for opt in self.optimizers:
            outputs = opt.transform(node)
            if outputs:
                try:
                    # compile a zero-argument function; all inputs are
                    # substituted by the shared dummy values via `givens`
                    fn = theano.function([],
                                         [theano.Out(output, borrow=True)
                                          for output in outputs],
                                         givens=givens)
                    # best of three runs, to reduce timing noise
                    timing = min(self.time_call(fn) for _ in range(3))
                except Exception as e:
                    # candidate failed to compile or run: skip it
                    if self.verbose:
                        print e
                    continue
                else:
                    if self.verbose:
                        print opt, timing
                    timings.append((timing, outputs))
        # finally, we choose the fastest one
        if timings:
            # sort by timing (first tuple element) and return the
            # replacement outputs of the fastest candidate
            timings.sort()
            return timings[0][1]
        return

    def provide_inputs(self, node, inputs):
        """If implemented, returns a dictionary mapping all symbolic variables
        in ``inputs`` to SharedVariable instances of suitable dummy values. The
        ``node`` can be inspected to infer required input shapes."""
        raise NotImplementedError()

    def time_call(self, fn):
        """Return the wall-clock time of a single call to ``fn``.
        Subclasses may override this, e.g. to add device synchronization."""
        start = time.time()
        fn()
        return time.time() - start
class FromFunctionLocalOptimizer(LocalOptimizer):
"""WRITEME"""
def __init__(self, fn, tracks=None, requirements=()):
......
......@@ -3,6 +3,7 @@ _logger = logging.getLogger('theano.sandbox.cuda.opt')
import copy
import sys
import time
import warnings
import numpy
......@@ -15,6 +16,7 @@ import theano.ifelse
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
Optimizer, toolbox)
from theano.gof.opt import LocalMetaOptimizer
from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import (
gpu_eye, gpu_contiguous,
......@@ -153,6 +155,21 @@ gpu_seqopt.register('InputToGpuOptimizer', InputToGpuOptimizer(),
'merge') # TODO: how to make it mandatory for gpu_seqopt?
class LocalCudaMetaOptimizer(LocalMetaOptimizer):
    """Base class for CUDA-based LocalMetaOptimizers.

    Behaves like LocalMetaOptimizer, but times candidate implementations
    with device synchronization so that asynchronous GPU kernel launches
    are measured to completion.
    """
    # NOTE: the redundant __init__ that only forwarded *args to the base
    # class has been removed; the inherited constructor is identical.

    def time_call(self, fn):
        """Return the wall-clock time of one call to ``fn``, synchronizing
        the CUDA device before and after the call.  Without the final
        synchronize(), only the kernel launch overhead would be measured."""
        theano.sandbox.cuda.synchronize()
        start = time.time()
        fn()
        theano.sandbox.cuda.synchronize()
        return time.time() - start
@local_optimizer([gpu_from_host, host_from_gpu])
def local_cut_gpu_host_gpu(node):
if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu):
......@@ -1345,6 +1362,43 @@ conv_groupopt.register('local_conv_gemm', local_conv_gemm, 30,
'fast_compile', 'fast_run')
# Convolution Meta-optimizer
class ConvMetaOptimizer(LocalCudaMetaOptimizer):
    """Meta-optimizer that benchmarks the registered GpuConv replacements
    and keeps the fastest one."""

    def __init__(self, optimizers):
        # only applies to GpuConv nodes
        super(ConvMetaOptimizer, self).__init__([GpuConv], optimizers)

    def provide_inputs(self, node, inputs):
        """Return a dictionary mapping the requested ``inputs`` (a subset of
        the node's image and filter variables) to shared variables holding
        random dummy data.  Only variables whose full shape can be inferred
        from the GpuConv Op are provided; the rest are omitted, which makes
        the meta-optimizer skip the node."""
        result = {}
        img, kern = node.inputs
        # infer the number of input channels from the image shape, if known
        if node.op.imshp is not None and len(node.op.imshp) == 3:
            nchannels = node.op.imshp[0]
        else:
            nchannels = None
        # Assemble the full 4D shapes, leaving them as None when the Op does
        # not know the corresponding spatial shape.  (The previous version
        # unconditionally concatenated tuples with imshp/kshp, raising a
        # TypeError whenever one of them was None, even though the loop
        # below explicitly checks for `shape is not None`.)
        if node.op.imshp is not None:
            imshape = (node.op.bsize,) + node.op.imshp
        else:
            imshape = None
        if node.op.kshp is not None:
            kshape = (node.op.nkern, nchannels) + node.op.kshp
        else:
            kshape = None
        for (var, shape) in zip((img, kern), (imshape, kshape)):
            # only provide data when asked for, and only when the shape is
            # fully known (no None entries such as bsize/nkern/nchannels)
            if ((var in inputs) and
                    (shape is not None) and
                    not any(s is None for s in shape)):
                result[var] = theano.shared(
                    numpy.require(numpy.random.randn(*shape),
                                  dtype=var.dtype),
                    var.name, borrow=True)
        # return mapping (possibly incomplete; transform() handles that)
        return result
# We just register all optimizers from conv_groupopt with the metaoptimizer.
# query('+name') selects each optimizer registered under that name, so the
# meta-optimizer benchmarks exactly the group's candidate implementations.
conv_metaopt = ConvMetaOptimizer(
    conv_groupopt.query(*['+' + name for name in conv_groupopt._names]).opts)

# And then register the metaoptimizer as the first optimizer in conv_groupopt
# (position 0 runs before every individually-registered convolution opt).
conv_groupopt.register('conv_meta', conv_metaopt, 0)
@local_optimizer([Conv3D])
def local_conv3d_fft(node):
if not isinstance(node.op, Conv3D):
......
Markdown 格式
0%
您已将 0 位成员添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论