提交 ab6c786e authored 作者: f0k's avatar f0k

Add a meta-optimizer for GpuConv

上级 36437aca
......@@ -823,6 +823,86 @@ class LocalOptimizer(object):
(' ' * level), self.__class__.__name__, id(self))
class LocalMetaOptimizer(LocalOptimizer):
    """Base class for meta-optimizers that try a set of LocalOptimizers
    to replace a node and choose the one that executes the fastest"""

    def __init__(self, tracks=None, optimizers=()):
        # tracks: optional list of Op classes this optimizer applies to;
        # checked again in transform() as a safety net.
        self._tracks = tracks
        # candidate LocalOptimizers to benchmark against each other
        self.optimizers = list(optimizers)
        # when True, transform() prints skip reasons, errors and timings
        self.verbose = False

    def register(self, optimizer):
        """Add another candidate LocalOptimizer to be benchmarked."""
        self.optimizers.append(optimizer)

    def tracks(self):
        """Return the list of Op classes this optimizer applies to."""
        return self._tracks

    def transform(self, node):
        """Try all registered optimizers on ``node``, compile and time each
        resulting subgraph, and return the replacement outputs of the
        fastest one (or None if no candidate applied or could be timed)."""
        # safety check: not sure if needed, but all optimizers do it
        if self._tracks is not None:
            if not isinstance(node.op, tuple(self._tracks)):
                return
        # first, we need to provide dummy values for all inputs
        # to the node that are not shared variables anyway
        givens = {}
        missing = set()
        for input in node.inputs:
            if isinstance(input, theano.compile.SharedVariable):
                # shared variables already carry their own data
                pass
            elif hasattr(input.tag, 'test_value'):
                # wrap the test value in a shared variable so the timed
                # function does not need any runtime arguments
                givens[input] = theano.shared(
                    numpy.require(input.tag.test_value,
                                  dtype=input.dtype),
                    input.name, borrow=True)
            else:
                missing.add(input)
        if missing:
            # ask the subclass for dummy data for the remaining inputs
            givens.update(self.provide_inputs(node, missing))
        # ensure we have data for all input variables that need it
        if any(var not in givens for var in missing):
            if self.verbose:
                print ("%s skipping %s (cannot create test inputs)" %
                       (self.__class__.__name__, node))
            return
        # now we can apply the different optimizations in turn,
        # compile the resulting subgraphs and time their execution
        timings = []
        for opt in self.optimizers:
            outputs = opt.transform(node)
            if outputs:
                try:
                    # compile a zero-argument function; all inputs are
                    # substituted by the shared dummy values via `givens`
                    fn = theano.function([],
                                         [theano.Out(output, borrow=True)
                                          for output in outputs],
                                         givens=givens)
                    # best of three runs, to reduce timing noise
                    timing = min(self.time_call(fn) for _ in range(3))
                except Exception as e:
                    # candidate failed to compile or run: skip it
                    if self.verbose:
                        print e
                    continue
                else:
                    if self.verbose:
                        print opt, timing
                    timings.append((timing, outputs))
        # finally, we choose the fastest one
        if timings:
            # sort by timing (first tuple element) and return the
            # replacement outputs of the fastest candidate
            timings.sort()
            return timings[0][1]
        return

    def provide_inputs(self, node, inputs):
        """If implemented, returns a dictionary mapping all symbolic variables
        in ``inputs`` to SharedVariable instances of suitable dummy values. The
        ``node`` can be inspected to infer required input shapes."""
        raise NotImplementedError()

    def time_call(self, fn):
        """Return the wall-clock time of a single call to ``fn``.
        Subclasses may override this, e.g. to add device synchronization."""
        start = time.time()
        fn()
        return time.time() - start
class FromFunctionLocalOptimizer(LocalOptimizer):
"""WRITEME"""
def __init__(self, fn, tracks=None, requirements=()):
......
......@@ -3,6 +3,7 @@ _logger = logging.getLogger('theano.sandbox.cuda.opt')
import copy
import sys
import time
import warnings
import numpy
......@@ -15,6 +16,7 @@ import theano.ifelse
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
Optimizer, toolbox)
from theano.gof.opt import LocalMetaOptimizer
from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import (
gpu_eye, gpu_contiguous,
......@@ -153,6 +155,21 @@ gpu_seqopt.register('InputToGpuOptimizer', InputToGpuOptimizer(),
'merge') # TODO: how to make it mandatory for gpu_seqopt?
class LocalCudaMetaOptimizer(LocalMetaOptimizer):
    """Base class for CUDA-based LocalMetaOptimizers.

    Behaves like LocalMetaOptimizer, but times candidate implementations
    with device synchronization so that asynchronous GPU kernel launches
    are measured to completion.
    """
    # NOTE: the redundant __init__ that only forwarded *args to the base
    # class has been removed; the inherited constructor is identical.

    def time_call(self, fn):
        """Return the wall-clock time of one call to ``fn``, synchronizing
        the CUDA device before and after the call.  Without the final
        synchronize(), only the kernel launch overhead would be measured."""
        theano.sandbox.cuda.synchronize()
        start = time.time()
        fn()
        theano.sandbox.cuda.synchronize()
        return time.time() - start
@local_optimizer([gpu_from_host, host_from_gpu])
def local_cut_gpu_host_gpu(node):
if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu):
......@@ -1345,6 +1362,43 @@ conv_groupopt.register('local_conv_gemm', local_conv_gemm, 30,
'fast_compile', 'fast_run')
# Convolution Meta-optimizer
class ConvMetaOptimizer(LocalCudaMetaOptimizer):
    """Meta-optimizer that benchmarks the registered GpuConv replacements
    and keeps the fastest one."""

    def __init__(self, optimizers):
        # only applies to GpuConv nodes
        super(ConvMetaOptimizer, self).__init__([GpuConv], optimizers)

    def provide_inputs(self, node, inputs):
        """Return a dictionary mapping the requested ``inputs`` (a subset of
        the node's image and filter variables) to shared variables holding
        random dummy data.  Only variables whose full shape can be inferred
        from the GpuConv Op are provided; the rest are omitted, which makes
        the meta-optimizer skip the node."""
        result = {}
        img, kern = node.inputs
        # infer the number of input channels from the image shape, if known
        if node.op.imshp is not None and len(node.op.imshp) == 3:
            nchannels = node.op.imshp[0]
        else:
            nchannels = None
        # Assemble the full 4D shapes, leaving them as None when the Op does
        # not know the corresponding spatial shape.  (The previous version
        # unconditionally concatenated tuples with imshp/kshp, raising a
        # TypeError whenever one of them was None, even though the loop
        # below explicitly checks for `shape is not None`.)
        if node.op.imshp is not None:
            imshape = (node.op.bsize,) + node.op.imshp
        else:
            imshape = None
        if node.op.kshp is not None:
            kshape = (node.op.nkern, nchannels) + node.op.kshp
        else:
            kshape = None
        for (var, shape) in zip((img, kern), (imshape, kshape)):
            # only provide data when asked for, and only when the shape is
            # fully known (no None entries such as bsize/nkern/nchannels)
            if ((var in inputs) and
                    (shape is not None) and
                    not any(s is None for s in shape)):
                result[var] = theano.shared(
                    numpy.require(numpy.random.randn(*shape),
                                  dtype=var.dtype),
                    var.name, borrow=True)
        # return mapping (possibly incomplete; transform() handles that)
        return result
# We just register all optimizers from conv_groupopt with the metaoptimizer.
# query('+name') selects each optimizer registered under that name, so the
# meta-optimizer benchmarks exactly the group's candidate implementations.
conv_metaopt = ConvMetaOptimizer(
    conv_groupopt.query(*['+' + name for name in conv_groupopt._names]).opts)

# And then register the metaoptimizer as the first optimizer in conv_groupopt
# (position 0 runs before every individually-registered convolution opt).
conv_groupopt.register('conv_meta', conv_metaopt, 0)
@local_optimizer([Conv3D])
def local_conv3d_fft(node):
if not isinstance(node.op, Conv3D):
......
Markdown 格式
0%
您已将 0 位成员添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论