Commit 3e86efec authored by Frédéric Bastien, committed by GitHub

Merge pull request #6143 from affanv14/metaopt

Implement Meta-optimizer on new backend
......@@ -1116,3 +1116,26 @@ import theano and print the config variable, as in:
The number of user stack levels to keep for variables during Theano
compilation. If higher than 0, the Theano internal stack trace is
also kept.
.. attribute:: config.metaopt.verbose
Int value, default: 0
The verbosity level of the meta-optimizer: 0 for silent,
1 to warn only when an op cannot be meta-optimized,
2 for full output of the separate timings and the selected implementation.
.. attribute:: config.metaopt.optimizer_excluding
Default: ``""``
A list of optimizer tags that we don't want included in the Meta-optimizer.
If there are multiple tags, separate them with ':'.
.. attribute:: config.metaopt.optimizer_including
Default: ``""``
A list of optimizer tags that we want included in the Meta-optimizer.
If there are multiple tags, separate them with ':'.
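For example, to get full meta-optimizer output while skipping the
conv3d2d-based candidates (an illustrative sketch; pick the values that
match your use case):

    import theano
    theano.config.metaopt.verbose = 2
    theano.config.metaopt.optimizer_excluding = 'conv3d2d'
    # Equivalently, from the environment:
    # THEANO_FLAGS='metaopt.verbose=2,metaopt.optimizer_excluding=conv3d2d'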
......@@ -1456,10 +1456,23 @@ AddConfigVar('blas.ldflags',
AddConfigVar(
'metaopt.verbose',
"Enable verbose output for meta optimizers",
theano.configparser.BoolParam(False),
"0 for silent, 1 for only warnings, 2 for full output with"
"timings and selected implementation",
theano.configparser.IntParam(0),
in_c_key=False)
AddConfigVar('metaopt.optimizer_excluding',
("exclude optimizers with these tags. "
"Separate tags with ':'."),
StrParam(""),
in_c_key=False)
AddConfigVar('metaopt.optimizer_including',
("include optimizers with these tags. "
"Separate tags with ':'."),
StrParam(""),
in_c_key=False)
AddConfigVar('profile',
"If VM should collect profile information",
BoolParam(False),
......
......@@ -1131,13 +1131,20 @@ class LocalMetaOptimizer(LocalOptimizer):
"""
-def __init__(self, tracks=None, optimizers=()):
-    self._tracks = tracks
-    self.optimizers = list(optimizers)
+def __init__(self):
+    self.verbose = config.metaopt.verbose
+    self.track_dict = defaultdict(lambda: [])
+    self.tag_dict = defaultdict(lambda: [])
+    self._tracks = []
+    self.optimizers = []
-def register(self, optimizer):
+def register(self, optimizer, tag_list):
self.optimizers.append(optimizer)
for c in optimizer.tracks():
self.track_dict[c].append(optimizer)
self._tracks.append(c)
for tag in tag_list:
self.tag_dict[tag].append(optimizer)
def tracks(self):
return self._tracks
......@@ -1167,39 +1174,40 @@ class LocalMetaOptimizer(LocalOptimizer):
missing.difference_update(givens.keys())
# ensure we have data for all input variables that need it
if missing:
-if self.verbose:
+if self.verbose > 0:
print(("%s cannot meta-optimize %s, "
"%d of %d input shapes unknown" %
(self.__class__.__name__, node, len(missing), node.nin)))
return
# now we can apply the different optimizations in turn,
# compile the resulting subgraphs and time their execution
-if self.verbose:
+if self.verbose > 1:
print(("%s meta-optimizing %s (%d choices):" %
-(self.__class__.__name__, node, len(self.optimizers))))
+(self.__class__.__name__, node, len(self.get_opts(node)))))
timings = []
-for opt in self.optimizers:
+for opt in self.get_opts(node):
outputs = opt.transform(node)
if outputs:
try:
fn = theano.function([], outputs, givens=givens,
on_unused_input='ignore')
-timing = min(self.time_call(fn) for _ in range(3))
+fn.trust_input = True
+timing = min(self.time_call(fn) for _ in range(2))
except Exception as e:
-if self.verbose:
+if self.verbose > 0:
print("* %s: exception" % opt, e)
continue
else:
-if self.verbose:
+if self.verbose > 1:
print("* %s: %.5g sec" % (opt, timing))
timings.append((timing, outputs, opt))
else:
-if self.verbose:
+if self.verbose > 0:
print("* %s: not applicable" % opt)
# finally, we choose the fastest one
if timings:
timings.sort()
-if self.verbose:
+if self.verbose > 1:
print("= %s" % timings[0][2])
return timings[0][1]
return
......@@ -1213,6 +1221,12 @@ class LocalMetaOptimizer(LocalOptimizer):
"""
raise NotImplementedError()
def get_opts(self, node):
"""
Can be overrided to change the way opts are selected
"""
return self.track_dict[type(node.op)]
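For illustration, a minimal sketch of how registration feeds get_opts; the
optimizer names here are hypothetical placeholders, not part of this patch:

    # register() indexes an optimizer both by the op types it tracks
    # and by every tag it is registered under.
    meta = SomeMetaOptimizer()  # any LocalMetaOptimizer subclass
    meta.register(local_opt_a, ['default', 'conv_gemm'])
    meta.register(local_opt_b, ['default', 'alternative'])
    # For a node whose op type both optimizers track:
    #   meta.track_dict[type(node.op)] -> [local_opt_a, local_opt_b]
    #   meta.tag_dict['alternative']   -> [local_opt_b]
    # get_opts(node) returns track_dict[type(node.op)] unless overridden.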
def time_call(self, fn):
start = time.time()
fn()
......@@ -2313,7 +2327,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.final_optimizers = []
self.cleanup_optimizers = []
self.tracks_on_change_inputs = tracks_on_change_inputs
for opt in optimizers:
if isinstance(opt, LocalOptimizer):
if opt.tracks() is None:
......
......@@ -1014,7 +1014,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
-elif (border_mode == 'full' and subsample == (1, 1) and dilation == (1, 1) and
+elif (border_mode == 'full' and subsample == (1, 1) and
direction_hint != 'forward!' and num_groups == 1):
# Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution.
......@@ -1024,11 +1024,11 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
out_shp = (shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
-shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1,
-shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1)
+shape_i(img, 2, fgraph) + (shape_i(kerns, 2, fgraph) - 1) * dilation[0],
+shape_i(img, 3, fgraph) + (shape_i(kerns, 3, fgraph) - 1) * dilation[1])
out_shp = assert_conv_shape(out_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
-desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=(1, 1),
+desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=dilation,
conv_mode=conv_mode, precision=precision)(kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc)
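As a sanity check on the new shape arithmetic (a standalone sketch, not
part of the patch): a kernel of length k with dilation d covers
(k - 1) * d + 1 input positions, so the 'full' output length grows by
(k - 1) * d:

    def full_conv_len(in_len, k_len, dil):
        # 'full' mode adds (effective kernel length - 1) outputs, where
        # the effective (dilated) kernel length is (k_len - 1) * dil + 1.
        return in_len + (k_len - 1) * dil

    assert full_conv_len(5, 3, 1) == 7  # matches the old '+ k - 1' formula
    assert full_conv_len(5, 3, 2) == 9  # dilation stretches the kernel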
......@@ -1133,7 +1133,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
-elif (border_mode == 'full' and subsample == (1, 1, 1) and dilation == (1, 1, 1) and
+elif (border_mode == 'full' and subsample == (1, 1, 1) and
direction_hint != 'forward!'):
# Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution.
......@@ -1143,12 +1143,12 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
out_shp = (shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
-shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1,
-shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1,
-shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1)
+shape_i(img, 2, fgraph) + (shape_i(kerns, 2, fgraph) - 1) * dilation[0],
+shape_i(img, 3, fgraph) + (shape_i(kerns, 3, fgraph) - 1) * dilation[1],
+shape_i(img, 4, fgraph) + (shape_i(kerns, 4, fgraph) - 1) * dilation[2])
out_shp = assert_conv_shape(out_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
-desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1),
+desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), dilation=dilation,
conv_mode=conv_mode, precision=precision)(kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc)
......@@ -2888,6 +2888,215 @@ def local_abstractconv_cudnn(node):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
def local_abstractconv_cudnn_alt(node):
if(not isinstance(node.op, (AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))):
return
if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if not dnn_available(inp1.type.context_name):
return
op = node.op
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
precision = get_precision(None, [inp1, inp2])
if node.op.filter_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
if isinstance(op, AbstractConv2d):
if border_mode == 'half' or subsample != (1, 1) or num_groups != 1:
return None
if border_mode == 'full':
direction_hint = 'bprop inputs'
elif border_mode == 'valid' and filter_dilation == (1, 1):
direction_hint = 'bprop weights'
else:
return None
rval = dnn_conv(inp1, inp2,
border_mode=border_mode,
subsample=subsample,
dilation=filter_dilation,
direction_hint=direction_hint,
conv_mode=conv_mode,
num_groups=num_groups)
elif isinstance(op, AbstractConv2d_gradWeights):
if(border_mode == 'valid' and subsample == (1, 1) and
filter_dilation == (1, 1) and num_groups == 1):
img = gpu_contiguous(inp1)
topgrad = gpu_contiguous(inp2)
ctx_name = infer_context_name(img, topgrad)
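# Swap the batch and channel axes: the weight gradient of a valid
# convolution equals a forward cross-correlation between the transposed
# image and the transposed topgrad, which GpuDnnConv computes directly.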
img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
out_shp = get_conv_output_shape(ishape,
tshape,
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
out_shp = assert_conv_shape(out_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
desc = GpuDnnConvDesc(border_mode=border_mode,
subsample=subsample,
dilation=filter_dilation,
conv_mode='cross',
precision=precision)(out.shape)
conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
if conv_mode == 'conv':
conv = conv[:, :, ::-1, ::-1]
rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
else:
return None
elif isinstance(op, AbstractConv2d_gradInputs):
if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1:
kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
topgrad = gpu_contiguous(inp2)
ctx_name = infer_context_name(kerns, topgrad)
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
desc = GpuDnnConvDesc(border_mode='full',
subsample=subsample,
dilation=filter_dilation,
conv_mode=conv_mode,
precision=precision)(kerns.shape)
tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
shape = get_conv_output_shape(tshape,
kshape,
border_mode='full',
subsample=subsample,
filter_dilation=filter_dilation)
shape = assert_conv_shape(shape)
out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
rval = GpuDnnConv(algo=None, num_groups=num_groups)(topgrad, kerns, out, desc)
else:
return None
return [rval]
@local_optimizer([AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs])
def local_abstractconv3d_cudnn_alt(node):
if(not isinstance(node.op, (AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs))):
return
if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1, 1):
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if not dnn_available(inp1.type.context_name):
return
op = node.op
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
precision = get_precision(None, [inp1, inp2])
if node.op.filter_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
if isinstance(op, AbstractConv3d):
if border_mode == 'half' or subsample != (1, 1, 1):
return None
if border_mode == 'full':
direction_hint = 'bprop inputs'
elif border_mode == 'valid' and filter_dilation == (1, 1, 1):
direction_hint = 'bprop weights'
else:
return None
rval = dnn_conv3d(inp1, inp2,
border_mode=border_mode,
subsample=subsample,
dilation=filter_dilation,
direction_hint=direction_hint,
conv_mode=conv_mode)
elif isinstance(op, AbstractConv3d_gradWeights):
if(border_mode == 'valid' and subsample == (1, 1, 1) and
filter_dilation == (1, 1, 1)):
img = gpu_contiguous(inp1)
topgrad = gpu_contiguous(inp2)
ctx_name = infer_context_name(img, topgrad)
img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4))
ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
out_shp = get_conv_output_shape(ishape,
tshape,
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
out_shp = assert_conv_shape(out_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
desc = GpuDnnConvDesc(border_mode=border_mode,
subsample=subsample,
dilation=filter_dilation,
conv_mode='cross',
precision=precision)(out.shape)
conv = GpuDnnConv(algo=None)(img, topgrad, out, desc)
if conv_mode == 'conv':
conv = conv[:, :, ::-1, ::-1, ::-1]
rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
else:
return None
elif isinstance(op, AbstractConv3d_gradInputs):
if border_mode == 'valid' and subsample == (1, 1, 1):
kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3, 4))
topgrad = gpu_contiguous(inp2)
ctx_name = infer_context_name(kerns, topgrad)
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
desc = GpuDnnConvDesc(border_mode='full',
subsample=subsample,
dilation=filter_dilation,
conv_mode=conv_mode,
precision=precision)(kerns.shape)
tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
shape = get_conv_output_shape(tshape,
kshape,
border_mode='full',
subsample=subsample,
filter_dilation=filter_dilation)
shape = assert_conv_shape(shape)
out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
rval = GpuDnnConv(algo=None)(topgrad, kerns, out, desc)
else:
return None
return [rval]
@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
def local_abstractconv_gw_cudnn(node):
ctx = infer_context_name(*node.inputs)
......
......@@ -15,6 +15,7 @@ from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
LocalGroupDB,
SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import LocalMetaOptimizer
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
......@@ -23,7 +24,7 @@ from theano.scalar.basic import log, neg, true_div
from theano.scalar.basic_scipy import Erfinv, Erfcinv
from theano.scan_module import scan_utils, scan_op, scan_opt
-from theano.tensor.nnet import bn
+from theano.tensor.nnet import bn, conv3d2d
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
......@@ -32,12 +33,14 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
-AbstractConv3d_gradInputs)
+AbstractConv3d_gradInputs,
+get_conv_output_shape)
from theano.tensor.nnet.neighbours import Images2Neibs
from theano.tensor.nnet.ctc import ConnectionistTemporalClassification
import theano.tensor.nlinalg as nlinalg
import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg
from collections import Counter
from theano.tests.breakpoint import PdbBreakpoint
......@@ -1625,6 +1628,49 @@ def local_abstractconv_gemm(node):
return [rval]
@local_optimizer([AbstractConv2d])
def local_abstractconv_gemm_alt(node):
if not isinstance(node.op, AbstractConv2d):
return None
img, kern = node.inputs
if (not isinstance(img.type, GpuArrayType) or
not isinstance(kern.type, GpuArrayType)):
return None
ctx = infer_context_name(img, kern)
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
if border_mode == 'full' and subsample == (1, 1) and num_groups == 1:
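# A full convolution is the inputs-gradient of a valid convolution,
# so it can be dispatched directly to GpuCorrMM_gradInputs.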
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
kern = kern.dimshuffle(1, 0, 2, 3)
rval = GpuCorrMM_gradInputs('valid',
subsample,
filter_dilation)(
gpu_contiguous(kern), gpu_contiguous(img))
elif (border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
num_groups == 1):
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
rval = GpuCorrMM_gradWeights(border_mode,
subsample,
filter_dilation)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
rval = as_gpuarray_variable(rval.dimshuffle(1, 0, 2, 3),
context_name=ctx)
else:
return None
return [rval]
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_gemm(node):
if not isinstance(node.op, AbstractConv3d):
......@@ -1694,6 +1740,74 @@ def local_abstractconv3d_gemm(node):
return [rval]
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_alt(node):
if not isinstance(node.op, AbstractConv3d):
return None
img, kern = node.inputs
if (not isinstance(img.type, GpuArrayType) or
not isinstance(kern.type, GpuArrayType)):
return None
ctx = infer_context_name(img, kern)
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
if ((border_mode == 'full') and (subsample == (1, 1, 1))):
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
kern = kern.dimshuffle(1, 0, 2, 3, 4)
rval = GpuCorr3dMM_gradInputs('valid',
subsample,
filter_dilation)(
gpu_contiguous(kern), gpu_contiguous(img))
elif(subsample == (1, 1, 1) and filter_dilation == (1, 1, 1) and
border_mode == 'valid'):
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
rval = GpuCorr3dMM_gradWeights(border_mode,
subsample,
filter_dilation)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4)))
rval = as_gpuarray_variable(rval.dimshuffle(1, 0, 2, 3, 4),
context_name=ctx)
else:
return None
return [rval]
@local_optimizer([AbstractConv3d])
def local_abstractconv3d2d(node):
if not isinstance(node.op, AbstractConv3d):
return None
img, kern = node.inputs
if (not isinstance(img.type, GpuArrayType) or
not isinstance(kern.type, GpuArrayType)):
return None
ctx = infer_context_name(img, kern)
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
if subsample == (1, 1, 1) and filter_dilation == (1, 1, 1):
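# conv3d2d.conv3d expects signals laid out as (batch, depth, channels,
# rows, cols) while the abstract interface uses (batch, channels, depth,
# rows, cols); reorder_array swaps the channel and depth axes on the
# way in and again on the way out.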
reorder_array = [0, 2, 1, 3, 4]
rval = conv3d2d.conv3d(gpu_contiguous(img.dimshuffle(*reorder_array)),
gpu_contiguous(kern.dimshuffle(*reorder_array)),
[node.op.imshp[i] for i in reorder_array],
[node.op.kshp[i] for i in reorder_array],
border_mode=border_mode)
rval = as_gpuarray_variable(rval.dimshuffle(*reorder_array),
context_name=ctx)
return [rval]
else:
return None
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweights_gemm(node):
if not isinstance(node.op, AbstractConv2d_gradWeights):
......@@ -1716,6 +1830,70 @@ def local_abstractconv_gradweights_gemm(node):
return [rval]
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gemm_gradweights_alt(node):
if not isinstance(node.op, AbstractConv2d_gradWeights):
return None
img, topgrad, shape = node.inputs
if not isinstance(img.type, GpuArrayType) or \
not isinstance(topgrad.type, GpuArrayType):
return None
ctx = infer_context_name(img, topgrad)
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
if(border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
num_groups == 1):
rval = GpuCorrMM(border_mode,
subsample,
filter_dilation)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3)))
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1]
rval = rval.dimshuffle(1, 0, 2, 3)
rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
rval = as_gpuarray_variable(rval, context_name=ctx)
return [rval]
else:
return None
@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gemm_gradweights_alt(node):
if not isinstance(node.op, AbstractConv3d_gradWeights):
return None
img, topgrad, shape = node.inputs
if not isinstance(img.type, GpuArrayType) or \
not isinstance(topgrad.type, GpuArrayType):
return None
ctx = infer_context_name(img, topgrad)
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
if border_mode == 'valid' and subsample == (1, 1, 1) and filter_dilation == (1, 1, 1):
rval = GpuCorr3dMM(border_mode,
subsample,
filter_dilation)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4)))
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1, ::-1]
rval = rval.dimshuffle(1, 0, 2, 3, 4)
rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
rval = as_gpuarray_variable(rval, context_name=ctx)
return [rval]
else:
return None
@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweights_gemm(node):
if not isinstance(node.op, AbstractConv3d_gradWeights):
......@@ -1757,6 +1935,33 @@ def local_abstractconv_gradinputs_gemm(node):
return [rval]
@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm_alt(node):
if not isinstance(node.op, AbstractConv2d_gradInputs):
return None
kern, topgrad, shape = node.inputs
if not isinstance(kern.type, GpuArrayType) or \
not isinstance(topgrad.type, GpuArrayType):
return None
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1:
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
rval = GpuCorrMM(border_mode='full',
subsample=subsample,
filter_dilation=filter_dilation)(
gpu_contiguous(topgrad),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
return [rval]
else:
return None
@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(node):
if not isinstance(node.op, AbstractConv3d_gradInputs):
......@@ -1776,6 +1981,111 @@ def local_abstractconv3d_gradinputs_gemm(node):
return [rval]
@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm_alt(node):
if not isinstance(node.op, AbstractConv3d_gradInputs):
return None
kern, topgrad, shape = node.inputs
if not isinstance(kern.type, GpuArrayType) or \
not isinstance(topgrad.type, GpuArrayType):
return None
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
if border_mode == 'valid' and subsample == (1, 1, 1):
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
rval = GpuCorr3dMM(border_mode='full',
subsample=subsample,
filter_dilation=filter_dilation)(
gpu_contiguous(topgrad),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4)))
return [rval]
else:
return None
class ConvMetaOptimizer(LocalMetaOptimizer):
def __init__(self):
super(ConvMetaOptimizer, self).__init__()
def time_call(self, fn):
start = time.time()
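# GPU kernels are launched asynchronously; sync() blocks until the
# output is actually computed, so the elapsed wall-clock time measures
# the real execution.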
fn()[0].sync()
return time.time() - start
def provide_inputs(self, node, inputs):
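# Create shared variables holding random data that matches the node's
# static shapes, so that candidate implementations can be compiled and
# benchmarked; return an empty dict if any needed shape is unknown.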
result = {}
shapes = (node.op.imshp, node.op.kshp)
if(node.op.imshp is None or node.op.kshp is None or
any([s is None for shape in shapes for s in shape])):
return result
if type(node.op) in [AbstractConv2d, AbstractConv3d]:
img, kern = node.inputs
for(var, shape) in zip((img, kern), shapes):
result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
var.name,
broadcastable=var.broadcastable,
borrow=True)
if type(node.op) in [AbstractConv2d_gradWeights, AbstractConv3d_gradWeights]:
img, top, kshape = node.inputs
tshp = get_conv_output_shape(node.op.imshp,
node.op.kshp,
node.op.border_mode,
node.op.subsample,
node.op.filter_dilation)
result[kshape] = theano.tensor.as_tensor_variable(node.op.kshp[2:])
for(var, shape) in zip((img, top), (node.op.imshp, tshp)):
result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
var.name,
broadcastable=var.broadcastable,
borrow=True)
if type(node.op) in [AbstractConv2d_gradInputs, AbstractConv3d_gradInputs]:
kern, top, ishape = node.inputs
tshp = get_conv_output_shape(node.op.imshp,
node.op.kshp,
node.op.border_mode,
node.op.subsample,
node.op.filter_dilation)
result[ishape] = theano.tensor.as_tensor_variable(node.op.imshp[2:])
for(var, shape) in zip((kern, top), (node.op.kshp, tshp)):
result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
var.name,
broadcastable=var.broadcastable,
borrow=True)
return result
def get_opts(self, node):
opts = Counter([opt for opt in self.track_dict[type(node.op)]
if opt in self.tag_dict['default']])
include_tags = config.metaopt.optimizer_including.split(':')
exclude_tags = config.metaopt.optimizer_excluding.split(':')
for in_opt in include_tags:
opts.update([opt for opt in self.track_dict[type(node.op)]
if opt in self.tag_dict[in_opt]])
for ex_opt in exclude_tags:
opts.subtract([opt for opt in self.track_dict[type(node.op)]
if opt in self.tag_dict[ex_opt]])
opts = list(opts + Counter())
return opts
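The Counter arithmetic above treats tags as votes; a standalone sketch of
the idiom, with placeholder optimizer names:

    from collections import Counter

    opts = Counter(['opt_a', 'opt_b'])  # registered under 'default'
    opts.update(['opt_c'])              # an including tag adds a vote
    opts.subtract(['opt_b'])            # an excluding tag removes one
    # Adding an empty Counter keeps only entries with a positive count.
    assert sorted(opts + Counter()) == ['opt_a', 'opt_c']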
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@op_lifter([AbstractConv2d,
......@@ -2354,8 +2664,12 @@ register_opt('fast_compile')(abstractconv_groupopt)
# We import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn
-from .dnn import (local_abstractconv_cudnn, local_abstractconv_gw_cudnn,
-                  local_abstractconv_gi_cudnn)  # noqa: 402
+from .dnn import (local_abstractconv_cudnn,
+                  local_abstractconv_gw_cudnn,
+                  local_abstractconv_gi_cudnn,  # noqa: 402
+                  local_abstractconv_cudnn_alt,
+                  local_abstractconv3d_cudnn_alt)
abstractconv_groupopt.register('local_abstractconv_dnn',
local_abstractconv_cudnn, 20,
'conv_dnn',
......@@ -2393,6 +2707,46 @@ abstractconv_groupopt.register('local_abstractconv3d_gradinputs',
'conv_gemm',
'gpuarray', 'fast_compile', 'fast_run')
conv_metaopt = ConvMetaOptimizer()
conv_metaopt.register(local_abstractconv_cudnn,
['default', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv_gw_cudnn,
['default', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv_gi_cudnn,
['default', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv_gemm,
['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gemm,
['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gradweights_gemm,
['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gradweights_gemm,
['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gradinputs_gemm,
['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gradinputs_gemm,
['default', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gemm_alt,
['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gemm_gradweights_alt,
['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_gradinputs_gemm_alt,
['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv_cudnn_alt,
['default', 'alternative', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv3d_cudnn_alt,
['default', 'alternative', 'cudnn', 'conv_dnn'])
conv_metaopt.register(local_abstractconv3d_alt,
['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gemm_gradweights_alt,
['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d_gradinputs_gemm_alt,
['default', 'alternative', 'conv_gemm'])
conv_metaopt.register(local_abstractconv3d2d,
['alternative', 'conv3d2d'])
abstractconv_groupopt.register('conv_metaopt', conv_metaopt, 'conv_meta', position=0)
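With everything registered, the meta-optimizer is opted into per
compilation through the 'conv_meta' tag. A usage sketch (assumes a
GPU-enabled Theano; the shapes are illustrative, and static shapes are
needed so provide_inputs() can build benchmark data):

    import theano
    import theano.tensor as T
    from theano.tensor.nnet import conv2d

    theano.config.metaopt.verbose = 1
    x = T.tensor4('x')
    w = T.tensor4('w')
    y = conv2d(x, w, input_shape=(8, 3, 32, 32), filter_shape=(16, 3, 5, 5))
    mode = theano.compile.get_default_mode().including('conv_meta')
    f = theano.function([x, w], y, mode=mode)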
# Register cuDNN batch normalization implementation
......
......@@ -22,6 +22,9 @@ from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, SkipTest
import unittest
from theano.tensor.nnet import abstract_conv
from theano.gpuarray import dnn, blas
def test_local_assert():
......@@ -699,3 +702,200 @@ def test_crossentropycategorical1hot_lifter():
for n in f.maker.fgraph.apply_nodes)
f(rng.uniform(0.1, 0.9, (13, 5)).astype(theano.config.floatX),
rng.randint(5, size=(13,)))
class Conv_opt_test(unittest.TestCase):
def optimizer_2d(self, input_shapes, direction, include_tags, exclude_tags,
op, border_mode='valid', subsample=(1, 1), filter_dilation=(1, 1)):
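# Build a conv graph for the requested direction (0: forward,
# 1: gradient wrt weights, 2: gradient wrt inputs), compile it with and
# without 'conv_meta', then check that the expected op was selected and
# that both functions agree numerically.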
inp1 = theano.shared(np.random.random(input_shapes[0]).astype(theano.config.floatX))
inp2 = theano.shared(np.random.random(input_shapes[1]).astype(theano.config.floatX))
if(direction == 0):
conv_op = abstract_conv.conv2d(inp1,
inp2,
input_shapes[0],
input_shapes[1],
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
if(direction == 1):
conv_op = abstract_conv.conv2d_grad_wrt_weights(inp1,
inp2,
input_shapes[2],
input_shapes[0],
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
if(direction == 2):
conv_op = abstract_conv.conv2d_grad_wrt_inputs(inp1,
inp2,
input_shapes[2],
input_shapes[1],
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
theano.config.metaopt.optimizer_including = include_tags
theano.config.metaopt.optimizer_excluding = exclude_tags
mode = mode_with_gpu.including('conv_meta')
ref_func = theano.function([], conv_op, mode=mode_with_gpu)
conv_func = theano.function([], conv_op, mode=mode)
assert any([isinstance(node.op, op)
for node in conv_func.maker.fgraph.toposort()])
utt.assert_allclose(conv_func(), ref_func())
def optimizer_3d(self, input_shapes, direction, include_tags, exclude_tags,
op, border_mode='valid', subsample=(1, 1, 1),
filter_dilation=(1, 1, 1)):
inp1 = theano.shared(np.random.random(input_shapes[0]).astype(theano.config.floatX))
inp2 = theano.shared(np.random.random(input_shapes[1]).astype(theano.config.floatX))
if(direction == 0):
conv_op = abstract_conv.conv3d(inp1,
inp2,
input_shapes[0],
input_shapes[1],
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
if(direction == 1):
conv_op = abstract_conv.conv3d_grad_wrt_weights(inp1,
inp2,
input_shapes[2],
input_shapes[0],
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
if(direction == 2):
conv_op = abstract_conv.conv3d_grad_wrt_inputs(inp1,
inp2,
input_shapes[2],
input_shapes[1],
border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
theano.config.metaopt.optimizer_including = include_tags
theano.config.metaopt.optimizer_excluding = exclude_tags
mode = mode_with_gpu.including('conv_meta')
ref_func = theano.function([], conv_op, mode=mode_with_gpu)
conv_func = theano.function([], conv_op, mode=mode)
if op is not None:
assert any([isinstance(node.op, op)
for node in conv_func.maker.fgraph.toposort()])
utt.assert_allclose(conv_func(), ref_func())
def test_optimizers(self):
imshp2d = [(2, 3, 5, 5), (2, 2, 5, 7), (2, 1, 3, 3)]
kshp2d = [(4, 3, 3, 3), (3, 2, 3, 5), (4, 1, 1, 1)]
tshp2d = [(2, 4, 3, 3), (2, 3, 3, 3), (2, 4, 3, 3)]
for imshp, kshp, tshp in zip(imshp2d, kshp2d, tshp2d):
# forward passes
self.optimizer_2d([imshp, kshp, tshp], 0,
'alternative',
'conv_dnn:default',
blas.GpuCorrMM_gradWeights)
self.optimizer_2d([imshp, kshp, tshp], 0,
'alternative',
'conv_gemm:default',
dnn.GpuDnnConvGradW)
# backwards wrt weights
self.optimizer_2d([imshp, tshp, kshp], 1,
'alternative',
'conv_dnn:default',
blas.GpuCorrMM)
self.optimizer_2d([imshp, tshp, kshp], 1,
'alternative',
'conv_gemm:default',
dnn.GpuDnnConv)
# backwards wrt to inputs
self.optimizer_2d([tshp, kshp, imshp], 2,
'alternative',
'conv_dnn:default',
blas.GpuCorrMM)
self.optimizer_2d([tshp, kshp, imshp], 2,
'alternative',
'conv_gemm:default',
dnn.GpuDnnConv)
imshp3d = [(2, 3, 5, 5, 5), (2, 2, 5, 7, 5), (2, 1, 3, 3, 3)]
kshp3d = [(4, 3, 3, 3, 3), (3, 2, 3, 5, 3), (4, 1, 1, 1, 1)]
tshp3d = [(2, 4, 3, 3, 3), (2, 3, 3, 3, 3), (2, 4, 3, 3, 3)]
for imshp, kshp, tshp in zip(imshp3d, kshp3d, tshp3d):
# forwards passes
self.optimizer_3d([imshp, kshp, tshp], 0,
'alternative',
'conv_dnn:default:conv3d2d',
blas.GpuCorr3dMM_gradWeights)
self.optimizer_3d([imshp, kshp, tshp], 0,
'conv3d2d',
'default',
None)
self.optimizer_3d([imshp, kshp, tshp], 0,
'alternative',
'conv_gemm:default:conv3d2d',
dnn.GpuDnnConvGradW)
# backward pass wrt weight
self.optimizer_3d([imshp, tshp, kshp], 1,
'alternative',
'conv_dnn:default',
blas.GpuCorr3dMM)
self.optimizer_3d([imshp, tshp, kshp], 1,
'alternative',
'conv_gemm:default',
dnn.GpuDnnConv)
# backward pass wrt inputs
self.optimizer_3d([tshp, kshp, imshp], 2,
'alternative',
'conv_dnn:default',
blas.GpuCorr3dMM)
self.optimizer_3d([tshp, kshp, imshp], 2,
'alternative',
'conv_gemm:default',
dnn.GpuDnnConv)
# conv2d forward pass with non-default border_mode and filter_dilation
imshp2d = [(2, 3, 5, 5), (4, 2, 5, 5)]
kshp2d = [(4, 3, 3, 3), (3, 2, 3, 3)]
filter_dilation = [(1, 1), (2, 2)]
for imshp, kshp, fdil in zip(imshp2d, kshp2d, filter_dilation):
self.optimizer_2d([imshp, kshp], 0,
'alternative',
'conv_dnn:default',
blas.GpuCorrMM_gradInputs,
border_mode='full',
filter_dilation=fdil)
# dilation > (1, 1) works only for cuDNN >= 6.0
self.optimizer_2d([imshp, kshp], 0,
'alternative',
'conv_gemm:default',
dnn.GpuDnnConvGradI,
border_mode='full',
filter_dilation=fdil)
# conv3d forward pass with non-default border_mode and filter_dilation
imshp3d = [(2, 3, 5, 5, 5), (4, 2, 5, 5, 5)]
kshp3d = [(4, 3, 3, 3, 3), (3, 2, 3, 3, 3)]
filter_dilation = [(1, 1, 1), (2, 2, 2)]
for imshp, kshp, fdil in zip(imshp3d, kshp3d, filter_dilation):
self.optimizer_3d([imshp, kshp], 0,
'alternative',
'conv_dnn:default:conv3d2d',
blas.GpuCorr3dMM_gradInputs,
border_mode='full',
filter_dilation=fdil)
# dilation > (1, 1, 1) works only for cuDNN >= 6.0
self.optimizer_3d([imshp, kshp], 0,
'alternative',
'conv_gemm:default:conv3d2d',
dnn.GpuDnnConvGradI,
border_mode='full',
filter_dilation=fdil)