提交 ed415454 authored 作者: Razvan Pascanu's avatar Razvan Pascanu 提交者: Razvan Pascanu

More sane inplace optimization

The previous optimization was all or nothing, which is a bad compromise for scan. The new optimization tries to make each output work in place individually and keeps only those outputs for which this succeeds. This way the entire op does not end up non-inplace because of a single output that cannot be computed in place.
上级 f57b7b77
import logging
_logger = logging.getLogger('theano.sandbox.cuda.opt')
import copy
import sys
import warnings
import numpy
import theano
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano import scalar as scal
from theano import tensor, compile, gof
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler,
EquilibriumOptimizer)
InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType
......@@ -1431,7 +1433,7 @@ def gpuScanOptimization(node):
# merged or implement this optimization as a global
# optimization
thescan = host_input.owner.op
info = thescan.info.copy()
info = copy.deepcopy(thescan.info)
info['gpu'] = True
inputs = host_input.owner.inputs
nw_ins = [inputs[0]]
......@@ -1478,7 +1480,7 @@ def gpuScanOptimization(node):
for i in node.inputs]):
thescan = node.op
info = thescan.info.copy()
info = copy.deepcopy(thescan.info)
info['gpu'] = True
inputs = node.inputs
nw_ins = [inputs[0]]
......@@ -1527,41 +1529,9 @@ def gpuScanOptimization(node):
return False
@gof.local_optimizer([None])
def gpu_scan_make_inplace(node):
    """Rebuild a GPU Scan node so that it is flagged as running inplace.

    Only applies when ``node.op`` is a ``scan_op.Scan`` whose ``info`` has
    ``'inplace'`` falsy and ``'gpu'`` truthy.  Returns the outputs of the
    newly built node, or ``False`` when the node is left untouched.
    """
    scan = node.op
    if not isinstance(scan, scan_op.Scan):
        return False
    if scan.info['inplace'] or not scan.info['gpu']:
        return False

    new_info = scan.info.copy()
    new_info['inplace'] = True

    # n_steps followed by the sequences
    head = node.inputs[:1 + scan.n_seqs]

    # NOTE(review): the non-GPU counterpart passes ``node.inputs`` to these
    # helpers while this function passes ``node`` -- confirm which form the
    # helpers expect.
    recurrent = scan.outer_mitmot(node)
    recurrent += scan.outer_mitsot(node)
    recurrent += scan.outer_sitsot(node)

    tail = scan.outer_shared(node)
    tail += scan.outer_nitsot(node)
    tail += scan.outer_non_seqs(node)

    # An input that already occurs earlier in the recurrent list is replaced
    # by a deep copy, so each position ends up with a distinct variable.
    for position, variable in enumerate(recurrent):
        if variable in recurrent[:position]:
            recurrent[position] = compile.function_module.deep_copy_op(
                variable)

    all_inputs = head + recurrent + tail
    # The dtype argument is accepted but not forwarded to CudaNdarrayType.
    make_type = lambda broadcastable, dtype: CudaNdarrayType(
        broadcastable=broadcastable)
    inplace_scan = scan_op.Scan(scan.inputs,
                                scan.outputs,
                                new_info,
                                typeConstructor=make_type)
    return inplace_scan.make_node(*all_inputs).outputs
optdb.register('gpu_scanOp_make_inplace',
theano.tensor.opt.in2out(
gpu_scan_make_inplace, ignore_newtrees=True),
scan_opt.ScanInplaceOptimizer(typeConstructor=CudaNdarrayType,
gpu_flag=True),
75,
'gpu',
'fast_run',
......
差异被折叠。
......@@ -949,7 +949,8 @@ def scan(fn,
info['truncate_gradient'] = truncate_gradient
info['name'] = name
info['mode'] = mode
info['inplace'] = False
info['inplace'] = -1
info['destroy_map'] = {}
info['gpu'] = False
info['as_while'] = as_while
info['profile'] = profile
......
......@@ -94,12 +94,6 @@ class Scan(PureOp):
if self.as_while:
self.output_types = self.output_types[:-1]
self.destroy_map = {}
if hasattr(self, 'inplace') and self.inplace:
for idx in xrange(self.n_mit_mot + self.n_mit_sot +
self.n_sit_sot):
self.destroy_map[idx] = [idx + 1 + self.n_seqs]
mode_instance = compile.mode.get_mode(self.mode)
# if the default mode is used, and that mode is ProfileMode
......@@ -411,12 +405,22 @@ class Scan(PureOp):
name = 'do_while'
else:
name = 'for'
if self.inplace:
aux_txt = '%s{inplace,%s,%s}' % (name, gpu_str, str(self.name))
aux_txt = '%s'
if len(self.destroy_map.keys()) > 0:
# Check if all outputs are inplace
if (sorted(self.destroy_map.keys()) == \
sorted(range(self.n_mit_mot +
self.n_mit_sot +
self.n_sit_sot))):
aux_txt += 'all_inplace,%s,%s}'
else:
aux_txt += '{inplace{'
for k in self.destroy_map.keys():
aux_txt += str(k) + ','
aux_txt += '},%s,%s}'
else:
aux_txt = '%s{%s,%s}' % (name, gpu_str, str(self.name))
aux_txt +='{%s,%s}'
aux_txt = aux_txt % (name, gpu_str, str(self.name))
return aux_txt
def __hash__(self):
......
......@@ -13,6 +13,7 @@ __copyright__ = "(c) 2010, Universite de Montreal"
__contact__ = "Razvan Pascanu <r.pascanu@gmail>"
import logging
import copy
import numpy
import theano
......@@ -20,6 +21,8 @@ from theano import tensor
from theano.tensor import opt, get_constant_value
from theano import gof
from theano.gof.python25 import maxsize
from theano.gof.opt import Optimizer
from theano.gof import toolbox, DestroyHandler, InconsistencyError
from theano.compile import optdb
from theano.compile.function_module import deep_copy_op
......@@ -117,7 +120,7 @@ def remove_constants_and_unused_inputs_scan(node):
if len(nw_inner) != len(op_ins):
op_outs = scan_utils.clone(op_outs, replace=givens)
nw_info = op.info.copy()
nw_info = copy.deepcopy(op.info)
nw_info['n_seqs'] = nw_n_seqs
# DEBUG CHECK
nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
......@@ -304,36 +307,68 @@ scan_seqopt.register('scanOp_pushout_nonseqs_ops',
'scan')
@gof.local_optimizer([None])
def scan_make_inplace(node):
    """Rebuild a non-GPU Scan node so that it is flagged as running inplace.

    Only applies when ``node.op`` is a ``scan_op.Scan`` whose ``info`` has
    both ``'inplace'`` and ``'gpu'`` falsy.  Returns the outputs of the
    newly built node, or ``False`` when the node is left untouched.
    """
    scan = node.op
    if not isinstance(scan, scan_op.Scan):
        return False
    if scan.info['inplace'] or scan.info['gpu']:
        return False

    new_info = scan.info.copy()
    new_info['inplace'] = True

    # n_steps followed by the sequences
    head = node.inputs[:1 + scan.n_seqs]

    recurrent = scan.outer_mitmot(node.inputs)
    recurrent += scan.outer_mitsot(node.inputs)
    recurrent += scan.outer_sitsot(node.inputs)

    tail = scan.outer_shared(node.inputs)
    tail += scan.outer_nitsot(node.inputs)
    tail += scan.outer_non_seqs(node.inputs)

    # An input that already occurs earlier in the recurrent list is replaced
    # by a deep copy, so each position ends up with a distinct variable.
    for position, variable in enumerate(recurrent):
        if variable in recurrent[:position]:
            recurrent[position] = deep_copy_op(variable)

    all_inputs = head + recurrent + tail
    inplace_scan = scan_op.Scan(scan.inputs,
                                scan.outputs,
                                new_info)
    return inplace_scan.make_node(*all_inputs).outputs
class ScanInplaceOptimizer(Optimizer):
    """Graph optimizer that makes Scan outputs inplace one at a time.

    For every Scan node whose ``info['gpu']`` equals ``gpu_flag``, each
    mit_mot / mit_sot / sit_sot output position is tried in turn: a new
    Scan op is built with that position added to ``destroy_map`` and the
    replacement is kept only if the env validates it.
    """

    def __init__(self, typeConstructor=None, gpu_flag=False):
        """Store the type constructor handed to each new Scan and the
        value ``info['gpu']`` must have for a node to be considered."""
        Optimizer.__init__(self)
        self.typeConstructor = typeConstructor
        self.gpu_flag = gpu_flag

    def add_requirements(self, env):
        """Attach the ReplaceValidate and DestroyHandler features."""
        env.extend(toolbox.ReplaceValidate())
        env.extend(DestroyHandler())

    def apply(self, env):
        """Try to make each recurrent output of each Scan node inplace."""
        candidates = [apply_node for apply_node in env.toposort()
                      if (isinstance(apply_node.op, scan_op.Scan) and
                          apply_node.op.info['gpu'] == self.gpu_flag)]
        for node in candidates:
            op = node.op
            n_recurrent = (op.info['n_mit_mot'] +
                           op.info['n_mit_sot'] +
                           op.info['n_sit_sot'])
            for pos in xrange(n_recurrent):
                # Start from the info of the last accepted op, so entries
                # from earlier successful positions are carried along and
                # rejected ones are dropped.
                info = copy.deepcopy(op.info)
                if 'destroy_map' not in info:
                    info['destroy_map'] = {}
                info['destroy_map'][pos] = [pos + 1 + op.info['n_seqs']]

                # n_steps followed by the sequences
                head = node.inputs[:1 + op.n_seqs]

                recurrent = op.outer_mitmot(node.inputs)
                recurrent += op.outer_mitsot(node.inputs)
                recurrent += op.outer_sitsot(node.inputs)

                tail = op.outer_shared(node.inputs)
                tail += op.outer_nitsot(node.inputs)
                tail += op.outer_non_seqs(node.inputs)

                # An input that already occurs earlier in the recurrent
                # list is replaced by a deep copy.
                for idx, variable in enumerate(recurrent):
                    if variable in recurrent[:idx]:
                        recurrent[idx] = deep_copy_op(variable)

                all_inputs = head + recurrent + tail
                new_op = scan_op.Scan(op.inputs,
                                      op.outputs,
                                      info,
                                      typeConstructor=self.typeConstructor)
                new_outs = new_op.make_node(*all_inputs).outputs
                try:
                    env.replace_all_validate(
                        zip(node.outputs, new_outs),
                        reason=self.__class__.__name__)
                except InconsistencyError:
                    # Failed moving this output to be computed inplace;
                    # keep the previous op/node and try the next position.
                    pass
                else:
                    op = new_op
                    node = new_outs[0].owner
optdb.register('scanOp_make_inplace',
opt.in2out(scan_make_inplace, ignore_newtrees=True),
ScanInplaceOptimizer(typeConstructor=None,
gpu_flag=False),
75,
'fast_run',
'inplace',
......
......@@ -775,8 +775,11 @@ class T_Scan(unittest.TestCase):
updates=updates,
mode=mode,
allow_input_downcast=True)
# compute output in numpy
scan_node = [x for x in f9.maker.env.toposort()
if isinstance(x.op, theano.scan_module.scan_op.Scan)]
assert 0 in scan_node[0].op.destroy_map.keys()
assert 1 in scan_node[0].op.destroy_map.keys()
# compute output in numpy
numpy_x0 = numpy.zeros((3,))
numpy_x1 = numpy.zeros((3,))
numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu2[0]
......@@ -852,6 +855,10 @@ class T_Scan(unittest.TestCase):
mode=mode,
allow_input_downcast=True)
scan_node = [x for x in f9.maker.env.toposort()
if isinstance(x.op, theano.scan_module.scan_op.Scan)]
assert 0 in scan_node[0].op.destroy_map.keys()
assert 1 in scan_node[0].op.destroy_map.keys()
# compute output in numpy
numpy_x0 = numpy.zeros((3,))
numpy_x1 = numpy.zeros((3,))
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论