提交 ed415454 authored 作者: Razvan Pascanu's avatar Razvan Pascanu 提交者: Razvan Pascanu

More sane inplace optimization

The previous optimization was all-or-nothing, which is a bad compromise for scan. The new optimization tries to make each output work in place individually, and keeps only those outputs for which this succeeds. This way the entire op no longer ends up non-inplace because of a single output that cannot be computed in place.
上级 f57b7b77
import logging import logging
_logger = logging.getLogger('theano.sandbox.cuda.opt') _logger = logging.getLogger('theano.sandbox.cuda.opt')
import copy
import sys import sys
import warnings import warnings
import numpy import numpy
import theano import theano
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano import scalar as scal from theano import scalar as scal
from theano import tensor, compile, gof from theano import tensor, compile, gof
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler, Optimizer, toolbox, DestroyHandler,
EquilibriumOptimizer) InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import * from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
...@@ -1431,7 +1433,7 @@ def gpuScanOptimization(node): ...@@ -1431,7 +1433,7 @@ def gpuScanOptimization(node):
# merged or implement this optimization as a global # merged or implement this optimization as a global
# optimization # optimization
thescan = host_input.owner.op thescan = host_input.owner.op
info = thescan.info.copy() info = copy.deepcopy(thescan.info)
info['gpu'] = True info['gpu'] = True
inputs = host_input.owner.inputs inputs = host_input.owner.inputs
nw_ins = [inputs[0]] nw_ins = [inputs[0]]
...@@ -1478,7 +1480,7 @@ def gpuScanOptimization(node): ...@@ -1478,7 +1480,7 @@ def gpuScanOptimization(node):
for i in node.inputs]): for i in node.inputs]):
thescan = node.op thescan = node.op
info = thescan.info.copy() info = copy.deepcopy(thescan.info)
info['gpu'] = True info['gpu'] = True
inputs = node.inputs inputs = node.inputs
nw_ins = [inputs[0]] nw_ins = [inputs[0]]
...@@ -1527,41 +1529,9 @@ def gpuScanOptimization(node): ...@@ -1527,41 +1529,9 @@ def gpuScanOptimization(node):
return False return False
@gof.local_optimizer([None])
def gpu_scan_make_inplace(node):
op = node.op
if (isinstance(op, scan_op.Scan) and
(not op.info['inplace']) and
(op.info['gpu'])):
info = op.info.copy()
info['inplace'] = True
# inputs corresponding to sequences and n_steps
ls_begin = node.inputs[:1 + op.n_seqs]
ls = op.outer_mitmot(node)
ls += op.outer_mitsot(node)
ls += op.outer_sitsot(node)
ls_end = op.outer_shared(node)
ls_end += op.outer_nitsot(node)
ls_end += op.outer_non_seqs(node)
n_outs = len(ls)
for idx in xrange(n_outs):
if ls[idx] in ls[:idx]:
ls[idx] = compile.function_module.deep_copy_op(ls[idx])
inputs = ls_begin + ls + ls_end
typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
broadcastable=broadcastable)
new_op = scan_op.Scan(op.inputs,
op.outputs,
info,
typeConstructor=typeConstructor)
return new_op.make_node(*inputs).outputs
return False
optdb.register('gpu_scanOp_make_inplace', optdb.register('gpu_scanOp_make_inplace',
theano.tensor.opt.in2out( scan_opt.ScanInplaceOptimizer(typeConstructor=CudaNdarrayType,
gpu_scan_make_inplace, ignore_newtrees=True), gpu_flag=True),
75, 75,
'gpu', 'gpu',
'fast_run', 'fast_run',
......
差异被折叠。
...@@ -949,7 +949,8 @@ def scan(fn, ...@@ -949,7 +949,8 @@ def scan(fn,
info['truncate_gradient'] = truncate_gradient info['truncate_gradient'] = truncate_gradient
info['name'] = name info['name'] = name
info['mode'] = mode info['mode'] = mode
info['inplace'] = False info['inplace'] = -1
info['destroy_map'] = {}
info['gpu'] = False info['gpu'] = False
info['as_while'] = as_while info['as_while'] = as_while
info['profile'] = profile info['profile'] = profile
......
...@@ -94,12 +94,6 @@ class Scan(PureOp): ...@@ -94,12 +94,6 @@ class Scan(PureOp):
if self.as_while: if self.as_while:
self.output_types = self.output_types[:-1] self.output_types = self.output_types[:-1]
self.destroy_map = {}
if hasattr(self, 'inplace') and self.inplace:
for idx in xrange(self.n_mit_mot + self.n_mit_sot +
self.n_sit_sot):
self.destroy_map[idx] = [idx + 1 + self.n_seqs]
mode_instance = compile.mode.get_mode(self.mode) mode_instance = compile.mode.get_mode(self.mode)
# if the default mode is used, and that mode is ProfileMode # if the default mode is used, and that mode is ProfileMode
...@@ -411,12 +405,22 @@ class Scan(PureOp): ...@@ -411,12 +405,22 @@ class Scan(PureOp):
name = 'do_while' name = 'do_while'
else: else:
name = 'for' name = 'for'
aux_txt = '%s'
if self.inplace: if len(self.destroy_map.keys()) > 0:
aux_txt = '%s{inplace,%s,%s}' % (name, gpu_str, str(self.name)) # Check if all outputs are inplace
if (sorted(self.destroy_map.keys()) == \
sorted(range(self.n_mit_mot +
self.n_mit_sot +
self.n_sit_sot))):
aux_txt += 'all_inplace,%s,%s}'
else:
aux_txt += '{inplace{'
for k in self.destroy_map.keys():
aux_txt += str(k) + ','
aux_txt += '},%s,%s}'
else: else:
aux_txt = '%s{%s,%s}' % (name, gpu_str, str(self.name)) aux_txt +='{%s,%s}'
aux_txt = aux_txt % (name, gpu_str, str(self.name))
return aux_txt return aux_txt
def __hash__(self): def __hash__(self):
......
...@@ -13,6 +13,7 @@ __copyright__ = "(c) 2010, Universite de Montreal" ...@@ -13,6 +13,7 @@ __copyright__ = "(c) 2010, Universite de Montreal"
__contact__ = "Razvan Pascanu <r.pascanu@gmail>" __contact__ = "Razvan Pascanu <r.pascanu@gmail>"
import logging import logging
import copy
import numpy import numpy
import theano import theano
...@@ -20,6 +21,8 @@ from theano import tensor ...@@ -20,6 +21,8 @@ from theano import tensor
from theano.tensor import opt, get_constant_value from theano.tensor import opt, get_constant_value
from theano import gof from theano import gof
from theano.gof.python25 import maxsize from theano.gof.python25 import maxsize
from theano.gof.opt import Optimizer
from theano.gof import toolbox, DestroyHandler, InconsistencyError
from theano.compile import optdb from theano.compile import optdb
from theano.compile.function_module import deep_copy_op from theano.compile.function_module import deep_copy_op
...@@ -117,7 +120,7 @@ def remove_constants_and_unused_inputs_scan(node): ...@@ -117,7 +120,7 @@ def remove_constants_and_unused_inputs_scan(node):
if len(nw_inner) != len(op_ins): if len(nw_inner) != len(op_ins):
op_outs = scan_utils.clone(op_outs, replace=givens) op_outs = scan_utils.clone(op_outs, replace=givens)
nw_info = op.info.copy() nw_info = copy.deepcopy(op.info)
nw_info['n_seqs'] = nw_n_seqs nw_info['n_seqs'] = nw_n_seqs
# DEBUG CHECK # DEBUG CHECK
nwScan = scan_op.Scan(nw_inner, op_outs, nw_info) nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
...@@ -304,36 +307,68 @@ scan_seqopt.register('scanOp_pushout_nonseqs_ops', ...@@ -304,36 +307,68 @@ scan_seqopt.register('scanOp_pushout_nonseqs_ops',
'scan') 'scan')
@gof.local_optimizer([None])
def scan_make_inplace(node): class ScanInplaceOptimizer(Optimizer):
op = node.op """Graph optimizer for Scan(makes it run inplace)"""
if (isinstance(op, scan_op.Scan) and def __init__(self, typeConstructor=None, gpu_flag=False):
(not op.info['inplace']) and Optimizer.__init__(self)
(not op.info['gpu'])): self.typeConstructor = typeConstructor
info = op.info.copy() self.gpu_flag = gpu_flag
info['inplace'] = True
# inputs corresponding to sequences and n_steps def add_requirements(self, env):
ls_begin = node.inputs[:1 + op.n_seqs] env.extend(toolbox.ReplaceValidate())
ls = op.outer_mitmot(node.inputs) env.extend(DestroyHandler())
ls += op.outer_mitsot(node.inputs)
ls += op.outer_sitsot(node.inputs) def apply(self, env):
ls_end = op.outer_shared(node.inputs)
ls_end += op.outer_nitsot(node.inputs) nodes = env.toposort()
ls_end += op.outer_non_seqs(node.inputs) scan_nodes = [x for x in nodes
n_outs = len(ls) if (isinstance(x.op, scan_op.Scan) and
for idx in xrange(n_outs): x.op.info['gpu']== self.gpu_flag)]
if ls[idx] in ls[:idx]: for scan_idx in xrange(len(scan_nodes)):
ls[idx] = deep_copy_op(ls[idx]) node = scan_nodes[scan_idx]
op = node.op
inputs = ls_begin + ls + ls_end n_outs = (op.info['n_mit_mot'] +
new_op = scan_op.Scan(op.inputs, op.info['n_mit_sot'] +
op.outputs, op.info['n_sit_sot'])
info) for pos in xrange(n_outs):
return new_op.make_node(*inputs).outputs info = copy.deepcopy(op.info)
return False if not 'destroy_map' in info:
info['destroy_map'] = {}
info['destroy_map'][pos] = [pos + 1 + op.info['n_seqs']]
# inputs corresponding to sequences and n_steps
ls_begin = node.inputs[:1 + op.n_seqs]
ls = op.outer_mitmot(node.inputs)
ls += op.outer_mitsot(node.inputs)
ls += op.outer_sitsot(node.inputs)
ls_end = op.outer_shared(node.inputs)
ls_end += op.outer_nitsot(node.inputs)
ls_end += op.outer_non_seqs(node.inputs)
n_outs = len(ls)
for idx in xrange(n_outs):
if ls[idx] in ls[:idx]:
ls[idx] = deep_copy_op(ls[idx])
inputs = ls_begin + ls + ls_end
new_op = scan_op.Scan(op.inputs,
op.outputs,
info,
typeConstructor=self.typeConstructor)
new_outs = new_op.make_node(*inputs).outputs
try:
env.replace_all_validate(
zip(node.outputs, new_outs),
reason=self.__class__.__name__)
op = new_op
node = new_outs[0].owner
except InconsistencyError, e:
# Failed to make this output be computed inplace
pass
optdb.register('scanOp_make_inplace', optdb.register('scanOp_make_inplace',
opt.in2out(scan_make_inplace, ignore_newtrees=True), ScanInplaceOptimizer(typeConstructor=None,
gpu_flag=False),
75, 75,
'fast_run', 'fast_run',
'inplace', 'inplace',
......
...@@ -775,8 +775,11 @@ class T_Scan(unittest.TestCase): ...@@ -775,8 +775,11 @@ class T_Scan(unittest.TestCase):
updates=updates, updates=updates,
mode=mode, mode=mode,
allow_input_downcast=True) allow_input_downcast=True)
scan_node = [x for x in f9.maker.env.toposort()
# compute output in numpy if isinstance(x.op, theano.scan_module.scan_op.Scan)]
assert 0 in scan_node[0].op.destroy_map.keys()
assert 1 in scan_node[0].op.destroy_map.keys()
# compute output in numpy
numpy_x0 = numpy.zeros((3,)) numpy_x0 = numpy.zeros((3,))
numpy_x1 = numpy.zeros((3,)) numpy_x1 = numpy.zeros((3,))
numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu2[0] numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu2[0]
...@@ -852,6 +855,10 @@ class T_Scan(unittest.TestCase): ...@@ -852,6 +855,10 @@ class T_Scan(unittest.TestCase):
mode=mode, mode=mode,
allow_input_downcast=True) allow_input_downcast=True)
scan_node = [x for x in f9.maker.env.toposort()
if isinstance(x.op, theano.scan_module.scan_op.Scan)]
assert 0 in scan_node[0].op.destroy_map.keys()
assert 1 in scan_node[0].op.destroy_map.keys()
# compute output in numpy # compute output in numpy
numpy_x0 = numpy.zeros((3,)) numpy_x0 = numpy.zeros((3,))
numpy_x1 = numpy.zeros((3,)) numpy_x1 = numpy.zeros((3,))
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论