提交 8cbf087d authored 作者: nouiz's avatar nouiz

Merge pull request #620 from pascanur/better_inplace_scan3

Better inplace scan3
import logging import logging
_logger = logging.getLogger('theano.sandbox.cuda.opt') _logger = logging.getLogger('theano.sandbox.cuda.opt')
import copy
import sys import sys
import warnings import warnings
import numpy import numpy
import theano import theano
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano import scalar as scal from theano import scalar as scal
from theano import tensor, compile, gof from theano import tensor, compile, gof
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler, Optimizer, toolbox, DestroyHandler,
EquilibriumOptimizer) InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import * from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
...@@ -1431,7 +1433,7 @@ def gpuScanOptimization(node): ...@@ -1431,7 +1433,7 @@ def gpuScanOptimization(node):
# merged or implement this optimization as a global # merged or implement this optimization as a global
# optimization # optimization
thescan = host_input.owner.op thescan = host_input.owner.op
info = thescan.info.copy() info = copy.deepcopy(thescan.info)
info['gpu'] = True info['gpu'] = True
inputs = host_input.owner.inputs inputs = host_input.owner.inputs
nw_ins = [inputs[0]] nw_ins = [inputs[0]]
...@@ -1478,7 +1480,7 @@ def gpuScanOptimization(node): ...@@ -1478,7 +1480,7 @@ def gpuScanOptimization(node):
for i in node.inputs]): for i in node.inputs]):
thescan = node.op thescan = node.op
info = thescan.info.copy() info = copy.deepcopy(thescan.info)
info['gpu'] = True info['gpu'] = True
inputs = node.inputs inputs = node.inputs
nw_ins = [inputs[0]] nw_ins = [inputs[0]]
...@@ -1512,11 +1514,10 @@ def gpuScanOptimization(node): ...@@ -1512,11 +1514,10 @@ def gpuScanOptimization(node):
typeConstructor = lambda broadcastable, dtype: CudaNdarrayType( typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
broadcastable=broadcastable) broadcastable=broadcastable)
_outputs = scan_op.Scan( _outputs = scan_op.Scan(
scan_ins, scan_ins,
scan_outs, scan_outs,
info, info,
typeConstructor=typeConstructor).make_node( typeConstructor=typeConstructor).make_node(*nw_ins).outputs
*nw_ins).outputs
outputs = [] outputs = []
for x, y in zip(_outputs, node.outputs): for x, y in zip(_outputs, node.outputs):
if isinstance(y.type, CudaNdarrayType): if isinstance(y.type, CudaNdarrayType):
...@@ -1527,41 +1528,9 @@ def gpuScanOptimization(node): ...@@ -1527,41 +1528,9 @@ def gpuScanOptimization(node):
return False return False
@gof.local_optimizer([None])
def gpu_scan_make_inplace(node):
    """Rebuild a GPU Scan node so that it is flagged to run inplace.

    The rewrite only fires when ``node.op`` is a ``scan_op.Scan`` whose
    ``info`` has ``'gpu'`` set and ``'inplace'`` not yet set.

    :param node: the apply node being examined by the local optimizer.
    :return: the outputs of the replacement Scan node, or ``False`` when
        the node is not a candidate.
    """
    scan = node.op
    if not isinstance(scan, scan_op.Scan):
        return False
    if scan.info['inplace'] or not scan.info['gpu']:
        return False

    # Work on a copy so the info dict of the original op is left untouched.
    inplace_info = scan.info.copy()
    inplace_info['inplace'] = True

    # Leading inputs: n_steps followed by the sequences.
    head = node.inputs[:1 + scan.n_seqs]

    # Outer inputs of the mit_mot, mit_sot and sit_sot groups, in that order.
    recurrent = scan.outer_mitmot(node)
    recurrent += scan.outer_mitsot(node)
    recurrent += scan.outer_sitsot(node)

    # Remaining inputs: shared variables, nit_sot and non-sequences.
    tail = scan.outer_shared(node)
    tail += scan.outer_nitsot(node)
    tail += scan.outer_non_seqs(node)

    # A variable that already occurs earlier in the recurrent group is
    # wrapped in deep_copy_op (presumably so that two inplace outputs never
    # share one buffer -- confirm against the Scan implementation).
    for position, variable in enumerate(recurrent):
        if variable in recurrent[:position]:
            recurrent[position] = compile.function_module.deep_copy_op(
                variable)

    # The dtype argument is accepted but ignored by this constructor.
    typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
        broadcastable=broadcastable)

    inplace_scan = scan_op.Scan(scan.inputs,
                                scan.outputs,
                                inplace_info,
                                typeConstructor=typeConstructor)
    new_inputs = head + recurrent + tail
    return inplace_scan.make_node(*new_inputs).outputs
optdb.register('gpu_scanOp_make_inplace', optdb.register('gpu_scanOp_make_inplace',
theano.tensor.opt.in2out( scan_opt.ScanInplaceOptimizer(typeConstructor=CudaNdarrayType,
gpu_scan_make_inplace, ignore_newtrees=True), gpu_flag=True),
75, 75,
'gpu', 'gpu',
'fast_run', 'fast_run',
......
...@@ -949,7 +949,8 @@ def scan(fn, ...@@ -949,7 +949,8 @@ def scan(fn,
info['truncate_gradient'] = truncate_gradient info['truncate_gradient'] = truncate_gradient
info['name'] = name info['name'] = name
info['mode'] = mode info['mode'] = mode
info['inplace'] = False info['inplace'] = -1
info['destroy_map'] = {}
info['gpu'] = False info['gpu'] = False
info['as_while'] = as_while info['as_while'] = as_while
info['profile'] = profile info['profile'] = profile
......
...@@ -94,12 +94,6 @@ class Scan(PureOp): ...@@ -94,12 +94,6 @@ class Scan(PureOp):
if self.as_while: if self.as_while:
self.output_types = self.output_types[:-1] self.output_types = self.output_types[:-1]
self.destroy_map = {}
if hasattr(self, 'inplace') and self.inplace:
for idx in xrange(self.n_mit_mot + self.n_mit_sot +
self.n_sit_sot):
self.destroy_map[idx] = [idx + 1 + self.n_seqs]
mode_instance = compile.mode.get_mode(self.mode) mode_instance = compile.mode.get_mode(self.mode)
# if the default mode is used, and that mode is ProfileMode # if the default mode is used, and that mode is ProfileMode
...@@ -411,12 +405,24 @@ class Scan(PureOp): ...@@ -411,12 +405,24 @@ class Scan(PureOp):
name = 'do_while' name = 'do_while'
else: else:
name = 'for' name = 'for'
aux_txt = '%s'
if self.inplace: if getattr(self, 'destroy_map', None) is None:
aux_txt = '%s{inplace,%s,%s}' % (name, gpu_str, str(self.name)) self.destroy_map = {}
if len(self.destroy_map.keys()) > 0:
# Check if all outputs are inplace
if (sorted(self.destroy_map.keys()) == \
sorted(range(self.n_mit_mot +
self.n_mit_sot +
self.n_sit_sot))):
aux_txt += 'all_inplace,%s,%s}'
else:
aux_txt += '{inplace{'
for k in self.destroy_map.keys():
aux_txt += str(k) + ','
aux_txt += '},%s,%s}'
else: else:
aux_txt = '%s{%s,%s}' % (name, gpu_str, str(self.name)) aux_txt +='{%s,%s}'
aux_txt = aux_txt % (name, gpu_str, str(self.name))
return aux_txt return aux_txt
def __hash__(self): def __hash__(self):
......
...@@ -13,6 +13,7 @@ __copyright__ = "(c) 2010, Universite de Montreal" ...@@ -13,6 +13,7 @@ __copyright__ = "(c) 2010, Universite de Montreal"
__contact__ = "Razvan Pascanu <r.pascanu@gmail>" __contact__ = "Razvan Pascanu <r.pascanu@gmail>"
import logging import logging
import copy
import numpy import numpy
import theano import theano
...@@ -20,6 +21,8 @@ from theano import tensor ...@@ -20,6 +21,8 @@ from theano import tensor
from theano.tensor import opt, get_constant_value from theano.tensor import opt, get_constant_value
from theano import gof from theano import gof
from theano.gof.python25 import maxsize from theano.gof.python25 import maxsize
from theano.gof.opt import Optimizer
from theano.gof import toolbox, DestroyHandler, InconsistencyError
from theano.compile import optdb from theano.compile import optdb
from theano.compile.function_module import deep_copy_op from theano.compile.function_module import deep_copy_op
...@@ -117,7 +120,7 @@ def remove_constants_and_unused_inputs_scan(node): ...@@ -117,7 +120,7 @@ def remove_constants_and_unused_inputs_scan(node):
if len(nw_inner) != len(op_ins): if len(nw_inner) != len(op_ins):
op_outs = scan_utils.clone(op_outs, replace=givens) op_outs = scan_utils.clone(op_outs, replace=givens)
nw_info = op.info.copy() nw_info = copy.deepcopy(op.info)
nw_info['n_seqs'] = nw_n_seqs nw_info['n_seqs'] = nw_n_seqs
# DEBUG CHECK # DEBUG CHECK
nwScan = scan_op.Scan(nw_inner, op_outs, nw_info) nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
...@@ -304,36 +307,67 @@ scan_seqopt.register('scanOp_pushout_nonseqs_ops', ...@@ -304,36 +307,67 @@ scan_seqopt.register('scanOp_pushout_nonseqs_ops',
'scan') 'scan')
@gof.local_optimizer([None]) class ScanInplaceOptimizer(Optimizer):
def scan_make_inplace(node): """Graph optimizer for Scan(makes it run inplace)"""
op = node.op def __init__(self, typeConstructor=None, gpu_flag=False):
if (isinstance(op, scan_op.Scan) and Optimizer.__init__(self)
(not op.info['inplace']) and self.typeConstructor = typeConstructor
(not op.info['gpu'])): self.gpu_flag = gpu_flag
info = op.info.copy()
info['inplace'] = True def add_requirements(self, env):
# inputs corresponding to sequences and n_steps env.extend(toolbox.ReplaceValidate())
ls_begin = node.inputs[:1 + op.n_seqs] env.extend(DestroyHandler())
ls = op.outer_mitmot(node.inputs)
ls += op.outer_mitsot(node.inputs) def apply(self, env):
ls += op.outer_sitsot(node.inputs)
ls_end = op.outer_shared(node.inputs) nodes = env.toposort()
ls_end += op.outer_nitsot(node.inputs) scan_nodes = [x for x in nodes
ls_end += op.outer_non_seqs(node.inputs) if (isinstance(x.op, scan_op.Scan) and
n_outs = len(ls) x.op.info['gpu'] == self.gpu_flag)]
for idx in xrange(n_outs): for scan_idx in xrange(len(scan_nodes)):
if ls[idx] in ls[:idx]: node = scan_nodes[scan_idx]
ls[idx] = deep_copy_op(ls[idx]) op = node.op
n_outs = (op.info['n_mit_mot'] +
inputs = ls_begin + ls + ls_end op.info['n_mit_sot'] +
new_op = scan_op.Scan(op.inputs, op.info['n_sit_sot'])
op.outputs, for pos in xrange(n_outs):
info) info = copy.deepcopy(op.info)
return new_op.make_node(*inputs).outputs if not 'destroy_map' in info:
return False info['destroy_map'] = {}
info['destroy_map'][pos] = [pos + 1 + op.info['n_seqs']]
# inputs corresponding to sequences and n_steps
ls_begin = node.inputs[:1 + op.n_seqs]
ls = op.outer_mitmot(node.inputs)
ls += op.outer_mitsot(node.inputs)
ls += op.outer_sitsot(node.inputs)
ls_end = op.outer_shared(node.inputs)
ls_end += op.outer_nitsot(node.inputs)
ls_end += op.outer_non_seqs(node.inputs)
n_outs = len(ls)
for idx in xrange(n_outs):
if ls[idx] in ls[:idx]:
ls[idx] = deep_copy_op(ls[idx])
inputs = ls_begin + ls + ls_end
new_op = scan_op.Scan(op.inputs,
op.outputs,
info,
typeConstructor=self.typeConstructor)
new_outs = new_op.make_node(*inputs).outputs
try:
env.replace_all_validate(
zip(node.outputs, new_outs),
reason=self.__class__.__name__)
op = new_op
node = new_outs[0].owner
except InconsistencyError, e:
# Failed moving output to be computed inplace
pass
optdb.register('scanOp_make_inplace', optdb.register('scanOp_make_inplace',
opt.in2out(scan_make_inplace, ignore_newtrees=True), ScanInplaceOptimizer(typeConstructor=None,
gpu_flag=False),
75, 75,
'fast_run', 'fast_run',
'inplace', 'inplace',
......
...@@ -775,8 +775,11 @@ class T_Scan(unittest.TestCase): ...@@ -775,8 +775,11 @@ class T_Scan(unittest.TestCase):
updates=updates, updates=updates,
mode=mode, mode=mode,
allow_input_downcast=True) allow_input_downcast=True)
scan_node = [x for x in f9.maker.env.toposort()
# compute output in numpy if isinstance(x.op, theano.scan_module.scan_op.Scan)]
assert 0 in scan_node[0].op.destroy_map.keys()
assert 1 in scan_node[0].op.destroy_map.keys()
# compute output in numpy
numpy_x0 = numpy.zeros((3,)) numpy_x0 = numpy.zeros((3,))
numpy_x1 = numpy.zeros((3,)) numpy_x1 = numpy.zeros((3,))
numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu2[0] numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu2[0]
...@@ -852,6 +855,10 @@ class T_Scan(unittest.TestCase): ...@@ -852,6 +855,10 @@ class T_Scan(unittest.TestCase):
mode=mode, mode=mode,
allow_input_downcast=True) allow_input_downcast=True)
scan_node = [x for x in f9.maker.env.toposort()
if isinstance(x.op, theano.scan_module.scan_op.Scan)]
assert 0 in scan_node[0].op.destroy_map.keys()
assert 1 in scan_node[0].op.destroy_map.keys()
# compute output in numpy # compute output in numpy
numpy_x0 = numpy.zeros((3,)) numpy_x0 = numpy.zeros((3,))
numpy_x1 = numpy.zeros((3,)) numpy_x1 = numpy.zeros((3,))
...@@ -880,6 +887,34 @@ class T_Scan(unittest.TestCase): ...@@ -880,6 +887,34 @@ class T_Scan(unittest.TestCase):
#assert not numpy.allclose( theano_x0 , vu2[1:4]) #assert not numpy.allclose( theano_x0 , vu2[1:4])
#assert numpy.allclose( theano_x1 , vu1[0:3]) #assert numpy.allclose( theano_x1 , vu1[0:3])
def test_inplace3(self):
    """Check which Scan outputs end up in ``destroy_map`` after cloning.

    Two shared scalars are iterated for three steps.  One input of the
    first output's graph is then replaced by a constant, and the compiled
    function's Scan node is expected to have output 1, but not output 0,
    in its ``destroy_map``.
    """
    rng = numpy.random.RandomState(utt.fetch_seed())
    # Draw order matters for reproducibility: first value, then second.
    first_value = asarrayX(rng.uniform())
    second_value = asarrayX(rng.uniform())
    shared_first = theano.shared(first_value)
    shared_second = theano.shared(second_value)

    outputs, updates = theano.scan(
        lambda x, y: (x + asarrayX(1), y + asarrayX(1)),
        [],
        [shared_first, shared_second],
        n_steps=3)

    # Constant length-3 buffer whose first entry is the first initial value.
    buffer_value = asarrayX(numpy.zeros((3,)))
    buffer_value[0] = first_value
    constant_buffer = theano.tensor.constant(buffer_value)

    # Swap one input of the first output's graph for that constant.
    to_replace = outputs[0].owner.inputs[0].owner.inputs[1]
    outputs = theano.clone(outputs, replace={to_replace: constant_buffer})

    mode = theano.compile.mode.get_mode(None).including('inplace')
    f9 = theano.function([], outputs, updates=updates, mode=mode)

    scan_cls = theano.scan_module.scan_op.Scan
    scan_node = [apply_node for apply_node in f9.maker.env.toposort()
                 if isinstance(apply_node.op, scan_cls)]
    destroyed = scan_node[0].op.destroy_map.keys()
    assert 0 not in destroyed
    assert 1 in destroyed
# Shared variable with updates # Shared variable with updates
def test_shared_arguments_with_updates(self): def test_shared_arguments_with_updates(self):
rng = numpy.random.RandomState(utt.fetch_seed()) rng = numpy.random.RandomState(utt.fetch_seed())
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论