提交 8cbf087d authored 作者: nouiz's avatar nouiz

Merge pull request #620 from pascanur/better_inplace_scan3

Better inplace scan3
import logging
_logger = logging.getLogger('theano.sandbox.cuda.opt')
import copy
import sys
import warnings
import numpy
import theano
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano import scalar as scal
from theano import tensor, compile, gof
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler,
EquilibriumOptimizer)
InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType
......@@ -1431,7 +1433,7 @@ def gpuScanOptimization(node):
# merged or implement this optimization as a global
# optimization
thescan = host_input.owner.op
info = thescan.info.copy()
info = copy.deepcopy(thescan.info)
info['gpu'] = True
inputs = host_input.owner.inputs
nw_ins = [inputs[0]]
......@@ -1478,7 +1480,7 @@ def gpuScanOptimization(node):
for i in node.inputs]):
thescan = node.op
info = thescan.info.copy()
info = copy.deepcopy(thescan.info)
info['gpu'] = True
inputs = node.inputs
nw_ins = [inputs[0]]
......@@ -1512,11 +1514,10 @@ def gpuScanOptimization(node):
typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
broadcastable=broadcastable)
_outputs = scan_op.Scan(
scan_ins,
scan_outs,
info,
typeConstructor=typeConstructor).make_node(
*nw_ins).outputs
scan_ins,
scan_outs,
info,
typeConstructor=typeConstructor).make_node(*nw_ins).outputs
outputs = []
for x, y in zip(_outputs, node.outputs):
if isinstance(y.type, CudaNdarrayType):
......@@ -1527,41 +1528,9 @@ def gpuScanOptimization(node):
return False
@gof.local_optimizer([None])
def gpu_scan_make_inplace(node):
    """Local optimizer that rebuilds a GPU Scan node with ``inplace`` set.

    The rewrite only fires when ``node.op`` is a ``scan_op.Scan`` whose
    ``info`` has ``'gpu'`` set and ``'inplace'`` not yet set.  In that case
    a new Scan op is built from the same inner inputs/outputs with
    ``info['inplace'] = True`` and a ``CudaNdarrayType`` type constructor.

    Returns the outputs of the replacement node, or ``False`` when the node
    is left untouched.
    """
    scan = node.op
    if not isinstance(scan, scan_op.Scan):
        return False
    if scan.info['inplace'] or not scan.info['gpu']:
        return False

    # Work on a (shallow) copy so the original op's info is not modified.
    new_info = scan.info.copy()
    new_info['inplace'] = True

    # Inputs corresponding to n_steps and the sequences.
    head = node.inputs[:1 + scan.n_seqs]
    # Outer mit-mot, mit-sot and sit-sot inputs, in that order.
    # NOTE(review): the accessors receive ``node`` here, not ``node.inputs``
    # -- presumably both are accepted; confirm against scan_op.Scan.
    states = scan.outer_mitmot(node)
    states += scan.outer_mitsot(node)
    states += scan.outer_sitsot(node)
    # Shared, nit-sot and non-sequence inputs close the argument list.
    tail = scan.outer_shared(node)
    tail += scan.outer_nitsot(node)
    tail += scan.outer_non_seqs(node)

    # A variable that shows up a second time among the state inputs is
    # replaced by a deep copy of itself.
    for pos in xrange(len(states)):
        if states[pos] in states[:pos]:
            states[pos] = compile.function_module.deep_copy_op(states[pos])

    typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
        broadcastable=broadcastable)
    inplace_scan = scan_op.Scan(scan.inputs,
                                scan.outputs,
                                new_info,
                                typeConstructor=typeConstructor)
    return inplace_scan.make_node(*(head + states + tail)).outputs
optdb.register('gpu_scanOp_make_inplace',
theano.tensor.opt.in2out(
gpu_scan_make_inplace, ignore_newtrees=True),
scan_opt.ScanInplaceOptimizer(typeConstructor=CudaNdarrayType,
gpu_flag=True),
75,
'gpu',
'fast_run',
......
......@@ -949,7 +949,8 @@ def scan(fn,
info['truncate_gradient'] = truncate_gradient
info['name'] = name
info['mode'] = mode
info['inplace'] = False
info['inplace'] = -1
info['destroy_map'] = {}
info['gpu'] = False
info['as_while'] = as_while
info['profile'] = profile
......
......@@ -94,12 +94,6 @@ class Scan(PureOp):
if self.as_while:
self.output_types = self.output_types[:-1]
self.destroy_map = {}
if hasattr(self, 'inplace') and self.inplace:
for idx in xrange(self.n_mit_mot + self.n_mit_sot +
self.n_sit_sot):
self.destroy_map[idx] = [idx + 1 + self.n_seqs]
mode_instance = compile.mode.get_mode(self.mode)
# if the default mode is used, and that mode is ProfileMode
......@@ -411,12 +405,24 @@ class Scan(PureOp):
name = 'do_while'
else:
name = 'for'
if self.inplace:
aux_txt = '%s{inplace,%s,%s}' % (name, gpu_str, str(self.name))
aux_txt = '%s'
if getattr(self, 'destroy_map', None) is None:
self.destroy_map = {}
if len(self.destroy_map.keys()) > 0:
# Check if all outputs are inplace
if (sorted(self.destroy_map.keys()) == \
sorted(range(self.n_mit_mot +
self.n_mit_sot +
self.n_sit_sot))):
aux_txt += 'all_inplace,%s,%s}'
else:
aux_txt += '{inplace{'
for k in self.destroy_map.keys():
aux_txt += str(k) + ','
aux_txt += '},%s,%s}'
else:
aux_txt = '%s{%s,%s}' % (name, gpu_str, str(self.name))
aux_txt +='{%s,%s}'
aux_txt = aux_txt % (name, gpu_str, str(self.name))
return aux_txt
def __hash__(self):
......
......@@ -13,6 +13,7 @@ __copyright__ = "(c) 2010, Universite de Montreal"
__contact__ = "Razvan Pascanu <r.pascanu@gmail>"
import logging
import copy
import numpy
import theano
......@@ -20,6 +21,8 @@ from theano import tensor
from theano.tensor import opt, get_constant_value
from theano import gof
from theano.gof.python25 import maxsize
from theano.gof.opt import Optimizer
from theano.gof import toolbox, DestroyHandler, InconsistencyError
from theano.compile import optdb
from theano.compile.function_module import deep_copy_op
......@@ -117,7 +120,7 @@ def remove_constants_and_unused_inputs_scan(node):
if len(nw_inner) != len(op_ins):
op_outs = scan_utils.clone(op_outs, replace=givens)
nw_info = op.info.copy()
nw_info = copy.deepcopy(op.info)
nw_info['n_seqs'] = nw_n_seqs
# DEBUG CHECK
nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
......@@ -304,36 +307,67 @@ scan_seqopt.register('scanOp_pushout_nonseqs_ops',
'scan')
@gof.local_optimizer([None])
def scan_make_inplace(node):
    """Local optimizer that rebuilds a non-GPU Scan node with ``inplace`` set.

    The rewrite only fires when ``node.op`` is a ``scan_op.Scan`` whose
    ``info`` has neither ``'inplace'`` nor ``'gpu'`` set.  A new Scan op is
    then built from the same inner inputs/outputs with
    ``info['inplace'] = True``.

    Returns the outputs of the replacement node, or ``False`` when the node
    is left untouched.
    """
    scan = node.op
    if not isinstance(scan, scan_op.Scan):
        return False
    if scan.info['inplace'] or scan.info['gpu']:
        return False

    # Work on a (shallow) copy so the original op's info is not modified.
    new_info = scan.info.copy()
    new_info['inplace'] = True

    # Inputs corresponding to n_steps and the sequences.
    head = node.inputs[:1 + scan.n_seqs]
    # Outer mit-mot, mit-sot and sit-sot inputs, in that order.
    states = scan.outer_mitmot(node.inputs)
    states += scan.outer_mitsot(node.inputs)
    states += scan.outer_sitsot(node.inputs)
    # Shared, nit-sot and non-sequence inputs close the argument list.
    tail = scan.outer_shared(node.inputs)
    tail += scan.outer_nitsot(node.inputs)
    tail += scan.outer_non_seqs(node.inputs)

    # A variable that shows up a second time among the state inputs is
    # replaced by a deep copy of itself.
    for pos in xrange(len(states)):
        if states[pos] in states[:pos]:
            states[pos] = deep_copy_op(states[pos])

    inplace_scan = scan_op.Scan(scan.inputs,
                                scan.outputs,
                                new_info)
    return inplace_scan.make_node(*(head + states + tail)).outputs
class ScanInplaceOptimizer(Optimizer):
    """Graph optimizer for Scan (makes it run inplace).

    For every Scan node whose ``info['gpu']`` equals ``gpu_flag``, the
    optimizer tries, one output position at a time, to add an entry
    ``pos -> [pos + 1 + n_seqs]`` to the op's ``destroy_map``.  Each attempt
    is submitted through ``env.replace_all_validate``; an attempt that raises
    ``InconsistencyError`` is dropped and the next position is tried.

    :param typeConstructor: forwarded as ``typeConstructor`` to every
        ``scan_op.Scan`` built by this optimizer.
    :param gpu_flag: only Scan nodes whose ``info['gpu']`` equals this value
        are considered.
    """

    def __init__(self, typeConstructor=None, gpu_flag=False):
        Optimizer.__init__(self)
        self.typeConstructor = typeConstructor
        self.gpu_flag = gpu_flag

    def add_requirements(self, env):
        """Attach the ``ReplaceValidate`` and ``DestroyHandler`` features."""
        env.extend(toolbox.ReplaceValidate())
        env.extend(DestroyHandler())

    def apply(self, env):
        """Try to make each matching Scan node in ``env`` work inplace."""
        scan_nodes = [x for x in env.toposort()
                      if (isinstance(x.op, scan_op.Scan) and
                          x.op.info['gpu'] == self.gpu_flag)]
        for node in scan_nodes:
            op = node.op
            # Positions covered: mit-mot, mit-sot and sit-sot outputs.
            n_candidates = (op.info['n_mit_mot'] +
                            op.info['n_mit_sot'] +
                            op.info['n_sit_sot'])
            for pos in xrange(n_candidates):
                # Start from the info of the last accepted op, so entries
                # added by earlier successful positions are kept.
                info = copy.deepcopy(op.info)
                if 'destroy_map' not in info:
                    info['destroy_map'] = {}
                info['destroy_map'][pos] = [pos + 1 + op.info['n_seqs']]

                # Inputs corresponding to sequences and n_steps.
                ls_begin = node.inputs[:1 + op.n_seqs]
                ls = op.outer_mitmot(node.inputs)
                ls += op.outer_mitsot(node.inputs)
                ls += op.outer_sitsot(node.inputs)
                ls_end = op.outer_shared(node.inputs)
                ls_end += op.outer_nitsot(node.inputs)
                ls_end += op.outer_non_seqs(node.inputs)
                # A variable that appears a second time among the state
                # inputs is replaced by a deep copy of itself.
                for idx in xrange(len(ls)):
                    if ls[idx] in ls[:idx]:
                        ls[idx] = deep_copy_op(ls[idx])

                inputs = ls_begin + ls + ls_end
                new_op = scan_op.Scan(op.inputs,
                                      op.outputs,
                                      info,
                                      typeConstructor=self.typeConstructor)
                new_outs = new_op.make_node(*inputs).outputs
                try:
                    env.replace_all_validate(
                        zip(node.outputs, new_outs),
                        reason=self.__class__.__name__)
                except InconsistencyError:
                    # Failed moving this output to be computed inplace;
                    # keep the previous op/node and try the next position.
                    pass
                else:
                    # Replacement accepted: later positions build on it.
                    op = new_op
                    node = new_outs[0].owner
optdb.register('scanOp_make_inplace',
opt.in2out(scan_make_inplace, ignore_newtrees=True),
ScanInplaceOptimizer(typeConstructor=None,
gpu_flag=False),
75,
'fast_run',
'inplace',
......
......@@ -775,8 +775,11 @@ class T_Scan(unittest.TestCase):
updates=updates,
mode=mode,
allow_input_downcast=True)
# compute output in numpy
scan_node = [x for x in f9.maker.env.toposort()
if isinstance(x.op, theano.scan_module.scan_op.Scan)]
assert 0 in scan_node[0].op.destroy_map.keys()
assert 1 in scan_node[0].op.destroy_map.keys()
# compute output in numpy
numpy_x0 = numpy.zeros((3,))
numpy_x1 = numpy.zeros((3,))
numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu2[0]
......@@ -852,6 +855,10 @@ class T_Scan(unittest.TestCase):
mode=mode,
allow_input_downcast=True)
scan_node = [x for x in f9.maker.env.toposort()
if isinstance(x.op, theano.scan_module.scan_op.Scan)]
assert 0 in scan_node[0].op.destroy_map.keys()
assert 1 in scan_node[0].op.destroy_map.keys()
# compute output in numpy
numpy_x0 = numpy.zeros((3,))
numpy_x1 = numpy.zeros((3,))
......@@ -880,6 +887,34 @@ class T_Scan(unittest.TestCase):
#assert not numpy.allclose( theano_x0 , vu2[1:4])
#assert numpy.allclose( theano_x1 , vu1[0:3])
def test_inplace3(self):
    """Check which scan outputs end up in ``destroy_map``.

    A two-output scan over two shared scalars is built, then one variable
    in the graph of the first output is swapped for a constant.  After
    compiling with the 'inplace' optimizations included, output 1 must be
    listed in the Scan op's ``destroy_map`` while output 0 must not.
    """
    rng = numpy.random.RandomState(utt.fetch_seed())
    vx0 = asarrayX(rng.uniform())
    vx1 = asarrayX(rng.uniform())
    shared_x0 = theano.shared(vx0)
    shared_x1 = theano.shared(vx1)

    def step(x, y):
        return (x + asarrayX(1), y + asarrayX(1))

    outputs, updates = theano.scan(step,
                                   [],
                                   [shared_x0, shared_x1],
                                   n_steps=3)

    # Constant buffer of length 3 whose first entry is the initial x0 value.
    buf = asarrayX(numpy.zeros((3,)))
    buf[0] = vx0
    const_x0 = theano.tensor.constant(buf)
    # NOTE(review): presumably this is the outer initial-state buffer of the
    # first output, and a constant there is what blocks inplace -- confirm.
    to_replace = outputs[0].owner.inputs[0].owner.inputs[1]
    outputs = theano.clone(outputs,
                           replace={to_replace: const_x0})

    mode = theano.compile.mode.get_mode(None).including('inplace')
    f9 = theano.function([],
                         outputs,
                         updates=updates,
                         mode=mode)
    scan_node = [x for x in f9.maker.env.toposort()
                 if isinstance(x.op, theano.scan_module.scan_op.Scan)]
    destroyed = scan_node[0].op.destroy_map.keys()
    assert 0 not in destroyed
    assert 1 in destroyed
# Shared variable with updates
def test_shared_arguments_with_updates(self):
rng = numpy.random.RandomState(utt.fetch_seed())
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论