提交 65f54e68 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2479 from carriepl/prevent_output_from_inplace

Feature for no outputs from inplace.
...@@ -159,6 +159,18 @@ class AddDestroyHandler(gof.Optimizer): ...@@ -159,6 +159,18 @@ class AddDestroyHandler(gof.Optimizer):
fgraph.attach_feature(gof.DestroyHandler()) fgraph.attach_feature(gof.DestroyHandler())
class AddNoOutputFromInplace(gof.Optimizer):
    """Optimizer that attaches a `gof.NoOutputFromInplace` feature to a fgraph.

    The attached feature prevents the outputs of the fgraph from being
    created by inplace operations on intermediary variables. This is useful
    when the outputs of the fgraph are preallocated, because it avoids
    useless copying of the data. Currently, scan preallocates its outputs.

    """

    def add_requirements(self, fgraph):
        """Register the base requirements, then attach the feature."""
        super(AddNoOutputFromInplace, self).add_requirements(fgraph)
        no_inplace_outputs = gof.NoOutputFromInplace()
        fgraph.attach_feature(no_inplace_outputs)
class PrintCurrentFunctionGraph(gof.Optimizer): class PrintCurrentFunctionGraph(gof.Optimizer):
"""This optimizer is for debugging. """This optimizer is for debugging.
...@@ -211,6 +223,9 @@ optdb.register('specialize_device', gof.EquilibriumDB(), ...@@ -211,6 +223,9 @@ optdb.register('specialize_device', gof.EquilibriumDB(),
optdb.register('merge2', gof.MergeOptimizer(), optdb.register('merge2', gof.MergeOptimizer(),
49, 'fast_run', 'merge') 49, 'fast_run', 'merge')
# Position 49.4 places this optimizer just ahead of 'add_destroy_handler',
# which is registered at 49.5.
# NOTE(review): unlike the neighbouring registrations, no tags are passed
# here; presumably this keeps the optimizer out of the default modes so that
# it only runs when requested by name -- confirm against optdb semantics.
optdb.register('add_no_output_from_inplace', AddNoOutputFromInplace(),
49.4)
optdb.register('add_destroy_handler', AddDestroyHandler(), optdb.register('add_destroy_handler', AddDestroyHandler(),
49.5, 'fast_run', 'inplace') 49.5, 'fast_run', 'inplace')
......
import theano
from theano.compile.mode import Mode
import theano.tensor as T
def test_no_output_from_implace():
    """Check the effect of 'add_no_output_from_inplace' on the output op."""

    def destroys_first_output(fct):
        # True when the op producing the first output of the compiled
        # function lists output 0 in its destroy_map.
        producer = fct.maker.fgraph.outputs[0].owner.op
        return hasattr(producer, 'destroy_map') and 0 in producer.destroy_map

    x = T.matrix()
    y = T.matrix()
    b = T.tanh(T.dot(x, y))

    # With a mode that does not include the optimization, the elemwise op
    # that produces the output must be inplace.
    fct_no_opt = theano.function([x, y], b, mode="FAST_RUN")
    assert destroys_first_output(fct_no_opt)

    # With a mode that includes the optimization, the elemwise op that
    # produces the output must not be inplace.
    mode_opt = Mode(linker="cvm", optimizer="fast_run").including(
        "add_no_output_from_inplace")
    fct_opt = theano.function([x, y], b, mode=mode_opt)
    assert not destroys_first_output(fct_opt)
...@@ -74,7 +74,7 @@ from theano.gof.optdb import \ ...@@ -74,7 +74,7 @@ from theano.gof.optdb import \
from theano.gof.toolbox import \ from theano.gof.toolbox import \
Feature, \ Feature, \
Bookkeeper, History, Validator, ReplaceValidate, NodeFinder,\ Bookkeeper, History, Validator, ReplaceValidate, NodeFinder,\
PrintListener, ReplacementDidntRemovedError PrintListener, ReplacementDidntRemovedError, NoOutputFromInplace
from theano.gof.type import \ from theano.gof.type import \
Type, Generic, generic Type, Generic, generic
......
import sys import sys
import time import time
import theano
from theano import config from theano import config
from theano.gof.python25 import partial from theano.gof.python25 import partial
from theano.gof.python25 import OrderedDict from theano.gof.python25 import OrderedDict
...@@ -394,3 +395,26 @@ class PreserveNames(Feature): ...@@ -394,3 +395,26 @@ class PreserveNames(Feature):
new_r.name = r.name new_r.name = r.name
class NoOutputFromInplace(Feature):
    """Feature that rejects graphs whose outputs are produced inplace.

    During validation, every output of the fgraph that has an owner is
    checked: if the op that produces it lists that output's index in its
    `destroy_map`, the graph is reported as inconsistent.

    """

    def validate(self, fgraph):
        """Raise if an output of `fgraph` is computed by an inplace op.

        :param fgraph: the function graph to validate.

        :return: True when `fgraph` has no `destroyers` attribute (the
            check is skipped); None when every output passes the check.

        :raises theano.gof.InconsistencyError: if the op producing an
            output has that output's index in its `destroy_map`.

        """
        # NOTE(review): `destroyers` is presumably provided by the
        # DestroyHandler feature; without it the check is skipped -- confirm.
        if not hasattr(fgraph, 'destroyers'):
            return True

        for out in list(fgraph.outputs):
            # Outputs without an owner are not produced by any op.
            if out.owner is None:
                continue

            # Validate that the node that produces the output does not
            # produce it by modifying something else inplace.
            node = out.owner
            op = node.op
            out_idx = node.outputs.index(out)
            if hasattr(op, 'destroy_map') and out_idx in op.destroy_map:
                # Build ONE message string: passing the fragments as
                # separate positional arguments makes the exception carry
                # (and print) a tuple instead of a readable sentence.
                raise theano.gof.InconsistencyError(
                    "A function graph Feature has requested (probably for "
                    "efficiency reasons for scan) that outputs of the graph "
                    "be prevented from being the result of inplace "
                    "operations. This has prevented output %s from "
                    "being computed by modifying another variable "
                    "inplace." % (out,))
...@@ -90,7 +90,7 @@ def test_consistency_randomstreams(): ...@@ -90,7 +90,7 @@ def test_consistency_randomstreams():
for use_cuda in test_use_cuda: for use_cuda in test_use_cuda:
#print 'use_cuda =', use_cuda #print 'use_cuda =', use_cuda
samples = [] samples = []
rng = MRG_RandomStreams(seed=seed, use_cuda=False) rng = MRG_RandomStreams(seed=seed, use_cuda=use_cuda)
for i in range(n_streams): for i in range(n_streams):
stream_samples = [] stream_samples = []
u = rng.uniform(size=(n_substreams,), nstreams=n_substreams) u = rng.uniform(size=(n_substreams,), nstreams=n_substreams)
......
diff --git a/theano/scan_module/scan_perform.c b/theano/scan_module/scan_perform.c @@ -5597,7 +5597,7 @@
index aaebb43..2d06b29 100644
--- a/theano/scan_module/scan_perform.c
+++ b/theano/scan_module/scan_perform.c
@@ -5595,7 +5595,7 @@ static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, P
* cdef list stack * cdef list stack
* cdef int offset * cdef int offset
*/ */
...@@ -11,29 +7,29 @@ index aaebb43..2d06b29 100644 ...@@ -11,29 +7,29 @@ index aaebb43..2d06b29 100644
__Pyx_INCREF(__pyx_t_4); __Pyx_INCREF(__pyx_t_4);
__pyx_v_descr = ((PyArray_Descr *)__pyx_t_4); __pyx_v_descr = ((PyArray_Descr *)__pyx_t_4);
__pyx_t_4 = 0; __pyx_t_4 = 0;
@@ -7147,7 +7147,7 @@ static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_a @@ -7126,7 +7126,7 @@
* arr.base = baseptr * arr.base = baseptr
* *
*/ */
- Py_XDECREF(__pyx_v_arr->base); - Py_XDECREF(__pyx_v_arr->base);
+ Py_XDECREF(PyArray_BASE(__pyx_v_arr)); + Py_XDECREF(PyArray_BASE(__pyx_v_arr));
/* "/home/anakha/.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":974 /* "numpy.pxd":973
* baseptr = <PyObject*>base * baseptr = <PyObject*>base
@@ -7156,7 +7156,11 @@ static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_a @@ -7135,7 +7135,11 @@
* *
* cdef inline object get_array_base(ndarray arr): * cdef inline object get_array_base(ndarray arr):
*/ */
- __pyx_v_arr->base = __pyx_v_baseptr; - __pyx_v_arr->base = __pyx_v_baseptr;
+#if NPY_API_VERSION < 0x00000007 + #if NPY_API_VERSION < 0x00000007
+ PyArray_BASE(__pyx_v_arr) = __pyx_v_baseptr; + PyArray_BASE(__pyx_v_arr) = __pyx_v_baseptr;
+#else + #else
+ PyArray_SetBaseObject(__pyx_v_arr, __pyx_v_baseptr); + PyArray_SetBaseObject(__pyx_v_arr, __pyx_v_baseptr);
+#endif + #endif
/* "/home/anakha/.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":966 __Pyx_RefNannyFinishContext();
* }
@@ -7191,7 +7195,7 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__py @@ -7161,7 +7165,7 @@
* return None * return None
* else: * else:
*/ */
...@@ -41,8 +37,8 @@ index aaebb43..2d06b29 100644 ...@@ -41,8 +37,8 @@ index aaebb43..2d06b29 100644
+ __pyx_t_1 = ((PyArray_BASE(__pyx_v_arr) == NULL) != 0); + __pyx_t_1 = ((PyArray_BASE(__pyx_v_arr) == NULL) != 0);
if (__pyx_t_1) { if (__pyx_t_1) {
/* "/home/anakha/.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":978 /* "numpy.pxd":977
@@ -7214,8 +7218,8 @@ static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__py @@ -7185,8 +7189,8 @@
* return <object>arr.base # <<<<<<<<<<<<<< * return <object>arr.base # <<<<<<<<<<<<<<
*/ */
__Pyx_XDECREF(__pyx_r); __Pyx_XDECREF(__pyx_r);
...@@ -52,4 +48,4 @@ index aaebb43..2d06b29 100644 ...@@ -52,4 +48,4 @@ index aaebb43..2d06b29 100644
+ __pyx_r = ((PyObject *)PyArray_BASE(__pyx_v_arr)); + __pyx_r = ((PyObject *)PyArray_BASE(__pyx_v_arr));
goto __pyx_L0; goto __pyx_L0;
} }
__pyx_L3:;
...@@ -535,7 +535,7 @@ class Scan(PureOp): ...@@ -535,7 +535,7 @@ class Scan(PureOp):
self.n_sit_sot + self.n_sit_sot +
self.n_nit_sot) self.n_nit_sot)
wrapped_inputs = [Param(x, borrow=True) for x in self.inputs] wrapped_inputs = [Param(x, borrow=True) for x in self.inputs]
wrapped_outputs = [Out(x, borrow=False) for x in wrapped_outputs = [Out(x, borrow=(x not in self.inputs)) for x in
self.outputs[:slices]] self.outputs[:slices]]
wrapped_outputs += self.outputs[slices:] wrapped_outputs += self.outputs[slices:]
profile = None profile = None
...@@ -927,11 +927,14 @@ class Scan(PureOp): ...@@ -927,11 +927,14 @@ class Scan(PureOp):
offset += 1 offset += 1
# 4. collecting slices where the output should be stored # 4. collecting slices where the output should be stored
# 4.1. Collect slices for mitmots
for idx in xrange(self.n_mit_mot_outs): for idx in xrange(self.n_mit_mot_outs):
output_storage[idx].storage[0] = None output_storage[idx].storage[0] = None
# 4.2. Collect slices for mitsots, sitsots and nitsots
offset = self.n_mit_mot_outs offset = self.n_mit_mot_outs
if i != 0 and self.n_nit_sot > 0: if i != 0:
for idx in xrange(self.n_outs + self.n_nit_sot - for idx in xrange(self.n_outs + self.n_nit_sot -
self.n_mit_mot): self.n_mit_mot):
if (store_steps[idx + self.n_mit_mot] == 1 or if (store_steps[idx + self.n_mit_mot] == 1 or
...@@ -946,15 +949,24 @@ class Scan(PureOp): ...@@ -946,15 +949,24 @@ class Scan(PureOp):
self.n_mit_mot): self.n_mit_mot):
output_storage[idx + offset].storage[0] = None output_storage[idx + offset].storage[0] = None
# 4.3. Collect slices for shared outputs
offset += self.n_outs + self.n_nit_sot - self.n_mit_mot offset += self.n_outs + self.n_nit_sot - self.n_mit_mot
for idx in xrange(self.n_shared_outs): for idx in xrange(self.n_shared_outs):
output_storage[idx + offset].storage[0] = None output_storage[idx + offset].storage[0] = None
# If condition add it to the mix
# 4.4. If there is a condition add it to the mix
if self.as_while: if self.as_while:
pdx = offset + self.n_shared_outs pdx = offset + self.n_shared_outs
output_storage[pdx].storage[0] = None output_storage[pdx].storage[0] = None
# 4.5. Keep a reference to the variables currently in the
# output_storage to be able to compare them with the actual
# outputs of the inner function after its execution
old_output_storage = [o.storage[0] for o in output_storage]
# 5. compute outputs # 5. compute outputs
t0_fn = time.time() t0_fn = time.time()
try: try:
fn() fn()
except Exception: except Exception:
...@@ -974,11 +986,18 @@ class Scan(PureOp): ...@@ -974,11 +986,18 @@ class Scan(PureOp):
else: else:
# old-style linkers raise their own exceptions # old-style linkers raise their own exceptions
raise raise
dt_fn = time.time() - t0_fn dt_fn = time.time() - t0_fn
if self.as_while: if self.as_while:
pdx = offset + self.n_shared_outs pdx = offset + self.n_shared_outs
cond = output_storage[pdx].storage[0] == 0 cond = output_storage[pdx].storage[0] == 0
# Check which of the pre-allocated outputs (if applicable) have
# been reused by the inner function
output_reused = [old_output_storage[o] is
output_storage[o].storage[0]
for o in range(len(output_storage))]
t_fn += dt_fn t_fn += dt_fn
offset_out = 0 offset_out = 0
# 5.1 Copy over the values for mit_mot outputs # 5.1 Copy over the values for mit_mot outputs
...@@ -995,8 +1014,7 @@ class Scan(PureOp): ...@@ -995,8 +1014,7 @@ class Scan(PureOp):
for j in xrange(begin, end): for j in xrange(begin, end):
if (store_steps[j] == 1 or self.vector_outs[j] or if (store_steps[j] == 1 or self.vector_outs[j] or
outs[j][0][pos[j]] is not not output_reused[offset_out + j]):
output_storage[offset_out + j].storage[0]):
outs[j][0][pos[j]] = \ outs[j][0][pos[j]] = \
output_storage[offset_out + j].storage[0] output_storage[offset_out + j].storage[0]
...@@ -1020,8 +1038,7 @@ class Scan(PureOp): ...@@ -1020,8 +1038,7 @@ class Scan(PureOp):
outs[j][0] = outs[j][0][:store_steps[j]] outs[j][0] = outs[j][0][:store_steps[j]]
outs[j][0][pos[j]] = output_storage[jout].storage[0] outs[j][0][pos[j]] = output_storage[jout].storage[0]
elif (store_steps[j] == 1 or self.vector_outs[j] or elif (store_steps[j] == 1 or self.vector_outs[j] or
outs[j][0][pos[j]] is not not output_reused[offset_out + j]):
output_storage[j + offset_out].storage[0]):
outs[j][0][pos[j]] = \ outs[j][0][pos[j]] = \
output_storage[j + offset_out].storage[0] output_storage[j + offset_out].storage[0]
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -62,7 +62,7 @@ import copy ...@@ -62,7 +62,7 @@ import copy
def get_version(): def get_version():
return 0.284 return 0.285
@cython.boundscheck(False) @cython.boundscheck(False)
def perform( def perform(
...@@ -191,6 +191,10 @@ def perform( ...@@ -191,6 +191,10 @@ def perform(
cdef unsigned int begin cdef unsigned int begin
cdef unsigned int end cdef unsigned int end
cdef int cond cdef int cond
cdef unsigned int len_output_storage = (n_mit_mot_outs + n_mit_sot +
n_sit_sot + n_nit_sot +
n_shared_outs)
cdef int output_reused[500] # max 500 outputs
if n_steps < 0: if n_steps < 0:
...@@ -304,11 +308,14 @@ def perform( ...@@ -304,11 +308,14 @@ def perform(
offset += 1 offset += 1
# 4. collecting slices where the output should be stored # 4. collecting slices where the output should be stored
# 4.1. Collect slices for mitmots
for idx in range(n_mit_mot_outs): for idx in range(n_mit_mot_outs):
output_storage[idx].storage[0] = None output_storage[idx].storage[0] = None
# 4.2. Collect slices for mitsots, sitsots and nitsots
offset = n_mit_mot_outs offset = n_mit_mot_outs
if i !=0 and n_nit_sot >0: if i != 0:
for idx in range(n_outs + n_nit_sot - n_mit_mot): for idx in range(n_outs + n_nit_sot - n_mit_mot):
if ( store_steps[<unsigned int>(idx+n_mit_mot)] == 1 or if ( store_steps[<unsigned int>(idx+n_mit_mot)] == 1 or
vector_outs[<unsigned int>(idx+n_mit_mot)] == 1): vector_outs[<unsigned int>(idx+n_mit_mot)] == 1):
...@@ -321,12 +328,21 @@ def perform( ...@@ -321,12 +328,21 @@ def perform(
for idx in range(n_outs + n_nit_sot - n_mit_mot): for idx in range(n_outs + n_nit_sot - n_mit_mot):
output_storage[<unsigned int>(idx+offset)].storage[0] = None output_storage[<unsigned int>(idx+offset)].storage[0] = None
# 4.3. Collect slices for shared outputs
offset += n_outs+n_nit_sot - n_mit_mot offset += n_outs+n_nit_sot - n_mit_mot
for idx in range(n_shared_outs): for idx in range(n_shared_outs):
output_storage[<unsigned int>(idx+offset)].storage[0] = None output_storage[<unsigned int>(idx+offset)].storage[0] = None
# 4.4. If there is a condition add it to the mix
if as_while: if as_while:
pdx = offset + n_shared_outs pdx = offset + n_shared_outs
output_storage[<unsigned int>pdx].storage[0] = None output_storage[<unsigned int>pdx].storage[0] = None
# 4.5. Keep a reference to the variables currently in the
# output_storage to be able to compare them with the actual
# outputs of the inner function after its execution
old_output_storage = [o.storage[0] for o in output_storage]
# 5. compute outputs # 5. compute outputs
t0_fn = time.time() t0_fn = time.time()
...@@ -348,6 +364,11 @@ def perform( ...@@ -348,6 +364,11 @@ def perform(
pdx = offset + n_shared_outs pdx = offset + n_shared_outs
cond = output_storage[pdx].storage[0] == 0 cond = output_storage[pdx].storage[0] == 0
# Check which of the pre-allocated outputs (if applicable) have
# been reused by the inner function
for j in range(len_output_storage):
output_reused[j] = (old_output_storage[j] is
output_storage[j].storage[0])
offset_out = 0 offset_out = 0
# 5.1 Copy over the values for mit_mot outputs # 5.1 Copy over the values for mit_mot outputs
...@@ -363,8 +384,8 @@ def perform( ...@@ -363,8 +384,8 @@ def perform(
offset_out -= n_mit_mot offset_out -= n_mit_mot
for j in range(begin, end): for j in range(begin, end):
if ( store_steps[j] == 1 or vector_outs[j] ==1 or if (store_steps[j] == 1 or vector_outs[j] == 1 or
outs[j][0][pos[j]] is not output_storage[<unsigned int>(offset_out+j)].storage[0]): not output_reused[<unsigned int>(offset_out+j)]):
outs[j][0][pos[j]] = output_storage[<unsigned int>(offset_out+j)].storage[0] outs[j][0][pos[j]] = output_storage[<unsigned int>(offset_out+j)].storage[0]
...@@ -387,7 +408,7 @@ def perform( ...@@ -387,7 +408,7 @@ def perform(
outs[j][0] = outs[j][0][:store_steps[j]] outs[j][0] = outs[j][0][:store_steps[j]]
outs[j][0][pos[j]] = output_storage[jout].storage[0] outs[j][0][pos[j]] = output_storage[jout].storage[0]
elif (store_steps[j] == 1 or vector_outs[j] == 1 or elif (store_steps[j] == 1 or vector_outs[j] == 1 or
outs[j][0][pos[j]] is not output_storage[j+offset_out].storage[0]): not output_reused[<unsigned int>(offset_out+j)]):
outs[j][0][pos[j]] = output_storage[j+offset_out].storage[0] outs[j][0][pos[j]] = output_storage[j+offset_out].storage[0]
......
...@@ -16,7 +16,7 @@ from theano.gof import cmodule ...@@ -16,7 +16,7 @@ from theano.gof import cmodule
_logger = logging.getLogger('theano.scan_module.scan_perform') _logger = logging.getLogger('theano.scan_module.scan_perform')
version = 0.284 # must match constant returned in function get_version() version = 0.285 # must match constant returned in function get_version()
need_reload = False need_reload = False
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论