提交 d99cb9df authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2702 from royxue/reduce_temp

Reduce the number of ndarray allocations by reusing existing allocations (#1943)
......@@ -1101,8 +1101,8 @@ class ProfileStats(object):
print >> file, "(For values in brackets, it's for linker = c|py"
print >> file, "---"
# print >> file, " Max if no gc, inplace and view: %dKB" % int(
# round(max_sum_size / 1024))
# print >> file, " Max if no gc, inplace and view: %dKB" % int(
# round(max_sum_size / 1024))
print >> file, " Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round(
new_max_node_memory_size[0] / 1024.)), int(round(
max_node_memory_size[0] / 1024.)))
......
......@@ -2,6 +2,8 @@
Test of memory profiling
"""
import unittest
import StringIO
import numpy
......@@ -11,94 +13,98 @@ import theano.tensor as T
from theano.ifelse import ifelse
def test_profiling():
class Test_profiling(unittest.TestCase):
"""
Test of Theano profiling with min_peak_memory=True
"""
def test_profiling(self):
config1 = theano.config.profile
config2 = theano.config.profile_memory
config3 = theano.config.profiling.min_peak_memory
try:
theano.config.profile = True
theano.config.profile_memory = True
theano.config.profiling.min_peak_memory = True
config1 = theano.config.profile
config2 = theano.config.profile_memory
config3 = theano.config.profiling.min_peak_memory
try:
theano.config.profile = True
theano.config.profile_memory = True
theano.config.profiling.min_peak_memory = True
x = [T.fvector("val%i" % i) for i in range(3)]
x = [T.fvector("val%i" % i) for i in range(3)]
z = []
z += [T.outer(x[i], x[i + 1]).sum(axis=1) for i in range(len(x) - 1)]
z += [x[i] + x[i + 1] for i in range(len(x) - 1)]
z = []
z += [T.outer(x[i], x[i + 1]).sum(axis=1) for i in range(len(x) - 1)]
z += [x[i] + x[i + 1] for i in range(len(x) - 1)]
p = theano.ProfileStats(False)
p = theano.ProfileStats(False)
if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
m = "FAST_RUN"
else:
m = None
if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
m = "FAST_RUN"
else:
m = None
f = theano.function(x, z, profile=p, name="test_profiling",
mode=m)
f = theano.function(x, z, profile=p, name="test_profiling",
mode=m)
inp = [numpy.arange(1024, dtype='float32') + 1 for i in range(len(x))]
output = f(*inp)
inp = [numpy.arange(1024, dtype='float32') + 1 for i in range(len(x))]
output = f(*inp)
buf = StringIO.StringIO()
f.profile.summary(buf)
buf = StringIO.StringIO()
f.profile.summary(buf)
# regression testing for future algo speed up
the_string = buf.getvalue()
lines1 = [l for l in the_string.split("\n") if "Max if linker" in l]
lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l]
if theano.config.device == 'cpu':
assert "Max if linker=cvm(default): 4112KB (8204KB)" in the_string, (
lines1, lines2)
assert "Minimum peak from all valid apply node order is 4104KB" in the_string, (
lines1, lines2)
else:
assert "Max if linker=cvm(default): 8220KB (8220KB)" in the_string, (
lines1, lines2)
assert "Minimum peak from all valid apply node order is 4116KB" in the_string, (
lines1, lines2)
# regression testing for future algo speed up
the_string = buf.getvalue()
lines1 = [l for l in the_string.split("\n") if "Max if linker" in l]
lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l]
if theano.config.device == 'cpu':
assert "Max if linker=cvm(default): 4112KB (8204KB)" in the_string, (
lines1, lines2)
assert "Minimum peak from all valid apply node order is 4104KB" in the_string, (
lines1, lines2)
else:
assert "Max if linker=cvm(default): 8220KB (8220KB)" in the_string, (
lines1, lines2)
assert "Minimum peak from all valid apply node order is 4116KB" in the_string, (
lines1, lines2)
finally:
theano.config.profile = config1
theano.config.profile_memory = config2
theano.config.profiling.min_peak_memory = config3
finally:
theano.config.profile = config1
theano.config.profile_memory = config2
theano.config.profiling.min_peak_memory = config3
def test_ifelse():
config1 = theano.config.profile
config2 = theano.config.profile_memory
def test_ifelse(self):
config1 = theano.config.profile
config2 = theano.config.profile_memory
try:
theano.config.profile = True
theano.config.profile_memory = True
try:
theano.config.profile = True
theano.config.profile_memory = True
a, b = T.scalars('a', 'b')
x, y = T.scalars('x', 'y')
a, b = T.scalars('a', 'b')
x, y = T.scalars('x', 'y')
z = ifelse(T.lt(a, b), x * 2, y * 2)
z = ifelse(T.lt(a, b), x * 2, y * 2)
p = theano.ProfileStats(False)
p = theano.ProfileStats(False)
if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
m = "FAST_RUN"
else:
m = None
if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
m = "FAST_RUN"
else:
m = None
f_ifelse = theano.function([a, b, x, y], z, profile=p, name="test_ifelse",
mode=m)
f_ifelse = theano.function([a, b, x, y], z, profile=p, name="test_ifelse",
mode=m)
val1 = 0.
val2 = 1.
big_mat1 = 10
big_mat2 = 11
val1 = 0.
val2 = 1.
big_mat1 = 10
big_mat2 = 11
out = f_ifelse(val1, val2, big_mat1, big_mat2)
out = f_ifelse(val1, val2, big_mat1, big_mat2)
finally:
theano.config.profile = config1
theano.config.profile_memory = config2
finally:
theano.config.profile = config1
theano.config.profile_memory = config2
if __name__ == '__main__':
test_profiling()
test_ifelse()
unittest.main()
......@@ -338,3 +338,34 @@ def test_vm_gc():
f = theano.function([x], [pp + pp],
mode=mode)
f([1, 2, 3])
def test_reallocation():
    """Check that the VM reuses storage cells for intermediate scalars.

    Compiles a small scalar graph under the vm / vm_nogc linkers
    (alternating, so reuse info survives toggling gc) and verifies that
    at least two distinct variables in the function's storage_map share
    the same underlying storage.
    """
    x = tensor.scalar('x')
    y = tensor.scalar('y')
    z = tensor.tanh(3 * x + y) + tensor.cosh(x + 5 * y)
    # Exclude fusion/inplace so intermediate results stay as separate
    # nodes whose storage can be reused.
    for linker in ['vm_nogc', 'vm', 'vm_nogc', 'vm']:
        mode = theano.compile.get_mode(theano.Mode(linker=linker))
        mode = mode.excluding('fusion', 'inplace')

        f = theano.function([x, y], z, name="test_reduce_memory",
                            mode=mode)
        output = f(1, 2)
        assert output
        storage_map = f.fn.storage_map

        def check_storage(storage_map):
            """Return [True, shared_value] if two distinct non-constant
            variables share a storage cell, else [False, None]."""
            from theano.tensor.var import TensorConstant
            for var in storage_map.keys():
                if isinstance(var, TensorConstant):
                    # Constants keep their own storage; skip them.
                    continue
                for other in storage_map.keys():
                    if other is var:
                        continue
                    if (storage_map[var][0] and
                            storage_map[var][0] is storage_map[other][0]):
                        return [True, storage_map[other][0]]
            return [False, None]

        assert check_storage(storage_map)[0]
        # At least one storage cell is shared, so there are fewer
        # distinct cells than variables.
        assert len(set([id(v) for v in
                        storage_map.values()])) < len(storage_map)
......@@ -17,6 +17,8 @@ from theano.configparser import (config, AddConfigVar,
import theano.gof.cmodule
from theano.compat.python2x import defaultdict
logger = logging.getLogger(__name__)
AddConfigVar('profile',
......@@ -53,6 +55,87 @@ AddConfigVar('vm.lazy',
in_c_key=False)
def calculate_reallocate_info(order, fgraph, storage_map, compute_map_re, dependencies):
    """Compute which storage cells can be reused between variables.

    Walks the apply nodes in execution ``order`` and, for each
    already-consumed 0-d (scalar) intermediate input, looks for a later
    output of the same type whose storage cell can alias it.

    Parameters
    ----------
    order : list of Apply nodes in execution order.
    fgraph : the FunctionGraph being compiled.
    storage_map : dict variable -> one-element list holding its value.
    compute_map_re : dict variable -> one-element list used as a
        "computed" flag (set to 1 here as nodes are traversed).
    dependencies : dict variable -> variables that must be computed
        before its storage may be reclaimed.

    Returns
    -------
    dict
        Maps each reusable input variable ``ins`` to ``[ins, reuse_out]``
        where ``reuse_out`` is the later output that will share its
        storage cell.
    """
    reallocated_info = {}
    viewed_by = {}  # variable -> list of variables that are views of it
    for var in fgraph.variables:
        viewed_by[var] = []
    view_of = {}  # view variable -> the original variable it aliases
    pre_allocated = set([])  # outputs already claimed as reuse targets
    allocated = set([])  # inputs whose storage has been handed out

    for idx in range(len(order)):
        node = order[idx]
        dmap = getattr(node.op, 'destroy_map', None)
        vmap = getattr(node.op, 'view_map', None)

        idx_o = 0
        for out in node.outputs:
            # Mark every output of this node as computed.
            for var in node.outputs:
                compute_map_re[var][0] = 1
            ins = None
            # Track aliasing introduced by destructive / view ops so we
            # never reuse storage that is still visible through a view.
            if dmap and idx_o in dmap:
                idx_v = dmap[idx_o]
                assert len(
                    idx_v) == 1, "Here we only support the possibility to destroy one input"
                ins = node.inputs[idx_v[0]]
            if vmap and idx_o in vmap:
                assert ins is None
                idx_v = vmap[idx_o]
                assert len(
                    idx_v) == 1, "Here we only support the possibility to view one input"
                ins = node.inputs[idx_v[0]]
            if ins is not None:
                assert isinstance(ins, theano.Variable)
                # Chase the alias chain back to the original variable.
                origin = view_of.get(ins, ins)
                view_of[out] = origin
                viewed_by[origin].append(out)
            idx_o += 1

        for ins in node.inputs:
            assert not (ins in view_of and viewed_by[ins])
            # Only 0-d intermediates are considered; the input must not
            # be a graph output, must already have all its dependencies
            # computed, and must not have been handed out already.
            if (getattr(ins, 'ndim', None) == 0 and not storage_map[ins][0]
                and ins not in fgraph.outputs and ins.owner
                and all([compute_map_re[v][0] for v in dependencies.get(ins, [])])
                    and ins not in allocated):
                # Constant Memory cannot be changed
                # Constant and shared variables' storage_map value is not empty
                reuse_out = None
                if ins not in view_of and not viewed_by.get(ins, []):
                    # where gc
                    # ins aliases nothing: find the first later output of
                    # matching type that has not already been claimed.
                    for i in range(idx + 1, len(order)):
                        if reuse_out:
                            break
                        for out in order[i].outputs:
                            if (getattr(out, 'ndim', None) == 0 and out not in pre_allocated
                                    and ins.type == out.type):
                                reuse_out = out
                                pre_allocated.add(out)
                                allocated.add(ins)
                elif ins in view_of:
                    # ins is a view: drop it from its origin's view list
                    # and only reuse once the origin has no live views.
                    origin = view_of[ins]
                    if ins in viewed_by[origin]:
                        viewed_by[origin].remove(ins)
                    if (not viewed_by[origin] and
                            origin not in fgraph.inputs and
                            not isinstance(origin, theano.Constant)):
                        # where gc
                        for i in range(idx + 1, len(order)):
                            if reuse_out:
                                break
                            for out in order[i].outputs:
                                if (getattr(out, 'ndim', None) == 0 and out not in pre_allocated
                                        and ins.type == out.type):
                                    reuse_out = out
                                    pre_allocated.add(out)
                                    allocated.add(ins)
                if reuse_out:
                    reallocated_info[ins] = [ins, reuse_out]

    return reallocated_info
class VM(object):
"""
......@@ -150,7 +233,7 @@ class VM(object):
profile.node_cleared_order = self.node_cleared_order[:]
if hasattr(self, 'dependencies'):
profile.dependencies = self.dependencies.copy()
profile.dependencies = self.dependencies
# clear the timer info out of the buffers
for i in xrange(len(self.call_times)):
......@@ -448,7 +531,7 @@ class Stack(VM):
# DO NOT set compute_map to 0
# If values become False and the
#current_apply is still in the
# current_apply is still in the
# stack, this will cause it to be
# recomputed! This can cause wrong value
# with some combination of inplace op.
......@@ -495,7 +578,7 @@ class Stack(VM):
except Exception:
link.raise_with_op(current_apply,
self.thunks[self.node_idx[current_apply]],
storage_map)
storage_map=storage_map)
if requires:
for r in requires:
......@@ -715,7 +798,7 @@ class VM_Linker(link.LocalLinker):
post_thunk_clear,
computed,
compute_map,
updated_vars
updated_vars,
):
pre_call_clear = [storage_map[v] for v in self.no_recycling]
......@@ -859,12 +942,14 @@ class VM_Linker(link.LocalLinker):
nodes,
thunks,
pre_call_clear,
post_thunk_clear)
post_thunk_clear,
)
else:
vm = Loop(
nodes,
thunks,
pre_call_clear)
pre_call_clear,
)
else:
# Needed when allow_gc=True and profiling
deps = self.compute_gc_dependencies(storage_map)
......@@ -890,6 +975,19 @@ class VM_Linker(link.LocalLinker):
compute_map[k] = [k.owner is None]
thunks = []
# Collect Reallocation Info
compute_map_re = defaultdict(lambda: [0])
for var in fgraph.inputs:
compute_map_re[var][0] = 1
if getattr(fgraph.profile, 'dependencies', None):
dependencies = getattr(fgraph.profile, 'dependencies')
else:
dependencies = self.compute_gc_dependencies(storage_map)
reallocated_info = calculate_reallocate_info(order, fgraph, storage_map, compute_map_re,dependencies)
for node in order:
try:
thunks.append(node.op.make_thunk(node,
......@@ -909,6 +1007,15 @@ class VM_Linker(link.LocalLinker):
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
lazy = self.lazy
if lazy is None:
lazy = config.vm.lazy
if lazy is None:
lazy = not all([(not th.lazy) for th in thunks])
if not (lazy or (config.profile and config.profile_memory) or self.use_cloop or self.callback):
for pair in reallocated_info.values():
storage_map[pair[1]] = storage_map[pair[0]]
computed, last_user = link.gc_helper(order)
if self.allow_gc:
post_thunk_clear = []
......@@ -917,7 +1024,8 @@ class VM_Linker(link.LocalLinker):
for input in node.inputs:
if ((input in computed)
and (input not in fgraph.outputs)
and (node == last_user[input])):
and (node == last_user[input])
and input not in reallocated_info.keys()):
clear_after_this_thunk.append(storage_map[input])
post_thunk_clear.append(clear_after_this_thunk)
else:
......@@ -928,7 +1036,7 @@ class VM_Linker(link.LocalLinker):
post_thunk_clear,
computed,
compute_map,
self.updated_vars
self.updated_vars,
)
vm.storage_map = storage_map
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论