提交 b6410099 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2243 from RoyXue/fix_crash_and_GPU_support

Fix ifelse crash and gpu support
......@@ -642,12 +642,15 @@ class ProfileStats(object):
fct_shapes[node.fgraph].setdefault(node, [])
sum_dense = 0
for out in node.outputs:
sh = self.variable_shape[out]
if hasattr(out.type, 'get_size'):
v = out.type.get_size(sh)
sum_dense += v
if out in self.variable_shape.keys():
sh = self.variable_shape[out]
if hasattr(out.type, 'get_size'):
v = out.type.get_size(sh)
sum_dense += v
else:
v = 'Unknown'
else:
v = "Unknown"
v = 'Variable isnt created'
var_mem[out] = v
fct_memory[node.fgraph][node].append(v)
......@@ -656,14 +659,17 @@ class ProfileStats(object):
# Find the function that used the most of that statistic
max_sum_size = 0
max_node_memory_size = 0
max_running_max_memory_size = 0
# statistics with the old order
# TODO: Make the lists more flexible to support multiple GPUs later
max_node_memory_size = [0, 0, 0]
max_running_max_memory_size = [0, 0, 0]
max_node_memory_saved_by_view = 0
max_node_memory_saved_by_inplace = 0
# statistic with the new order
new_max_node_memory_size = 0
new_max_running_max_memory_size = 0
# statistics with the new order
new_max_node_memory_size = [0, 0, 0]
new_max_running_max_memory_size = [0, 0, 0]
new_max_node_memory_saved_by_view = 0
new_max_node_memory_saved_by_inplace = 0
......@@ -689,10 +695,11 @@ class ProfileStats(object):
The sum of memory saved by reusing the input instead of
new allocation
"""
node_memory_size = 0
running_memory_size = 0
running_max_memory_size = 0
from theano.sandbox.cuda import CudaNdarrayType
# Initial Mem info values [CPU, GPU]
node_memory_size = [0, 0]
running_memory_size = [0, 0]
running_max_memory_size = [0, 0]
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
# This take only the inputs/outputs dependencies.
......@@ -734,6 +741,10 @@ class ProfileStats(object):
# allocated by the node
idx2 = 0
for out in node.outputs:
if isinstance(out.type, CudaNdarrayType):
cg = 1
else:
cg = 0
ins = None
if dmap and idx2 in dmap:
vidx = dmap[idx2]
......@@ -757,30 +768,36 @@ class ProfileStats(object):
view_of[out] = origin
viewed_by[origin].append(out)
else:
running_memory_size += var_mem[out]
node_memory_size += var_mem[out]
running_memory_size[cg] += var_mem[out]
node_memory_size[cg] += var_mem[out]
idx2 += 1
running_max_memory_size = max(running_max_memory_size,
running_memory_size)
running_max_memory_size[0] = max(running_max_memory_size[0],
running_memory_size[0])
running_max_memory_size[1] = max(running_max_memory_size[1],
running_memory_size[1])
# Mimic the combination of Theano and Python gc
for ins in node.inputs:
assert not (ins in view_of and viewed_by[ins])
# we track the original var, so this shouldn't happen
if isinstance(ins.type, CudaNdarrayType):
cg = 1
else:
cg = 0
if (dependencies[ins] and
ins not in fgraph.outputs and
ins.owner and
all([compute_map[v][0] for v in dependencies[ins]])):
if ins not in view_of and not viewed_by.get(ins, []):
running_memory_size -= var_mem[ins]
running_memory_size[cg] -= var_mem[ins]
elif ins in view_of:
origin = view_of[ins]
viewed_by[origin].remove(ins)
if (not viewed_by[origin] and
origin not in fgraph.inputs and
not isinstance(origin, theano.Constant)):
running_memory_size -= var_mem[origin]
running_memory_size[cg] -= var_mem[origin]
else:
# ins is viewed_by something else, so its
# memory isn't freed
......@@ -994,24 +1011,46 @@ class ProfileStats(object):
# Store the max of some stats by any function in this profile.
max_sum_size = max(max_sum_size, sum_size)
max_node_memory_size = max(max_node_memory_size,
old_running_memory[0])
max_running_max_memory_size = max(max_running_max_memory_size,
old_running_memory[2])
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
old_running_memory[4])
max_node_memory_size[0] = max(max_node_memory_size[0],
sum(old_running_memory[0]))
max_running_max_memory_size[0] = max(max_running_max_memory_size[0],
sum(old_running_memory[2]))
# Separate CPU and GPU
max_node_memory_size[1] = max(max_node_memory_size[1],
old_running_memory[0][0])
max_node_memory_size[2] = max(max_node_memory_size[2],
old_running_memory[0][1])
max_running_max_memory_size[1] = max(max_running_max_memory_size[1],
old_running_memory[2][0])
max_running_max_memory_size[2] = max(max_running_max_memory_size[2],
old_running_memory[2][1])
max_node_memory_saved_by_inplace = max(
max_node_memory_saved_by_inplace, old_running_memory[3])
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
old_running_memory[4])
# Store max of some stats with new order
new_max_node_memory_size = max(new_max_node_memory_size,
new_running_memory[0])
new_max_running_max_memory_size = max(new_max_running_max_memory_size,
new_running_memory[2])
new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
new_running_memory[4])
new_max_node_memory_size[0] = max(new_max_node_memory_size[0],
sum(new_running_memory[0]))
new_max_running_max_memory_size[0] = max(new_max_running_max_memory_size[0],
sum(new_running_memory[2]))
# Separate CPU and GPU
new_max_node_memory_size[1] = max(new_max_node_memory_size[1],
new_running_memory[0][0])
new_max_node_memory_size[2] = max(new_max_node_memory_size[2],
new_running_memory[0][1])
new_max_running_max_memory_size[1] = max(new_max_running_max_memory_size[1],
new_running_memory[2][0])
new_max_running_max_memory_size[2] = max(new_max_running_max_memory_size[2],
new_running_memory[2][1])
new_max_node_memory_saved_by_inplace = max(
new_max_node_memory_saved_by_inplace, new_running_memory[3])
new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
new_running_memory[4])
# Config: whether print min memory peak
if config.profiling.min_peak_memory:
......@@ -1035,13 +1074,30 @@ class ProfileStats(object):
print >> file, "---"
# print >> file, " Max if no gc, inplace and view: %dKB" % int(
# round(max_sum_size / 1024))
print >> file, " Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round(
new_max_node_memory_size / 1024.)), int(round(
max_node_memory_size / 1024.)))
new_max_node_memory_size[0] / 1024.)), int(round(
max_node_memory_size[0] / 1024.)))
print >> file, " CPU: %dKB (%dKB)" % ((int(round(
new_max_node_memory_size[1] / 1024.)), int(round(
max_node_memory_size[1] / 1024.))))
print >> file, " GPU: %dKB (%dKB)" % ((int(round(
new_max_node_memory_size[2] / 1024.)), int(round(
max_node_memory_size[2] / 1024.))))
print >> file, "---"
print >> file, " Max if linker=cvm(default): %dKB (%dKB)" % (int(round(
new_max_running_max_memory_size / 1024.)), int(round(
max_running_max_memory_size / 1024.)))
new_max_running_max_memory_size[0] / 1024.)), int(round(
max_running_max_memory_size[0] / 1024.)))
print >> file, " CPU: %dKB (%dKB)" % ((int(round(
new_max_running_max_memory_size[1] / 1024.)), int(round(
max_running_max_memory_size[1] / 1024.))))
print >> file, " GPU: %dKB (%dKB)" % ((int(round(
new_max_running_max_memory_size[2] / 1024.)), int(round(
max_running_max_memory_size[2] / 1024.))))
print >> file, "---"
if min_max_peak:
print >> file, " Minimum peak from all valid apply node order is %dKB(took %.3fs to compute)" % (int(round(
min_max_peak / 1024.)), min_peak_time)
......@@ -1052,8 +1108,10 @@ class ProfileStats(object):
(int(round(new_max_node_memory_saved_by_inplace / 1024.)),
int(round(max_node_memory_saved_by_inplace / 1024.)))
print >> file, " Memory saved if gc is enabled: %dKB (%dKB)" % (int(
round(new_max_node_memory_size - new_max_running_max_memory_size) / 1024.), int(
round(max_node_memory_size - max_running_max_memory_size) / 1024.))
round(new_max_node_memory_size[0] - new_max_running_max_memory_size[0]) / 1024.), int(
round(max_node_memory_size[0] - max_running_max_memory_size[0]) / 1024.))
print >> file, "---"
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
......
......@@ -8,6 +8,7 @@ import numpy
import theano
import theano.tensor as T
from theano.ifelse import ifelse
def test_profiling():
......@@ -20,11 +21,11 @@ def test_profiling():
theano.config.profile_memory = True
theano.config.profiling.min_peak_memory = True
x = [T.dvector("val%i" % i) for i in range(3)]
x = [T.fvector("val%i" % i) for i in range(3)]
z = []
z += [T.outer(x[i], x[i+1]).sum(axis=1) for i in range(len(x)-1)]
z += [x[i] + x[i+1] for i in range(len(x)-1)]
z += [T.outer(x[i], x[i + 1]).sum(axis=1) for i in range(len(x) - 1)]
z += [x[i] + x[i + 1] for i in range(len(x) - 1)]
p = theano.ProfileStats(False)
......@@ -36,7 +37,7 @@ def test_profiling():
f = theano.function(x, z, profile=p, name="test_profiling",
mode=m)
inp = [numpy.arange(1024) + 1 for i in range(len(x))]
inp = [numpy.arange(1024, dtype='float32') + 1 for i in range(len(x))]
output = f(*inp)
buf = StringIO.StringIO()
......@@ -46,8 +47,16 @@ def test_profiling():
the_string = buf.getvalue()
lines1 = [l for l in the_string.split("\n") if "Max if linker" in l]
lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l]
assert "Max if linker=cvm(default): 8224KB (16408KB)" in the_string, (lines1, lines2)
assert "Minimum peak from all valid apply node order is 8208KB" in the_string, (lines1, lines2)
if theano.config.device == 'cpu':
assert "Max if linker=cvm(default): 4112KB (8204KB)" in the_string, (
lines1, lines2)
assert "Minimum peak from all valid apply node order is 4104KB" in the_string, (
lines1, lines2)
else:
assert "Max if linker=cvm(default): 8220KB (8220KB)" in the_string, (
lines1, lines2)
assert "Minimum peak from all valid apply node order is 4116KB" in the_string, (
lines1, lines2)
finally:
theano.config.profile = config1
......@@ -55,5 +64,41 @@ def test_profiling():
theano.config.profiling.min_peak_memory = config3
def test_ifelse():
    """Smoke-test memory profiling on a graph containing a lazy ``ifelse``.

    Compiles ``ifelse(a < b, x * 2, y * 2)`` with ``profile`` and
    ``profile_memory`` switched on and calls the compiled function once.
    The test makes no assertions about the result; it passes as long as
    compiling and running the function does not raise.

    The two config flags are restored to their previous values on exit,
    whether or not the body raised.
    """
    saved_profile = theano.config.profile
    saved_profile_memory = theano.config.profile_memory

    try:
        theano.config.profile = True
        theano.config.profile_memory = True

        a, b = T.scalars('a', 'b')
        x, y = T.scalars('x', 'y')

        # Only one of the two branches is selected at run time.
        branch = ifelse(T.lt(a, b), x * 2, y * 2)

        stats = theano.ProfileStats(False)

        # These modes are replaced by FAST_RUN for this test; any other
        # configured mode is left to Theano's default (mode=None).
        # NOTE(review): presumably because they do not go through the
        # profiled VM path -- confirm.
        overridden_modes = ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]
        mode = "FAST_RUN" if theano.config.mode in overridden_modes else None

        f_ifelse = theano.function([a, b, x, y], branch, profile=stats,
                                   name="test_ifelse", mode=mode)

        # Inputs are (a, b, x, y); the returned value is not inspected.
        f_ifelse(0., 1., 10, 11)

    finally:
        theano.config.profile = saved_profile
        theano.config.profile_memory = saved_profile_memory
if __name__ == '__main__':
    # Allow running this module directly, without a test runner:
    # execute each test in declaration order.
    for run_test in (test_profiling, test_ifelse):
        run_test()
......@@ -56,6 +56,7 @@ raise_with_op = link.raise_with_op
class VM(object):
"""
A VM object's __call__ method evaluates a Theano program.
......@@ -83,6 +84,7 @@ class VM(object):
storage. False means it *must not* repeat that feedback.
"""
def __init__(self, nodes, thunks, pre_call_clear):
"""
Allocate a virtual machine.
......@@ -159,10 +161,12 @@ class VM(object):
class Loop(VM):
"""
Unconditional start-to-finish program execution in Python.
No garbage collection is allowed on intermediate results.
"""
def __call__(self):
if self.time_thunks:
for cont in self.pre_call_clear:
......@@ -188,10 +192,12 @@ class Loop(VM):
class LoopGC(VM):
"""
Unconditional start-to-finish program execution in Python.
Garbage collection is possible on intermediate results.
"""
def __init__(self, nodes, thunks, pre_call_clear, post_thunk_clear):
super(LoopGC, self).__init__(nodes, thunks, pre_call_clear)
self.post_thunk_clear = post_thunk_clear
......@@ -231,6 +237,7 @@ class LoopGC(VM):
class Stack(VM):
"""
Finish-to-start evaluation order of thunks.
......@@ -340,7 +347,7 @@ class Stack(VM):
apply_stack = list(self.base_apply_stack)
last_apply_stack_len = -1
#This record all function inputs/shared varibles and constants
# This records all function inputs/shared variables and constants
for var, data in self.storage_map.iteritems():
if data[0] is None:
continue
......@@ -396,7 +403,7 @@ class Stack(VM):
current_idx = self.node_idx[current_apply]
self.call_counts[current_idx] += 1
self.call_times[current_idx] += dt
## Computing the memory footprint of the the op
# Computing the memory footprint of the op
# ?? What about inplace .. if the op is inplace
# you don't actually ask for more memory!
for (idx, o) in enumerate(
......@@ -411,7 +418,7 @@ class Stack(VM):
st = getattr(o[0], 'strides',
'input no strides')
if (getattr(o[0], 'flags', False) and
o[0].flags.c_contiguous):
o[0].flags.c_contiguous):
st = 'c'
elif (hasattr(data[0], 'is_c_contiguous') and
data[0].is_c_contiguous()):
......@@ -436,15 +443,16 @@ class Stack(VM):
if all(compute_map[v][0]
for v in dependencies[i]):
storage_map[i][0] = None
input_index.append(current_apply.inputs.index(i))
input_index.append(
current_apply.inputs.index(i))
#DO NOT set compute_map to 0
# DO NOT set compute_map to 0
#If values become False and the
# If values become False and the
#current_apply is still in the
#stack, this will cause it to be
#recomputed! This can cause wrong value
#with some combination of inplace op.
# stack, this will cause it to be
# recomputed! This can cause wrong value
# with some combination of inplace op.
compute_map[i][0] = 2
if (config.warn.vm_gc_bug and
current_apply in apply_stack and
......@@ -452,12 +460,13 @@ class Stack(VM):
'destroy_map',
False)):
warnings.warn(
"There was a bug that existed in the default Theano configuration,"
" only in the development version between July 5th 2012"
" and July 30th 2012. This was not in a released version."
" The bug was affecting this script.",
#The stack level is not good when inside a Scan.
stacklevel=3
"There was a bug that existed in the default Theano configuration,"
" only in the development version between July 5th 2012"
" and July 30th 2012. This was not in a released version."
" The bug was affecting this script.",
# The stack level is not good when
# inside a Scan.
stacklevel=3
)
self.node_cleared_order.append(input_index)
......@@ -465,9 +474,8 @@ class Stack(VM):
# -- Non-lazy case, need inputs
apply_stack.append(current_apply)
apply_stack.extend(inp.owner
for inp in current_deps
if inp.owner)
for inp in current_deps
if inp.owner)
elif not computed_outs:
#
......@@ -511,7 +519,7 @@ class Stack(VM):
self.variable_shape[var] = sh
st = getattr(o[0], 'strides', 'input no strides')
if (getattr(o[0], 'flags', False) and
o[0].flags.c_contiguous):
o[0].flags.c_contiguous):
st = 'c'
elif (hasattr(data[0], 'is_c_contiguous') and
data[0].is_c_contiguous()):
......@@ -523,7 +531,7 @@ class Stack(VM):
if self.allow_gc:
for i in current_apply.inputs:
if (dependencies[i] and i.owner and
i not in self.outputs):
i not in self.outputs):
empty_storage_map = True
for x in dependencies[i]:
if not compute_map[x][0]:
......@@ -531,9 +539,10 @@ class Stack(VM):
break
if empty_storage_map:
storage_map[i][0] = None
input_index.append(current_apply.inputs.index(i))
#See the not lazy gc code for explanations
#of compute_map change
input_index.append(
current_apply.inputs.index(i))
# See the not lazy gc code for explanations
# of compute_map change
compute_map[i][0] = 2
self.node_cleared_order.append(input_index)
......@@ -560,6 +569,7 @@ try:
import lazylinker_c
class CVM(lazylinker_c.CLazyLinker, VM):
def __init__(self, *args, **kwargs):
lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs)
# skip VM.__init__
......@@ -576,6 +586,7 @@ except (OSError, theano.gof.cmodule.MissingGXX), e:
class VM_Linker(link.LocalLinker):
"""
Class that satisfies the Linker interface by acting as a VM factory.
"""
......@@ -625,9 +636,9 @@ class VM_Linker(link.LocalLinker):
associated to self, else, a new VM_Linker associated to fgraph.
"""
if (config.profile and
hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled):
hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled):
if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
raise Exception(
"You are running the Theano profiler with CUDA enabled."
......@@ -644,12 +655,12 @@ class VM_Linker(link.LocalLinker):
# Warning: make sure to forward the correct values of
# all parameters to __init__ here.
return type(self)(
allow_gc=self.allow_gc,
use_cloop=self.use_cloop,
callback=self.callback,
lazy=self.lazy,
schedule=self.schedule
).accept(fgraph, no_recycling)
allow_gc=self.allow_gc,
use_cloop=self.use_cloop,
callback=self.callback,
lazy=self.lazy,
schedule=self.schedule
).accept(fgraph, no_recycling)
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......@@ -700,17 +711,17 @@ class VM_Linker(link.LocalLinker):
return dependencies
def make_vm(self, nodes, thunks,
input_storage, output_storage, storage_map,
post_thunk_clear,
computed,
compute_map,
updated_vars
):
input_storage, output_storage, storage_map,
post_thunk_clear,
computed,
compute_map,
updated_vars
):
pre_call_clear = [storage_map[v] for v in self.no_recycling]
if (self.callback is not None or
(config.profile and config.profile_memory)):
(config.profile and config.profile_memory)):
if self.use_cloop and self.callback is not None:
logger.warn('CVM does not support callback, using Stack VM.')
......@@ -721,11 +732,11 @@ class VM_Linker(link.LocalLinker):
if self.allow_gc:
deps = self.compute_gc_dependencies(storage_map)
vm = Stack(
nodes, thunks, pre_call_clear,
storage_map, compute_map,
self.fgraph, self.allow_gc,
dependencies=deps,
callback=self.callback)
nodes, thunks, pre_call_clear,
storage_map, compute_map,
self.fgraph, self.allow_gc,
dependencies=deps,
callback=self.callback)
elif self.use_cloop:
# create a map from nodes to ints and vars to ints
nodes_idx = {}
......@@ -747,9 +758,9 @@ class VM_Linker(link.LocalLinker):
# put storage_map and compute_map into a int-based scheme
n_applies = len(nodes)
storage_map_list = [storage_map[vars_idx_inv[i]]
for i in xrange(len(vars_idx_inv))]
for i in xrange(len(vars_idx_inv))]
compute_map_list = [compute_map[vars_idx_inv[i]]
for i in xrange(len(vars_idx_inv))]
for i in xrange(len(vars_idx_inv))]
if nodes:
assert type(storage_map_list[0]) is list
assert type(compute_map_list[0]) is list
......@@ -796,7 +807,7 @@ class VM_Linker(link.LocalLinker):
prereq_var_idxs = []
for prereq_node in ords.get(node, []):
prereq_var_idxs.extend(
[vars_idx[v] for v in prereq_node.outputs])
[vars_idx[v] for v in prereq_node.outputs])
prereq_var_idxs = list(set(prereq_var_idxs))
prereq_var_idxs.sort() # TODO: why sort?
node_prereqs.append(prereq_var_idxs)
......@@ -816,27 +827,27 @@ class VM_Linker(link.LocalLinker):
c0 = sys.getrefcount(node_n_inputs)
vm = CVM(
nodes,
thunks,
pre_call_clear,
allow_gc=self.allow_gc,
call_counts=[0] * len(nodes),
call_times=[0.0] * len(nodes),
compute_map_list=compute_map_list,
storage_map_list=storage_map_list,
base_input_output_list=base_input_output_list,
node_n_inputs=node_n_inputs,
node_n_outputs=node_n_outputs,
node_input_offset=node_input_offset,
node_output_offset=node_output_offset,
var_owner=var_owner,
is_lazy_list=is_lazy_list,
output_vars=output_vars,
node_prereqs=node_prereqs,
node_output_size=node_output_size,
update_storage=update_storage,
dependencies=dependency_map_list,
)
nodes,
thunks,
pre_call_clear,
allow_gc=self.allow_gc,
call_counts=[0] * len(nodes),
call_times=[0.0] * len(nodes),
compute_map_list=compute_map_list,
storage_map_list=storage_map_list,
base_input_output_list=base_input_output_list,
node_n_inputs=node_n_inputs,
node_n_outputs=node_n_outputs,
node_input_offset=node_input_offset,
node_output_offset=node_output_offset,
var_owner=var_owner,
is_lazy_list=is_lazy_list,
output_vars=output_vars,
node_prereqs=node_prereqs,
node_output_size=node_output_size,
update_storage=update_storage,
dependencies=dependency_map_list,
)
assert c0 == sys.getrefcount(node_n_inputs)
else:
lazy = self.lazy
......@@ -848,36 +859,36 @@ class VM_Linker(link.LocalLinker):
# there is no conditional in the graph
if self.allow_gc:
vm = LoopGC(
nodes,
thunks,
pre_call_clear,
post_thunk_clear)
nodes,
thunks,
pre_call_clear,
post_thunk_clear)
else:
vm = Loop(
nodes,
thunks,
pre_call_clear)
nodes,
thunks,
pre_call_clear)
else:
deps = None
if self.allow_gc:
deps = self.compute_gc_dependencies(storage_map)
vm = Stack(
nodes, thunks, pre_call_clear,
storage_map, compute_map,
self.fgraph, self.allow_gc,
dependencies=deps
)
nodes, thunks, pre_call_clear,
storage_map, compute_map,
self.fgraph, self.allow_gc,
dependencies=deps
)
return vm
def make_all(self, profiler=None, input_storage=None,
output_storage=None,
):
):
fgraph = self.fgraph
order = self.schedule(fgraph)
no_recycling = self.no_recycling
input_storage, output_storage, storage_map = link.map_storage(
fgraph, order, input_storage, output_storage)
fgraph, order, input_storage, output_storage)
compute_map = {}
for k in storage_map:
compute_map[k] = [k.owner is None]
......@@ -917,12 +928,12 @@ class VM_Linker(link.LocalLinker):
post_thunk_clear = None
vm = self.make_vm(order, thunks,
input_storage, output_storage, storage_map,
post_thunk_clear,
computed,
compute_map,
self.updated_vars
)
input_storage, output_storage, storage_map,
post_thunk_clear,
computed,
compute_map,
self.updated_vars
)
vm.storage_map = storage_map
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论