提交 62ccf59f authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3656 from carriepl/scan_backend_speedup

Scan - Move costly checks from runtime to compilation
......@@ -314,12 +314,23 @@ class Scan(PureOp):
# Generate the mappings between inner and outer inputs and outputs
# if they haven't already been generated.
self.var_mappings = self.get_oinp_iinp_iout_oout_mappings()
if (hasattr(self, 'fn') and
not hasattr(self, 'thunk_mit_mot_out_slices')):
# The thunk has been compiled before mit_mot preallocation feature
# was implemented. Mark every mit_mot output tap as not having
# been preallocated
self.mitmots_preallocated = [False] * self.n_mit_mot_outs
if hasattr(self, 'fn'):
if not hasattr(self, 'thunk_mit_mot_out_slices'):
# The thunk has been compiled before mit_mot preallocation
# feature was implemented. Mark every mit_mot output tap as
# not having been preallocated
self.mitmots_preallocated = [False] * self.n_mit_mot_outs
if not hasattr(self, 'outs_on_gpu'):
# The thunk has been compiled before the analysis, at
# compilation time, of the location of the inputs and outputs.
# Perform this analysis here.
self.inps_on_gpu = [not isinstance(out,
theano.tensor.TensorVariable)
for out in self.fn.maker.fgraph.inputs]
self.outs_on_gpu = [not isinstance(out,
theano.tensor.TensorVariable)
for out in self.fn.maker.fgraph.outputs]
# Ensure that the graph associated with the inner function is valid.
self.validate_inner_graph()
......@@ -858,6 +869,13 @@ class Scan(PureOp):
profile=profile,
on_unused_input='ignore')
# Analyse the compile inner function to determine which inputs and
# outputs are on the gpu and speed up some checks during the execution
self.inps_on_gpu = [not isinstance(out, theano.tensor.TensorVariable)
for out in self.fn.maker.fgraph.inputs]
self.outs_on_gpu = [not isinstance(out, theano.tensor.TensorVariable)
for out in self.fn.maker.fgraph.outputs]
try:
cython_mintaps = numpy.asarray(self.mintaps, dtype='int32')
cython_tap_array_len = \
......@@ -894,6 +912,9 @@ class Scan(PureOp):
cython_mitmots_preallocated = numpy.asarray(self.mitmots_preallocated,
dtype='int32')
cython_inps_on_gpu = numpy.asarray(self.inps_on_gpu, dtype='int32')
cython_outs_on_gpu = numpy.asarray(self.outs_on_gpu, dtype='int32')
if hasattr(self, 'destroy_map'):
cython_destroy_map = [x in self.destroy_map
for x in xrange(len(node.outputs))]
......@@ -921,6 +942,8 @@ class Scan(PureOp):
cython_mit_mot_out_slices,
cython_mit_mot_out_nslices,
cython_mitmots_preallocated,
cython_inps_on_gpu,
cython_outs_on_gpu,
self.fn.fn,
self.fn,
cython_destroy_map,
......@@ -1280,12 +1303,12 @@ class Scan(PureOp):
var = output_storage[idx].storage[0]
old_output_storage[idx] = var
if hasattr(var, 'gpudata'):
if var is None:
old_output_data[idx] = None
elif self.outs_on_gpu[idx]:
old_output_data[idx] = var.gpudata
elif hasattr(var, 'data'):
old_output_data[idx] = var.data
else:
old_output_data[idx] = None
old_output_data[idx] = var.data
# 4.6. Keep a reference to the variables (ndarrays, CudaNdarrays,
# etc) associated with mitmot inputs currently in the
......@@ -1298,12 +1321,12 @@ class Scan(PureOp):
var = input_storage[idx + self.n_seqs].storage[0]
old_mitmot_input_storage[idx] = var
if hasattr(var, 'gpudata'):
if var is None:
old_mitmot_input_data[idx] = None
elif self.inps_on_gpu[idx]:
old_mitmot_input_data[idx] = var.gpudata
elif hasattr(var, 'data'):
old_mitmot_input_data[idx] = var.data
else:
old_mitmot_input_data[idx] = None
old_mitmot_input_data[idx] = var.data
# 5.1 compute outputs
t0_fn = time.time()
......@@ -1365,9 +1388,9 @@ class Scan(PureOp):
new_var = input_storage[self.n_seqs + inp_idx].storage[0]
if old_var is new_var:
old_data = old_mitmot_input_data[inp_idx]
if hasattr(new_var, 'gpudata'):
if self.inps_on_gpu[self.n_seqs + inp_idx]:
same_data = (new_var.gpudata == old_data)
elif hasattr(new_var, 'data'):
else:
same_data = (new_var.data == old_data)
else:
same_data = False
......@@ -1411,9 +1434,9 @@ class Scan(PureOp):
old_data = old_output_data[offset_out + j]
if old_data is None:
output_reused = False
elif hasattr(new_var, 'gpudata'):
elif self.outs_on_gpu[offset_out + j]:
output_reused = (new_var.gpudata == old_data)
elif hasattr(new_var, 'data'):
else:
output_reused = (new_var.data == old_data)
else:
output_reused = False
......@@ -1454,9 +1477,9 @@ class Scan(PureOp):
if old_var is new_var:
if old_data is None:
output_reused = False
elif hasattr(new_var, 'gpudata'):
elif self.outs_on_gpu[offset_out + j]:
output_reused = (new_var.gpudata == old_data)
elif hasattr(new_var, 'data'):
else:
output_reused = (new_var.data == old_data)
else:
output_reused = False
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论