提交 62ccf59f authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3656 from carriepl/scan_backend_speedup

Scan - Move costly checks from runtime to compilation
...@@ -314,13 +314,24 @@ class Scan(PureOp): ...@@ -314,13 +314,24 @@ class Scan(PureOp):
# Generate the mappings between inner and outer inputs and outputs # Generate the mappings between inner and outer inputs and outputs
# if they haven't already been generated. # if they haven't already been generated.
self.var_mappings = self.get_oinp_iinp_iout_oout_mappings() self.var_mappings = self.get_oinp_iinp_iout_oout_mappings()
if (hasattr(self, 'fn') and if hasattr(self, 'fn'):
not hasattr(self, 'thunk_mit_mot_out_slices')): if not hasattr(self, 'thunk_mit_mot_out_slices'):
# The thunk has been compiled before mit_mot preallocation feature # The thunk has been compiled before mit_mot preallocation
# was implemented. Mark every mit_mot output tap as not having # feature was implemented. Mark every mit_mot output tap as
# been preallocated # not having been preallocated
self.mitmots_preallocated = [False] * self.n_mit_mot_outs self.mitmots_preallocated = [False] * self.n_mit_mot_outs
if not hasattr(self, 'outs_on_gpu'):
# The thunk has been compiled before the analysis, at
# compilation time, of the location of the inputs and outputs.
# Perform this analysis here.
self.inps_on_gpu = [not isinstance(out,
theano.tensor.TensorVariable)
for out in self.fn.maker.fgraph.inputs]
self.outs_on_gpu = [not isinstance(out,
theano.tensor.TensorVariable)
for out in self.fn.maker.fgraph.outputs]
# Ensure that the graph associated with the inner function is valid. # Ensure that the graph associated with the inner function is valid.
self.validate_inner_graph() self.validate_inner_graph()
...@@ -858,6 +869,13 @@ class Scan(PureOp): ...@@ -858,6 +869,13 @@ class Scan(PureOp):
profile=profile, profile=profile,
on_unused_input='ignore') on_unused_input='ignore')
# Analyse the compile inner function to determine which inputs and
# outputs are on the gpu and speed up some checks during the execution
self.inps_on_gpu = [not isinstance(out, theano.tensor.TensorVariable)
for out in self.fn.maker.fgraph.inputs]
self.outs_on_gpu = [not isinstance(out, theano.tensor.TensorVariable)
for out in self.fn.maker.fgraph.outputs]
try: try:
cython_mintaps = numpy.asarray(self.mintaps, dtype='int32') cython_mintaps = numpy.asarray(self.mintaps, dtype='int32')
cython_tap_array_len = \ cython_tap_array_len = \
...@@ -894,6 +912,9 @@ class Scan(PureOp): ...@@ -894,6 +912,9 @@ class Scan(PureOp):
cython_mitmots_preallocated = numpy.asarray(self.mitmots_preallocated, cython_mitmots_preallocated = numpy.asarray(self.mitmots_preallocated,
dtype='int32') dtype='int32')
cython_inps_on_gpu = numpy.asarray(self.inps_on_gpu, dtype='int32')
cython_outs_on_gpu = numpy.asarray(self.outs_on_gpu, dtype='int32')
if hasattr(self, 'destroy_map'): if hasattr(self, 'destroy_map'):
cython_destroy_map = [x in self.destroy_map cython_destroy_map = [x in self.destroy_map
for x in xrange(len(node.outputs))] for x in xrange(len(node.outputs))]
...@@ -921,6 +942,8 @@ class Scan(PureOp): ...@@ -921,6 +942,8 @@ class Scan(PureOp):
cython_mit_mot_out_slices, cython_mit_mot_out_slices,
cython_mit_mot_out_nslices, cython_mit_mot_out_nslices,
cython_mitmots_preallocated, cython_mitmots_preallocated,
cython_inps_on_gpu,
cython_outs_on_gpu,
self.fn.fn, self.fn.fn,
self.fn, self.fn,
cython_destroy_map, cython_destroy_map,
...@@ -1280,12 +1303,12 @@ class Scan(PureOp): ...@@ -1280,12 +1303,12 @@ class Scan(PureOp):
var = output_storage[idx].storage[0] var = output_storage[idx].storage[0]
old_output_storage[idx] = var old_output_storage[idx] = var
if hasattr(var, 'gpudata'): if var is None:
old_output_data[idx] = None
elif self.outs_on_gpu[idx]:
old_output_data[idx] = var.gpudata old_output_data[idx] = var.gpudata
elif hasattr(var, 'data'):
old_output_data[idx] = var.data
else: else:
old_output_data[idx] = None old_output_data[idx] = var.data
# 4.6. Keep a reference to the variables (ndarrays, CudaNdarrays, # 4.6. Keep a reference to the variables (ndarrays, CudaNdarrays,
# etc) associated with mitmot inputs currently in the # etc) associated with mitmot inputs currently in the
...@@ -1298,12 +1321,12 @@ class Scan(PureOp): ...@@ -1298,12 +1321,12 @@ class Scan(PureOp):
var = input_storage[idx + self.n_seqs].storage[0] var = input_storage[idx + self.n_seqs].storage[0]
old_mitmot_input_storage[idx] = var old_mitmot_input_storage[idx] = var
if hasattr(var, 'gpudata'): if var is None:
old_mitmot_input_data[idx] = None
elif self.inps_on_gpu[idx]:
old_mitmot_input_data[idx] = var.gpudata old_mitmot_input_data[idx] = var.gpudata
elif hasattr(var, 'data'):
old_mitmot_input_data[idx] = var.data
else: else:
old_mitmot_input_data[idx] = None old_mitmot_input_data[idx] = var.data
# 5.1 compute outputs # 5.1 compute outputs
t0_fn = time.time() t0_fn = time.time()
...@@ -1365,9 +1388,9 @@ class Scan(PureOp): ...@@ -1365,9 +1388,9 @@ class Scan(PureOp):
new_var = input_storage[self.n_seqs + inp_idx].storage[0] new_var = input_storage[self.n_seqs + inp_idx].storage[0]
if old_var is new_var: if old_var is new_var:
old_data = old_mitmot_input_data[inp_idx] old_data = old_mitmot_input_data[inp_idx]
if hasattr(new_var, 'gpudata'): if self.inps_on_gpu[self.n_seqs + inp_idx]:
same_data = (new_var.gpudata == old_data) same_data = (new_var.gpudata == old_data)
elif hasattr(new_var, 'data'): else:
same_data = (new_var.data == old_data) same_data = (new_var.data == old_data)
else: else:
same_data = False same_data = False
...@@ -1411,9 +1434,9 @@ class Scan(PureOp): ...@@ -1411,9 +1434,9 @@ class Scan(PureOp):
old_data = old_output_data[offset_out + j] old_data = old_output_data[offset_out + j]
if old_data is None: if old_data is None:
output_reused = False output_reused = False
elif hasattr(new_var, 'gpudata'): elif self.outs_on_gpu[offset_out + j]:
output_reused = (new_var.gpudata == old_data) output_reused = (new_var.gpudata == old_data)
elif hasattr(new_var, 'data'): else:
output_reused = (new_var.data == old_data) output_reused = (new_var.data == old_data)
else: else:
output_reused = False output_reused = False
...@@ -1454,9 +1477,9 @@ class Scan(PureOp): ...@@ -1454,9 +1477,9 @@ class Scan(PureOp):
if old_var is new_var: if old_var is new_var:
if old_data is None: if old_data is None:
output_reused = False output_reused = False
elif hasattr(new_var, 'gpudata'): elif self.outs_on_gpu[offset_out + j]:
output_reused = (new_var.gpudata == old_data) output_reused = (new_var.gpudata == old_data)
elif hasattr(new_var, 'data'): else:
output_reused = (new_var.data == old_data) output_reused = (new_var.data == old_data)
else: else:
output_reused = False output_reused = False
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论