提交 65ac8e8a,作者:Pascal Lamblin

Merge pull request #2386 from abergeron/multi_fixes2

Multi-GPU scan fixes
...@@ -2000,12 +2000,6 @@ def local_gpu_extract_diagonal(node): ...@@ -2000,12 +2000,6 @@ def local_gpu_extract_diagonal(node):
gpu_from_host(diag_node.inputs[0]))] gpu_from_host(diag_node.inputs[0]))]
return False return False
def typeConstructor(broadcastable, dtype):
    """Build the variable type for the given broadcast pattern and dtype.

    :param broadcastable: broadcast pattern forwarded to the type that is
        constructed
    :param dtype: dtype name; ``'float32'`` selects ``CudaNdarrayType``,
        any other value selects ``tensor.TensorType`` with that dtype

    :return: a ``CudaNdarrayType`` for ``'float32'``, otherwise a
        ``tensor.TensorType``
    """
    # Anything other than float32 falls back to a regular TensorType.
    if dtype != 'float32':
        return tensor.TensorType(broadcastable=broadcastable, dtype=dtype)
    return CudaNdarrayType(broadcastable=broadcastable)
@register_opt('scan') @register_opt('scan')
@local_optimizer([gpu_from_host, scan_op.Scan]) @local_optimizer([gpu_from_host, scan_op.Scan])
def gpuScanOptimization(node): def gpuScanOptimization(node):
...@@ -2065,9 +2059,7 @@ def gpuScanOptimization(node): ...@@ -2065,9 +2059,7 @@ def gpuScanOptimization(node):
nw_op = scan_op.Scan(scan_ins, nw_op = scan_op.Scan(scan_ins,
scan_outs, scan_outs,
info, info).make_node(*nw_ins)
typeConstructor=typeConstructor).make_node(
*nw_ins)
_outputs = nw_op.outputs _outputs = nw_op.outputs
return _outputs return _outputs
...@@ -2113,8 +2105,7 @@ def gpuScanOptimization(node): ...@@ -2113,8 +2105,7 @@ def gpuScanOptimization(node):
_outputs = scan_op.Scan( _outputs = scan_op.Scan(
scan_ins, scan_ins,
scan_outs, scan_outs,
info, info).make_node(*nw_ins).outputs
typeConstructor=typeConstructor).make_node(*nw_ins).outputs
outputs = [] outputs = []
for x, y in zip(_outputs, node.outputs): for x, y in zip(_outputs, node.outputs):
if isinstance(y.type, CudaNdarrayType): if isinstance(y.type, CudaNdarrayType):
...@@ -2126,8 +2117,7 @@ def gpuScanOptimization(node): ...@@ -2126,8 +2117,7 @@ def gpuScanOptimization(node):
optdb.register('gpu_scanOp_make_inplace', optdb.register('gpu_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor, scan_opt.ScanInplaceOptimizer(gpu_flag=True),
gpu_flag=True),
75, 75,
'gpu', 'gpu',
'fast_run', 'fast_run',
......
...@@ -716,13 +716,11 @@ def local_scan_to_gpua(node): ...@@ -716,13 +716,11 @@ def local_scan_to_gpua(node):
_cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, []) _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
info['gpu_hash'] = hash(_cmodule_key) info['gpu_hash'] = hash(_cmodule_key)
nw_op = scan_op.Scan(scan_ins, scan_outs, info, nw_op = scan_op.Scan(scan_ins, scan_outs, info).make_node(*nw_ins)
typeConstructor=GpuArrayType).make_node(*nw_ins)
return nw_op.outputs return nw_op.outputs
optdb.register('gpua_scanOp_make_inplace', optdb.register('gpua_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=GpuArrayType, scan_opt.ScanInplaceOptimizer(gpua_flag=True),
gpua_flag=True),
75, 75,
'gpua', 'gpua',
'fast_run', 'fast_run',
......
...@@ -15,6 +15,7 @@ from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu ...@@ -15,6 +15,7 @@ from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu
class T_Scan(TestCase): class T_Scan(TestCase):
def setUp(self): def setUp(self):
utt.seed_rng() utt.seed_rng()
super(T_Scan, self).setUp()
def test_one_sequence_one_output_weights_gpu1(self): def test_one_sequence_one_output_weights_gpu1(self):
def f_rnn(u_t, x_tm1, W_in, W): def f_rnn(u_t, x_tm1, W_in, W):
......
...@@ -594,7 +594,9 @@ def scan(fn, ...@@ -594,7 +594,9 @@ def scan(fn,
if init_out.get('taps', None) == [-1]: if init_out.get('taps', None) == [-1]:
actual_arg = init_out['initial'] actual_arg = init_out['initial']
arg = safe_new(init_out['initial']) if not isinstance(actual_arg, tensor.Variable):
actual_arg = tensor.as_tensor_variable(actual_arg)
arg = safe_new(actual_arg)
if isinstance(arg, tensor.Constant): if isinstance(arg, tensor.Constant):
# safe new returns a clone of the constants, but that is not # safe new returns a clone of the constants, but that is not
# what we need for initial states # what we need for initial states
......
...@@ -49,7 +49,6 @@ class Scan(PureOp): ...@@ -49,7 +49,6 @@ class Scan(PureOp):
inputs, inputs,
outputs, outputs,
info, info,
typeConstructor=None,
): ):
""" """
:param inputs: inputs of the inner function of scan :param inputs: inputs of the inner function of scan
...@@ -58,21 +57,6 @@ class Scan(PureOp): ...@@ -58,21 +57,6 @@ class Scan(PureOp):
the scan op (like number of different types of the scan op (like number of different types of
arguments, name, mode, if it should run on GPU or arguments, name, mode, if it should run on GPU or
not, etc.) not, etc.)
:param typeConstructor: function that constructs an equivalent
to Theano TensorType
Note: ``typeConstructor`` had been added to refactor how
Theano deals with the GPU. If it runs on the GPU, scan needs
to construct certain outputs (those who reside in the GPU
memory) as the GPU-specific type. However we can not import
gpu code in this file (as it is in sandbox, and not available
on each machine) so the workaround is that the GPU
optimization passes to the constructor of this class a
function that is able to construct a GPU type. This way the
class Scan does not need to be aware of the details for the
GPU, it just constructs any tensor using this function (which
by default constructs normal tensors).
""" """
if 'gpua' not in info: if 'gpua' not in info:
info['gpua'] = False info['gpua'] = False
...@@ -88,19 +72,13 @@ class Scan(PureOp): ...@@ -88,19 +72,13 @@ class Scan(PureOp):
self.output_types = [] self.output_types = []
idx = 0 idx = 0
jdx = 0 jdx = 0
tensorConstructor = lambda broadcastable, dtype: TensorType(
broadcastable=broadcastable, dtype=dtype)
if typeConstructor is None:
typeConstructor = tensorConstructor
while idx < self.n_mit_mot_outs: while idx < self.n_mit_mot_outs:
# Note that for mit_mot there are several output slices per # Note that for mit_mot there are several output slices per
# output sequence # output sequence
o = outputs[idx] o = outputs[idx]
self.output_types.append( self.output_types.append(
typeConstructor( o.type.clone(broadcastable=(False,) + o.type.broadcastable))
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
idx += len(self.mit_mot_out_slices[jdx]) idx += len(self.mit_mot_out_slices[jdx])
jdx += 1 jdx += 1
...@@ -110,9 +88,7 @@ class Scan(PureOp): ...@@ -110,9 +88,7 @@ class Scan(PureOp):
for o in outputs[idx:end]: for o in outputs[idx:end]:
self.output_types.append( self.output_types.append(
typeConstructor( o.type.clone(broadcastable=(False,) + o.type.broadcastable))
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
# shared outputs + possibly the ending condition # shared outputs + possibly the ending condition
for o in outputs[end:]: for o in outputs[end:]:
...@@ -241,10 +217,9 @@ class Scan(PureOp): ...@@ -241,10 +217,9 @@ class Scan(PureOp):
if rval.ndim == as_var.ndim: if rval.ndim == as_var.ndim:
rval = as_var.type.filter_variable(rval) rval = as_var.type.filter_variable(rval)
else: else:
tmp = as_var.type.__class__( tmp = as_var.type.clone(
broadcastable=tuple(var.broadcastable[:1])+\ broadcastable=(tuple(var.broadcastable[:1]) +
tuple(as_var.broadcastable), tuple(as_var.broadcastable)))
dtype=as_var.dtype)
rval = tmp.filter_variable(rval) rval = tmp.filter_variable(rval)
return rval return rval
...@@ -517,11 +492,11 @@ class Scan(PureOp): ...@@ -517,11 +492,11 @@ class Scan(PureOp):
return aux_txt return aux_txt
def __hash__(self): def __hash__(self):
return (hash(type(self)) ^ return hash((type(self),
# and a hash representing the inner graph using the # and a hash representing the inner graph using the
# CLinker.cmodule_key_ # CLinker.cmodule_key_
self._hash_inner_graph ^ self._hash_inner_graph,
scan_utils.hash_listsDictsTuples(self.info)) scan_utils.hash_listsDictsTuples(self.info)))
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling):
""" """
......
...@@ -916,9 +916,8 @@ class PushOutScanOutput(gof.Optimizer): ...@@ -916,9 +916,8 @@ class PushOutScanOutput(gof.Optimizer):
class ScanInplaceOptimizer(Optimizer): class ScanInplaceOptimizer(Optimizer):
"""Graph optimizer for Scan(makes it run inplace)""" """Graph optimizer for Scan(makes it run inplace)"""
def __init__(self, typeConstructor=None, gpu_flag=False, gpua_flag=False): def __init__(self, gpu_flag=False, gpua_flag=False):
Optimizer.__init__(self) Optimizer.__init__(self)
self.typeConstructor = typeConstructor
self.gpu_flag = gpu_flag self.gpu_flag = gpu_flag
self.gpua_flag = gpua_flag self.gpua_flag = gpua_flag
...@@ -960,8 +959,7 @@ class ScanInplaceOptimizer(Optimizer): ...@@ -960,8 +959,7 @@ class ScanInplaceOptimizer(Optimizer):
inputs = ls_begin + ls + ls_end inputs = ls_begin + ls + ls_end
new_op = scan_op.Scan(op.inputs, new_op = scan_op.Scan(op.inputs,
op.outputs, op.outputs,
info, info)
typeConstructor=self.typeConstructor)
# Do not call make_node for test_value # Do not call make_node for test_value
new_outs = new_op(*inputs, **dict(return_list=True)) new_outs = new_op(*inputs, **dict(return_list=True))
...@@ -2087,8 +2085,7 @@ scan_eqopt2 = theano.gof.EquilibriumDB() ...@@ -2087,8 +2085,7 @@ scan_eqopt2 = theano.gof.EquilibriumDB()
optdb.register('scan_eqopt1', scan_eqopt1, .1, 'fast_run', 'scan') optdb.register('scan_eqopt1', scan_eqopt1, .1, 'fast_run', 'scan')
optdb.register('scan_eqopt2', scan_eqopt2, 1.6, 'fast_run', 'scan') optdb.register('scan_eqopt2', scan_eqopt2, 1.6, 'fast_run', 'scan')
optdb.register('scanOp_make_inplace', optdb.register('scanOp_make_inplace',
ScanInplaceOptimizer(typeConstructor=None, ScanInplaceOptimizer(),
gpu_flag=False),
75, 75,
'fast_run', 'fast_run',
'inplace', 'inplace',
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论