提交 4ed010d8 作者: Arnaud Bergeron

This should get scan working with non-float32 inputs/outputs in gpuarray.

It should also not break the old cuda backend.
上级 484ee1e0
...@@ -1535,6 +1535,11 @@ def local_gpu_extract_diagonal(node): ...@@ -1535,6 +1535,11 @@ def local_gpu_extract_diagonal(node):
gpu_from_host(diag_node.inputs[0]))] gpu_from_host(diag_node.inputs[0]))]
return False return False
def typeConstructor(broadcastable, dtype):
if dtype == 'float32':
return CudaNdarrayType(broadcastable=broadcastable)
else:
return TensorType(broadcastable=broadcastable, dtype=dtype)
@register_opt('scan') @register_opt('scan')
@local_optimizer([gpu_from_host, scan_op.Scan]) @local_optimizer([gpu_from_host, scan_op.Scan])
...@@ -1593,8 +1598,6 @@ def gpuScanOptimization(node): ...@@ -1593,8 +1598,6 @@ def gpuScanOptimization(node):
_cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, []) _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
info['gpu_hash'] = hash(_cmodule_key) info['gpu_hash'] = hash(_cmodule_key)
typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
broadcastable=broadcastable)
nw_op = scan_op.Scan(scan_ins, nw_op = scan_op.Scan(scan_ins,
scan_outs, scan_outs,
info, info,
...@@ -1642,10 +1645,6 @@ def gpuScanOptimization(node): ...@@ -1642,10 +1645,6 @@ def gpuScanOptimization(node):
_cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, []) _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
info['gpu_hash'] = hash(_cmodule_key) info['gpu_hash'] = hash(_cmodule_key)
def typeConstructor(broadcastable, dtype):
assert dtype == 'float32'
return CudaNdarrayType(broadcastable=broadcastable)
_outputs = scan_op.Scan( _outputs = scan_op.Scan(
scan_ins, scan_ins,
scan_outs, scan_outs,
...@@ -1662,7 +1661,7 @@ def gpuScanOptimization(node): ...@@ -1662,7 +1661,7 @@ def gpuScanOptimization(node):
optdb.register('gpu_scanOp_make_inplace', optdb.register('gpu_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=CudaNdarrayType, scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
gpu_flag=True), gpu_flag=True),
75, 75,
'gpu', 'gpu',
......
...@@ -518,7 +518,6 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -518,7 +518,6 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@op_lifter([scan_op.Scan]) @op_lifter([scan_op.Scan])
def local_scan_to_gpua(node): def local_scan_to_gpua(node):
info = copy.deepcopy(node.op.info) info = copy.deepcopy(node.op.info)
info['gpu'] = True
info['gpua'] = True info['gpua'] = True
nw_ins = [node.inputs[0]] nw_ins = [node.inputs[0]]
e = (1 + e = (1 +
...@@ -540,8 +539,8 @@ def local_scan_to_gpua(node): ...@@ -540,8 +539,8 @@ def local_scan_to_gpua(node):
[safe_to_cpu(x) for x in scan_ins])) [safe_to_cpu(x) for x in scan_ins]))
# We need to construct the hash here, because scan # We need to construct the hash here, because scan
# __init__ does not know about cuda ndarray and can not # __init__ does not know about the gpu and can not
# handle graphs with inputs being Cuda Ndarrays # handle graphs with inputs being on the gpu
tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, scan_outs) tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, scan_outs)
local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False) local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False)
_cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, []) _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
......
...@@ -56,22 +56,21 @@ class Scan(PureOp): ...@@ -56,22 +56,21 @@ class Scan(PureOp):
the scan op (like number of different types of the scan op (like number of different types of
arguments, name, mode, if it should run on GPU or arguments, name, mode, if it should run on GPU or
not, etc.) not, etc.)
:param typeConstructor: function that constructs a Theano TensorType :param typeConstructor: function that constructs an equivalent
able to represent a float32 ndarray. to Theano TensorType
Note: ``typeConstructor`` had been added to refactor how Theano
deals with the GPU. If it runs on the GPU, scan needs to construct Note: ``typeConstructor`` had been added to refactor how
certain outputs (those that reside in the GPU memory) as CudaNdarray. Theano deals with the GPU. If it runs on the GPU, scan needs
However we can not import cuda in this file (as it is in sandbox, to construct certain outputs (those that reside in the GPU
and not available on each machine) so the workaround is that the GPU memory) as the GPU-specific type. However we can not import
optimization (which is aware of cuda types) passes to the gpu code in this file (as it is in sandbox, and not available
constructor of this class a function that is able to construct on each machine) so the workaround is that the GPU
CudaNdarray. This way the class Scan does not need to be aware of optimization passes to the constructor of this class a
CudaNdarray, it just constructs any float32 tensor using this function that is able to construct a GPU type. This way the
function (which by default constructs normal tensors). Note that the class Scan does not need to be aware of the details for the
second assumption in this code is that any float32 output or input GPU, it just constructs any tensor using this function (which
will be moved on the GPU if the optimization gets applied (following by default constructs normal tensors).
Theano's philosophy of moving as much as possible on gpu).
""" """
if 'gpua' not in info: if 'gpua' not in info:
info['gpua'] = False info['gpua'] = False
...@@ -97,23 +96,10 @@ class Scan(PureOp): ...@@ -97,23 +96,10 @@ class Scan(PureOp):
# Note that for mit_mot there are several output slices per # Note that for mit_mot there are several output slices per
# output sequence # output sequence
o = outputs[idx] o = outputs[idx]
# Scan assumes that only variables of dtype float32 might need a self.output_types.append(
# special constructor (i.e. CudaNdarray constructor) when the typeConstructor(
# code is running on GPU, as it is the only type supported by broadcastable=(False,) + o.type.broadcastable,
# Theano yet. Therefore only for dtype float32 we use the passed dtype=o.type.dtype))
# type constructor ``typeConstructor``. For anything else we
# know that even if we run it on the GPU we still construct
# normal Theano tensors.
if o.type.dtype in ['float32']:
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
else:
self.output_types.append(
tensorConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
idx += len(self.mit_mot_out_slices[jdx]) idx += len(self.mit_mot_out_slices[jdx])
jdx += 1 jdx += 1
...@@ -122,23 +108,11 @@ class Scan(PureOp): ...@@ -122,23 +108,11 @@ class Scan(PureOp):
end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot
for o in outputs[idx:end]: for o in outputs[idx:end]:
# Scan assumes that only variables of dtype float32 might need a self.output_types.append(
# special constructor (i.e. CudaNdarray constructor) when the typeConstructor(
# code is running on GPU, as it is the only type supported by broadcastable=(False,) + o.type.broadcastable,
# Theano yet. Therefore only for dtype float32 we use the passed dtype=o.type.dtype))
# type constructor ``typeConstructor``. For anything else we
# know that even if we run it on the GPU we still construct
# normal Theano tensors.
if o.type.dtype in ['float32']:
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
else:
self.output_types.append(
tensorConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
# shared outputs + possibly the ending condition # shared outputs + possibly the ending condition
for o in outputs[end:]: for o in outputs[end:]:
self.output_types.append(o.type) self.output_types.append(o.type)
...@@ -184,14 +158,14 @@ class Scan(PureOp): ...@@ -184,14 +158,14 @@ class Scan(PureOp):
self.n_shared_outs) self.n_shared_outs)
self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
self.n_tap_outs = self.n_mit_mot + self.n_mit_sot self.n_tap_outs = self.n_mit_mot + self.n_mit_sot
if not self.info['gpu']: if self.info['gpu'] or self.info['gpua']:
self._hash_inner_graph = self.info['gpu_hash']
else:
tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs, tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs,
self.outputs) self.outputs)
local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False) local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False)
self._cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, []) self._cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
self._hash_inner_graph = hash(self._cmodule_key) self._hash_inner_graph = hash(self._cmodule_key)
else:
self._hash_inner_graph = self.info['gpu_hash']
def make_node(self, *inputs): def make_node(self, *inputs):
""" """
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论