提交 c190056b authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2342 from abergeron/multi_fixes

Multi fixes (pieces of the multi-gpu code)
......@@ -43,9 +43,6 @@ There are less methods to define for an Op than for a Type:
that a python exception is set) if your C code needs to
raise an exception.
``sub['struct_id']``
The integer id passed to the various _struct methods.
.. method:: c_code_cleanup(node, name, input_names, output_names, sub)
......@@ -99,15 +96,12 @@ There are less methods to define for an Op than for a Type:
module is initialized, before anything else is executed and is
specialized for a particular apply of an :ref:`op`.
.. method:: c_init_code_struct(node, struct_id, sub)
.. method:: c_init_code_struct(node, name, sub)
Allows you to specify code that will be inserted in the struct
constructor of the Op. This is for code which should be
executed once per thunk (Apply node, more or less).
`struct_id` is an integer guaranteed to be unique inside the
struct.
`sub` is a dictionary of extra parameters to the
c_code_init_code_struct method. It contains the following
values:
......@@ -131,31 +125,25 @@ There are less methods to define for an Op than for a Type:
if the code is the same for each apply of an op. It will be
inserted at global scope.
.. method:: c_support_code_struct(node, struct_id)
.. method:: c_support_code_struct(node, name)
Allows you to specify helper functions or variables that will
be specific to one particular thunk. These are inserted at
struct scope.
`struct_id` is an integer guaranteed to be unique inside the
struct.
:note:
You cannot specify kernels in the code returned by this since
that isn't supported by CUDA. You should place your kernels
in :meth:`c_support_code()` or :meth:`c_support_code_apply()`
and call them from this code.
You cannot specify CUDA kernels in the code returned by this
since that isn't supported by CUDA. You should place your
kernels in :meth:`c_support_code()` or
:meth:`c_support_code_apply()` and call them from this code.
.. method:: c_cleanup_code_struct(node, struct_id)
.. method:: c_cleanup_code_struct(node, name)
Allows you to specify code that will be inserted in the struct
destructor of the Op. This is for cleaning up allocations and
stuff like this when the thunk is released (when you "free" a
compiled function using this op).
`struct_id` is an integer guaranteed to be unique inside the
struct.
.. method:: infer_shape(node, (i0_shapes,i1_shapes,...))
Allow optimizations to lift the Shape op over this op. An
......
......@@ -138,6 +138,16 @@ default values.
:return: the number of bytes taken by the object described by
``shape_info``.
.. method:: clone(dtype=None, broadcastable=None)
Optional, for TensorType-alikes.
Return a copy of the type with a possibly changed value for
dtype and broadcastable (if they aren't `None`).
:param dtype: New dtype for the copy.
:param broadcastable: New broadcastable tuple for the copy.
.. method:: may_share_memory(a, b)
Optional to run, but mandatory for DebugMode. Return True if the Python
......
......@@ -570,7 +570,7 @@ class Rebroadcast(gof.Op):
def __hash__(self):
items = sorted(self.axis.iteritems()) # no ambiguity because each item key is unique
return hash(type(self)) ^ hash(tuple(items))
return hash((type(self), tuple(items)))
def __str__(self):
if len(self.axis) == 0:
......@@ -586,8 +586,7 @@ class Rebroadcast(gof.Op):
def make_node(self, x):
if self.axis.keys() and (x.ndim <= numpy.max(self.axis.keys())):
raise ValueError('Trying to rebroadcast non-existent dimension')
t = x.type.__class__(dtype=x.type.dtype,
broadcastable=[self.axis.get(i, b)
t = x.type.clone(broadcastable=[self.axis.get(i, b)
for i, b in enumerate(
x.type.broadcastable)])
return gof.Apply(self, [x], [t()])
......
......@@ -658,7 +658,6 @@ class CLinker(link.Linker):
# Make the CodeBlock for c_code
sub['id'] = id
sub['struct_id'] = id + 1
sub['fail'] = failure_code(sub)
sub_struct = dict()
......@@ -692,7 +691,7 @@ class CLinker(link.Linker):
" didn't return a string for c_init_code_apply")
try:
struct_init = op.c_init_code_struct(node, id + 1, sub_struct)
struct_init = op.c_init_code_struct(node, name, sub_struct)
assert isinstance(struct_init, basestring), (
str(node.op) +
" didn't return a string for c_init_code_struct")
......@@ -700,7 +699,7 @@ class CLinker(link.Linker):
pass
try:
struct_support = op.c_support_code_struct(node, id + 1)
struct_support = op.c_support_code_struct(node, name)
assert isinstance(struct_support, basestring), (
str(node.op) +
" didn't return a string for c_support_code_struct")
......@@ -708,7 +707,7 @@ class CLinker(link.Linker):
pass
try:
struct_cleanup = op.c_cleanup_code_struct(node, id + 1)
struct_cleanup = op.c_cleanup_code_struct(node, name)
assert isinstance(struct_cleanup, basestring), (
str(node.op) +
" didn't return a string for c_cleanup_code_struct")
......
......@@ -184,7 +184,8 @@ class Apply(Node):
:note:
tags are copied from self to the returned instance.
"""
cp = self.__class__(self.op, self.inputs, [output.clone() for output in self.outputs])
cp = self.__class__(self.op, self.inputs,
[output.clone() for output in self.outputs])
cp.tag = copy(self.tag)
return cp
......
......@@ -322,17 +322,15 @@ class CLinkerOp(CLinkerObject):
raise utils.MethodNotDefined("c_init_code_apply", type(self),
self.__class__.__name__)
def c_init_code_struct(self, node, struct_id, sub):
def c_init_code_struct(self, node, name, sub):
"""
Optional: return a code string specific to the apply
to be inserted in the struct initialization code.
:param node: an Apply instance in the graph being compiled
:param struct_id: a number that serves to uniquely identify
this code. The c_code will receive another
sub parameter named struct_id that will
contain this name.
:param name: a unique name to distinguish your variables from
those of other nodes.
:param sub: a dictionary of values to substitute in the code.
Most notably it contains a 'fail' entry that you
......@@ -345,17 +343,15 @@ class CLinkerOp(CLinkerObject):
raise utils.MethodNotDefined("c_init_code_apply", type(self),
self.__class__.__name__)
def c_support_code_struct(self, node, struct_id):
def c_support_code_struct(self, node, name):
"""Optional: Return utility code for use by an `Op` that will be
inserted at struct scope, that can be specialized for the
support of a particular `Apply` node.
:param node: an Apply instance in the graph being compiled
:param struct_id: a number that serves to uniquely identify
this code. The c_code will receive another
sub parameter named struct_id that will
contain this name.
:param name: a unique name to distinguish your variables from
those of other nodes.
:Exceptions:
- `MethodNotDefined`: Subclass does not implement this method
......@@ -364,17 +360,15 @@ class CLinkerOp(CLinkerObject):
raise utils.MethodNotDefined("c_support_code_struct",
type(self), self.__class__.__name__)
def c_cleanup_code_struct(self, node, struct_id):
def c_cleanup_code_struct(self, node, name):
"""
Optional: return a code string specific to the apply to be
inserted in the struct cleanup code.
:param node: an Apply instance in the graph being compiled
:param struct_id: a number that serves to uniquely identify
this code. The c_code will receive another
sub parameter named struct_id that will
contain this name.
:param name: a unique name to distinguish your variables from
those of other nodes.
:Exceptions:
- `MethodNotDefined`: the subclass does not override this method
......
......@@ -94,20 +94,20 @@ class StructOp(Op):
def make_node(self, i):
return Apply(self, [i], [scalar.uint64()])
def c_support_code_struct(self, node, struct_id):
return "npy_uint64 counter%d;" % (struct_id,)
def c_support_code_struct(self, node, name):
return "npy_uint64 counter%s;" % (name,)
def c_init_code_struct(self, node, struct_id, sub):
return "counter%d = 0;" % (struct_id,)
def c_init_code_struct(self, node, name, sub):
return "counter%s = 0;" % (name,)
def c_code(self, node, name, input_names, outputs_names, sub):
return """
%(out)s = counter%(sid)s;
counter%(sid)s++;
""" % dict(out=outputs_names[0], sid=sub['struct_id'])
%(out)s = counter%(name)s;
counter%(name)s++;
""" % dict(out=outputs_names[0], name=name)
def c_code_cache_version(self):
return (0,)
return (1,)
class TestOp:
......
......@@ -2000,12 +2000,6 @@ def local_gpu_extract_diagonal(node):
gpu_from_host(diag_node.inputs[0]))]
return False
def typeConstructor(broadcastable, dtype):
if dtype == 'float32':
return CudaNdarrayType(broadcastable=broadcastable)
else:
return tensor.TensorType(broadcastable=broadcastable, dtype=dtype)
@register_opt('scan')
@local_optimizer([gpu_from_host, scan_op.Scan])
def gpuScanOptimization(node):
......@@ -2065,9 +2059,7 @@ def gpuScanOptimization(node):
nw_op = scan_op.Scan(scan_ins,
scan_outs,
info,
typeConstructor=typeConstructor).make_node(
*nw_ins)
info).make_node(*nw_ins)
_outputs = nw_op.outputs
return _outputs
......@@ -2113,8 +2105,7 @@ def gpuScanOptimization(node):
_outputs = scan_op.Scan(
scan_ins,
scan_outs,
info,
typeConstructor=typeConstructor).make_node(*nw_ins).outputs
info).make_node(*nw_ins).outputs
outputs = []
for x, y in zip(_outputs, node.outputs):
if isinstance(y.type, CudaNdarrayType):
......@@ -2126,8 +2117,7 @@ def gpuScanOptimization(node):
optdb.register('gpu_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
gpu_flag=True),
scan_opt.ScanInplaceOptimizer(gpu_flag=True),
75,
'gpu',
'fast_run',
......
......@@ -71,6 +71,11 @@ class CudaNdarrayType(Type):
self.name = name
self.dtype_specs() # error checking is done there
def clone(self, dtype=None, broadcastable=None):
if broadcastable is None:
broadcastable = self.broadcastable
return self.__class__(broadcastable, name=self.name, dtype=dtype)
def filter(self, data, strict=False, allow_downcast=None):
return self.filter_inplace(data, None, strict=strict,
allow_downcast=allow_downcast)
......
......@@ -607,7 +607,6 @@ class GpuAlloc(HideC, Alloc):
def __init__(self, memset_0=False):
"""memset_0 is only an optimized version. True, it mean the
value is always 0, so the c code call memset as it is faster.
"""
self.memset_0 = memset_0
......
......@@ -716,13 +716,11 @@ def local_scan_to_gpua(node):
_cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
info['gpu_hash'] = hash(_cmodule_key)
nw_op = scan_op.Scan(scan_ins, scan_outs, info,
typeConstructor=GpuArrayType).make_node(*nw_ins)
nw_op = scan_op.Scan(scan_ins, scan_outs, info).make_node(*nw_ins)
return nw_op.outputs
optdb.register('gpua_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=GpuArrayType,
gpua_flag=True),
scan_opt.ScanInplaceOptimizer(gpua_flag=True),
75,
'gpua',
'fast_run',
......
......@@ -15,6 +15,7 @@ from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu
class T_Scan(TestCase):
def setUp(self):
utt.seed_rng()
super(T_Scan, self).setUp()
def test_one_sequence_one_output_weights_gpu1(self):
def f_rnn(u_t, x_tm1, W_in, W):
......
......@@ -28,6 +28,14 @@ class GpuArrayType(Type):
raise TypeError("Unsupported dtype for %s: %s" %
(self.__class__.__name__, self.dtype))
def clone(self, dtype=None, broadcastable=None):
if dtype is None:
dtype = self.dtype
if broadcastable is None:
broadcastable = self.broadcastable
return self.__class__(dtype=dtype, broadcastable=broadcastable,
name=self.name)
def __str__(self):
return "GpuArrayType(%s, %s)" % (self.dtype, self.broadcastable)
......
......@@ -47,7 +47,6 @@ class Scan(PureOp):
inputs,
outputs,
info,
typeConstructor=None,
):
"""
:param inputs: inputs of the inner function of scan
......@@ -56,21 +55,6 @@ class Scan(PureOp):
the scan op (like number of different types of
arguments, name, mode, if it should run on GPU or
not, etc.)
:param typeConstructor: function that constructs an equivalent
to Theano TensorType
Note: ``typeConstructor`` had been added to refactor how
Theano deals with the GPU. If it runs on the GPU, scan needs
to construct certain outputs (those who reside in the GPU
memory) as the GPU-specific type. However we can not import
gpu code in this file (as it is in sandbox, and not available
on each machine) so the workaround is that the GPU
optimization passes to the constructor of this class a
function that is able to construct a GPU type. This way the
class Scan does not need to be aware of the details for the
GPU, it just constructs any tensor using this function (which
by default constructs normal tensors).
"""
if 'gpua' not in info:
info['gpua'] = False
......@@ -87,19 +71,13 @@ class Scan(PureOp):
self.output_types = []
idx = 0
jdx = 0
tensorConstructor = lambda broadcastable, dtype: TensorType(
broadcastable=broadcastable, dtype=dtype)
if typeConstructor is None:
typeConstructor = tensorConstructor
while idx < self.n_mit_mot_outs:
# Note that for mit_mot there are several output slices per
# output sequence
o = outputs[idx]
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
o.type.clone(broadcastable=(False,) + o.type.broadcastable))
idx += len(self.mit_mot_out_slices[jdx])
jdx += 1
......@@ -109,9 +87,7 @@ class Scan(PureOp):
for o in outputs[idx:end]:
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
o.type.clone(broadcastable=(False,) + o.type.broadcastable))
# shared outputs + possibly the ending condition
for o in outputs[end:]:
......@@ -232,10 +208,9 @@ class Scan(PureOp):
if rval.ndim == as_var.ndim:
rval = as_var.type.filter_variable(rval)
else:
tmp = as_var.type.__class__(
tmp = as_var.type.clone(
broadcastable=tuple(var.broadcastable[:1])+\
tuple(as_var.broadcastable),
dtype=as_var.dtype)
tuple(as_var.broadcastable))
rval = tmp.filter_variable(rval)
return rval
......
......@@ -916,9 +916,8 @@ class PushOutScanOutput(gof.Optimizer):
class ScanInplaceOptimizer(Optimizer):
"""Graph optimizer for Scan(makes it run inplace)"""
def __init__(self, typeConstructor=None, gpu_flag=False, gpua_flag=False):
def __init__(self, gpu_flag=False, gpua_flag=False):
Optimizer.__init__(self)
self.typeConstructor = typeConstructor
self.gpu_flag = gpu_flag
self.gpua_flag = gpua_flag
......@@ -960,8 +959,7 @@ class ScanInplaceOptimizer(Optimizer):
inputs = ls_begin + ls + ls_end
new_op = scan_op.Scan(op.inputs,
op.outputs,
info,
typeConstructor=self.typeConstructor)
info)
# Do not call make_node for test_value
new_outs = new_op(*inputs, **dict(return_list=True))
......@@ -2086,8 +2084,7 @@ scan_eqopt2 = theano.gof.EquilibriumDB()
optdb.register('scan_eqopt1', scan_eqopt1, .1, 'fast_run', 'scan')
optdb.register('scan_eqopt2', scan_eqopt2, 1.6, 'fast_run', 'scan')
optdb.register('scanOp_make_inplace',
ScanInplaceOptimizer(typeConstructor=None,
gpu_flag=False),
ScanInplaceOptimizer(),
75,
'fast_run',
'inplace',
......
......@@ -2403,17 +2403,7 @@ class Alloc(gof.Op):
This Op is used to replace fill() during optimizations because after shapes
are lifted, the first argument to fill can often be pruned from the graph.
"""
def __init__(self):
pass
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
__props__ = ()
def make_node(self, value, *shape):
v = as_tensor_variable(value)
......
......@@ -52,6 +52,18 @@ class TensorType(Type):
" AdvancedSubtensor1 sparse_grad. Now use"
" theano.sparse_grad(a_tensor[an_int_vector]).")
def clone(self, dtype=None, broadcastable=None):
"""
Return a copy of the type optionally with a new dtype or
broadcastable pattern.
"""
if dtype is None:
dtype = self.dtype
if broadcastable is None:
broadcastable = self.broadcastable
return self.__class__(dtype, broadcastable, name=self.name,
sparse_grad=self.sparse_grad)
def filter(self, data, strict=False, allow_downcast=None):
"""Convert `data` to something which can be associated to a
`TensorVariable`.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论