提交 51964e4e authored 作者: abergeron's avatar abergeron

Merge pull request #1955 from nouiz/debugmode

Speed up Debugmode
......@@ -38,6 +38,7 @@ script:
- ulimit -a
- echo $PART
- theano-nose --with-timelimit -v $PART
- theano-cache list
#after_script:
......
......@@ -138,6 +138,13 @@ default values.
:return: the number of bytes taken by the object described by
``shape_info``.
.. method:: may_share_memory(a, b)
Optional. Only needed for DebugMode. Return True if the python
objects `a` and `b` could share memory. Return False
otherwise. It is used to debug when Ops didn't declare memory
aliasing between variables. Must be a static method.
For each method, the *default* is what ``Type`` defines
for you. So, if you create an instance of ``Type`` or an
instance of a subclass of ``Type``, you
......
......@@ -685,9 +685,10 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
actually_inplace_outputs = []
dmap = getattr(node.op, 'destroy_map', {})
for oo, ii in dmap.iteritems():
out_var = storage_map[node.outputs[oo]][0]
var = node.outputs[oo]
out_var = storage_map[var][0]
in_var = storage_map[node.inputs[ii[0]]][0]
if _may_share_memory(out_var, in_var):
if var.type.may_share_memory(out_var, in_var):
actually_inplace_outputs.append(node.outputs[oo])
if warn_input_not_reused and destroyed_res_list:
......@@ -702,9 +703,11 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
vmap = getattr(node.op, 'view_map', {})
for oo, ii in vmap.iteritems():
out_var = storage_map[node.outputs[oo]][0]
var = node.outputs[oo]
out_var = storage_map[var][0]
in_var = storage_map[node.inputs[ii[0]]][0]
if _may_share_memory(out_var, in_var):
may_share = var.type.may_share_memory(out_var, in_var)
if may_share:
actually_inplace_outputs.append(node.outputs[oo])
if warn_input_not_reused:
......@@ -717,7 +720,7 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
if isinstance(node.op, OutputGuard):
# This class is not in the final graph.
continue
if not _may_share_memory(out_var, in_var):
if not may_share:
_logger.warning("Optimization Warning: input idx %d marked "
"as viewed but new memory allocated by node '%s'",
ii[0], str(node))
......@@ -766,7 +769,7 @@ def _check_viewmap(node, storage_map):
for ii, inode in enumerate(node.inputs):
if _may_share_memory(outstorage, storage_map[inode][0]):
if inode.type.may_share_memory(outstorage, storage_map[inode][0]):
nodeid = id(inode)
bad_alias[nodeid] = ii
......@@ -794,26 +797,18 @@ def _check_viewmap(node, storage_map):
other_storage = storage_map[other_onode][0]
# check to see if we share memory with this other output
# this is not a problem if the node is not actually used
if _is_used_in_graph(other_onode) and \
_may_share_memory(outstorage, other_storage):
if (_is_used_in_graph(other_onode) and
other_onode.type.may_share_memory(outstorage,
other_storage)):
raise BadViewMap(node, oi, outstorage,
out_alias_idx=other_oi)
def _may_share_memory(a, b):
from theano.misc.may_share_memory import may_share_memory
return may_share_memory(a, b, False)
def _is_function_output(node):
def _is_used_in_graph(var):
"""
Returns True if the node in question is the a final output of the graph
Returns True if `var` is used by another node in the graph
"""
return node.clients == [('output', 1)]
def _is_used_in_graph(node):
return not(_is_function_output(node) or node.clients == [])
return not(var.clients == [('output', 1)] or var.clients == [])
def _check_strides_match(a, b, warn_err, op):
......@@ -1111,18 +1106,21 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
# is less relevant.
# Dimensions should be align by the innermost index, so we iterate
# from the end of shapes.
max_ndim = 0
rev_out_broadcastable = []
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
if max_ndim < r.ndim:
rev_out_broadcastable += [True] * (r.ndim - max_ndim)
max_ndim = r.ndim
assert len(rev_out_broadcastable) == max_ndim
for i, b in enumerate(r.broadcastable[::-1]):
rev_out_broadcastable[i] = rev_out_broadcastable[i] and b
out_broadcastable = rev_out_broadcastable[::-1]
if ('strided' in prealloc_modes or
'wrong_size' in prealloc_modes or
'ALL' in prealloc_modes):
max_ndim = 0
rev_out_broadcastable = []
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
if max_ndim < r.ndim:
rev_out_broadcastable += [True] * (r.ndim - max_ndim)
max_ndim = r.ndim
assert len(rev_out_broadcastable) == max_ndim
for i, b in enumerate(r.broadcastable[::-1]):
rev_out_broadcastable[i] = rev_out_broadcastable[i] and b
out_broadcastable = rev_out_broadcastable[::-1]
if 'strided' in prealloc_modes or 'ALL' in prealloc_modes:
check_ndim = config.DebugMode.check_preallocated_output_ndim
......
......@@ -677,6 +677,11 @@ class CLinker(link.Linker):
raise NotImplementedError("%s cannot produce C code" % op)
assert isinstance(behavior, basestring), (
str(node.op) + " didn't return a string for c_code")
# Prepend a comment naming the Op class, to make the generated
# C code easier to read. Note: this prevents different Ops that
# generate identical C code from being merged; presumably that
# never happens...
behavior = ("// Op class " + node.op.__class__.__name__ + "\n" +
behavior)
try:
cleanup = op.c_code_cleanup(node, name, isyms, osyms, sub)
......
......@@ -218,6 +218,7 @@ if __name__ == "__main__":
GTX Titan Black 0.05s
GTX Titan(D15U-50) 0.06s 0.06s don't work
GTX 780 0.06s
GTX 680 0.11s 0.12s 0.154s 0.218s
GTX 580 0.16s 0.16s 0.164s 0.203s
GTX 480 0.19s 0.19s 0.192s 0.237s 0.27s
......
......@@ -15,12 +15,14 @@ try:
def _is_sparse(a):
return scipy.sparse.issparse(a)
except ImportError:
#scipy not imported, their can be only ndarray and cudandarray
# scipy not imported, their can be only ndarray and cudandarray
def _is_sparse(a):
return False
from theano.sandbox import cuda
if cuda.cuda_available:
from theano.sandbox.cuda.type import CudaNdarrayType
def _is_cuda(a):
return isinstance(a, cuda.CudaNdarray)
else:
......@@ -40,13 +42,19 @@ else:
def may_share_memory(a, b, raise_other_type=True):
a_ndarray = isinstance(a, numpy.ndarray)
b_ndarray = isinstance(b, numpy.ndarray)
a_sparse = _is_sparse(a)
b_sparse = _is_sparse(b)
if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b)
a_cuda = _is_cuda(a)
b_cuda = _is_cuda(b)
if a_cuda and b_cuda:
return CudaNdarrayType.may_share_memory(a, b)
a_gpua = _is_gpua(a)
b_gpua = _is_gpua(b)
if a_gpua and b_gpua:
return gpuarray.pygpu.gpuarray.may_share_memory(a, b)
a_sparse = _is_sparse(a)
b_sparse = _is_sparse(b)
if (not(a_ndarray or a_sparse or a_cuda or a_gpua) or
not(b_ndarray or b_sparse or b_cuda or b_gpua)):
if raise_other_type:
......@@ -54,13 +62,6 @@ def may_share_memory(a, b, raise_other_type=True):
" and scipy.sparse, CudaNdarray or GpuArray type")
return False
if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b)
if a_cuda and b_cuda:
from theano.sandbox.cuda.type import CudaNdarrayType
return CudaNdarrayType.may_share_memory(a, b)
if a_gpua and b_gpua:
return gpuarray.pygpu.gpuarray.may_share_memory(a, b)
if a_cuda or b_cuda or a_gpua or b_gpua:
return False
return SparseType.may_share_memory(a, b)
# This is work in progress
from theano import Op, Apply
from theano import Op, Apply, tensor
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp
......@@ -7,7 +7,8 @@ from theano.sandbox.neighbours import Images2Neibs
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
from theano.sandbox.cuda.basic_ops import (
as_cuda_ndarray_variable, host_from_gpu, gpu_from_host)
from theano.sandbox.cuda.opt import register_opt as register_gpu_opt
......@@ -21,13 +22,16 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
self.mode = mode
def make_node(self, ten4, neib_shape, neib_step):
assert ten4.dtype == 'float32'
if not isinstance(ten4.type, CudaNdarrayType):
raise TypeError('ten4 must be cudandarray', ten4)
ten4 = as_cuda_ndarray_variable(ten4)
neib_shape = tensor.as_tensor_variable(neib_shape)
neib_step = tensor.as_tensor_variable(neib_step)
assert ten4.ndim == 4
assert ten4.dtype == 'float32'
assert neib_shape.ndim == 1
assert neib_step.ndim == 1
assert "int" in neib_shape.dtype
assert "int" in neib_step.dtype
return Apply(self, [ten4, neib_shape, neib_step],
[CudaNdarrayType(broadcastable=(False, False),
......
......@@ -29,6 +29,9 @@ class GpuImages2Neibs(Images2Neibs, Op):
self.mode = mode
def make_node(self, ten4, neib_shape, neib_step):
ten4 = as_gpuarray_variable(ten4)
neib_shape = T.as_tensor_variable(neib_shape)
neib_step = T.as_tensor_variable(neib_step)
assert ten4.ndim == 4
assert neib_shape.ndim == 1
......@@ -36,10 +39,6 @@ class GpuImages2Neibs(Images2Neibs, Op):
assert "int" in neib_shape.dtype
assert "int" in neib_step.dtype
ten4 = as_gpuarray_variable(ten4)
neib_shape = T.as_tensor_variable(neib_shape)
neib_step = T.as_tensor_variable(neib_step)
return Apply(self, [ten4, neib_shape, neib_step],
[GpuArrayType(broadcastable=(False, False),
dtype=ten4.type.dtype)()])
......
......@@ -145,6 +145,13 @@ class Scalar(Type):
self.dtype = dtype
self.dtype_specs() # error checking
@staticmethod
def may_share_memory(a, b):
# This class represents a basic C type, represented in Python
# by a numpy scalar. Those are read-only, so from Python they
# can never share memory.
return False
def filter(self, data, strict=False, allow_downcast=None):
py_type = self.dtype_specs()[0]
if strict and not isinstance(data, py_type):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论