提交 e764e8c2 authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merge pull request #434 from nouiz/alloc

Make a mechanism so that Alloc and GpuAlloc are not constant folded in som...
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
Since 0.5rc2 Since 0.5rc2
* Fixed a memory leak with shared variable (we kept a pointer to the original value) * Fixed a memory leak with shared variable (we kept a pointer to the original value)
* Alloc, GpuAlloc are not always pre-computed (constant_folding optimization) at compile time if all their inputs are constant
* The keys in our cache now store the hash of constants and not the constant values themselves. This is significantly more efficient for big constant arrays. * The keys in our cache now store the hash of constants and not the constant values themselves. This is significantly more efficient for big constant arrays.
* 'theano-cache list' lists key files bigger than 1M * 'theano-cache list' lists key files bigger than 1M
* 'theano-cache list' prints an histogram of the number of keys per compiled module * 'theano-cache list' prints an histogram of the number of keys per compiled module
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
Since 0.5rc2 Since 0.5rc2
* Fixed a memory leak with shared variable (we kept a pointer to the original value) * Fixed a memory leak with shared variable (we kept a pointer to the original value)
* Alloc, GpuAlloc are not always pre-computed (constant_folding optimization) at compile time if all their inputs are constant
* The keys in our cache now store the hash of constants and not the constant values themselves. This is significantly more efficient for big constant arrays. * The keys in our cache now store the hash of constants and not the constant values themselves. This is significantly more efficient for big constant arrays.
* 'theano-cache list' lists key files bigger than 1M * 'theano-cache list' lists key files bigger than 1M
* 'theano-cache list' prints an histogram of the number of keys per compiled module * 'theano-cache list' prints an histogram of the number of keys per compiled module
......
...@@ -217,6 +217,20 @@ following methods: ...@@ -217,6 +217,20 @@ following methods:
``__str__`` method include the name of the op and the Op's parameters' ``__str__`` method include the name of the op and the Op's parameters'
values. values.
.. function:: do_constant_folding(node)
*Default:* Return True
By default, when optimizations are enabled, Apply nodes whose
inputs are all constants are removed during function compilation:
each such Apply node is replaced by a Theano constant variable, so
the Apply node is not executed at each function call. If you want
to force an op to be executed at function-call time instead, make
do_constant_folding return False.
As done in the Alloc op, you can return False only in some cases, by
analyzing the graph reachable from the node parameter.
At a bare minimum, a new Op must define ``make_node`` and ``perform``, which At a bare minimum, a new Op must define ``make_node`` and ``perform``, which
have no defaults. have no defaults.
......
...@@ -509,6 +509,16 @@ class PureOp(object): ...@@ -509,6 +509,16 @@ class PureOp(object):
""" """
raise utils.MethodNotDefined("perform", type(self), self.__class__.__name__) raise utils.MethodNotDefined("perform", type(self), self.__class__.__name__)
def do_constant_folding(self, node):
    """Decide whether this node may be replaced by a precomputed constant.

    Ops override this hook to opt out of the constant_folding
    optimization even when every input is constant -- for instance to
    choose a different memory/speed trade-off, or because a folded
    constant could not be used for inplace operations
    (see *IncSubtensor).
    """
    return True
class Op(utils.object2, PureOp, CLinkerOp): class Op(utils.object2, PureOp, CLinkerOp):
"""Convenience class to bundle `PureOp` and `CLinkerOp`""" """Convenience class to bundle `PureOp` and `CLinkerOp`"""
......
...@@ -2004,6 +2004,17 @@ class GpuAlloc(Op): ...@@ -2004,6 +2004,17 @@ class GpuAlloc(Op):
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (3,)
def do_constant_folding(self, node):
    """Refuse constant folding when a client increments our output.

    If any consumer of this GpuAlloc's output is an (Advanced)IncSubtensor
    op -- CPU or GPU variant -- keep the Alloc node alive at runtime
    instead of folding it into a constant, so that those ops can still
    work on freshly allocated memory (constants cannot be used inplace).
    """
    inc_ops = (tensor.IncSubtensor,
               tensor.AdvancedIncSubtensor1,
               GpuIncSubtensor,
               GpuAdvancedIncSubtensor1)
    for client in node.outputs[0].clients:
        if isinstance(client[0].op, inc_ops):
            return False
    return True
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
......
...@@ -728,7 +728,16 @@ def test_gpualloc_output_to_gpu(): ...@@ -728,7 +728,16 @@ def test_gpualloc_output_to_gpu():
assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape)+9,f_gpu(9)) assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape)+9,f_gpu(9))
assert numpy.allclose(f(5),f_gpu(5)) assert numpy.allclose(f(5),f_gpu(5))
import theano.tensor.tests.test_basic import theano.tensor.tests.test_basic
# GPU counterpart of theano.tensor.tests.test_basic.TestAlloc: re-runs
# the Alloc constant-folding tests on the GPU by overriding the dtype,
# compilation mode, shared constructor, and expected alloc op classes.
class TestAlloc(theano.tensor.tests.test_basic.TestAlloc):
    dtype = "float32"  # CUDA ops in this era support float32 only
    mode = mode_with_gpu
    shared = staticmethod(cuda.shared_constructor)
    # NOTE(review): the third case stays tensor.Alloc — presumably the
    # AdvancedIncSubtensor graph is not transferred to the GPU; confirm.
    allocs = [B.GpuAlloc, B.GpuAlloc, tensor.Alloc]
class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split): class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
def setUp(self): def setUp(self):
utt.seed_rng() utt.seed_rng()
......
...@@ -2616,6 +2616,16 @@ class Alloc(gof.Op): ...@@ -2616,6 +2616,16 @@ class Alloc(gof.Op):
return [None] return [None]
return self.make_node(eval_points[0], *inputs[1:]).outputs return self.make_node(eval_points[0], *inputs[1:]).outputs
def do_constant_folding(self, node):
    """Refuse constant folding when a client increments our output.

    If any consumer of this Alloc's output is an (Advanced)IncSubtensor
    op, keep the Alloc node at runtime rather than folding it into a
    constant, so those ops can still operate on freshly allocated
    memory (constants cannot be used inplace).
    """
    for client in node.outputs[0].clients:
        if isinstance(client[0].op, (IncSubtensor,
                                     AdvancedIncSubtensor1,
                                     AdvancedIncSubtensor)):
            return False
    return True
alloc = Alloc() alloc = Alloc()
pprint.assign(alloc, printing.FunctionPrinter('alloc')) pprint.assign(alloc, printing.FunctionPrinter('alloc'))
......
...@@ -3767,6 +3767,9 @@ def constant_folding(node): ...@@ -3767,6 +3767,9 @@ def constant_folding(node):
if not isinstance(input, Constant): if not isinstance(input, Constant):
return False return False
#condition: all inputs are constant #condition: all inputs are constant
if not node.op.do_constant_folding(node):
# The op asks not to be constant folded.
return False
storage_map = dict([(i, [i.data]) for i in node.inputs]) storage_map = dict([(i, [i.data]) for i in node.inputs])
compute_map = dict([(i, [True]) for i in node.inputs]) compute_map = dict([(i, [True]) for i in node.inputs])
......
...@@ -48,6 +48,11 @@ except ImportError: ...@@ -48,6 +48,11 @@ except ImportError:
mode_no_scipy = "FAST_RUN" mode_no_scipy = "FAST_RUN"
floatX = config.floatX floatX = config.floatX
# The constant-folding checks below only make sense in an optimizing
# mode, so when the suite runs under FAST_COMPILE we substitute
# FAST_RUN; otherwise we keep the user's configured default mode.
mode_opt = "FAST_RUN" if config.mode == "FAST_COMPILE" else get_default_mode()
### seed random number generator so that unittests are deterministic ### ### seed random number generator so that unittests are deterministic ###
utt.seed_rng() utt.seed_rng()
...@@ -1266,6 +1271,48 @@ Alloc13GradTester = makeBroadcastTester( ...@@ -1266,6 +1271,48 @@ Alloc13GradTester = makeBroadcastTester(
), ),
) )
class TestAlloc(unittest.TestCase):
    """Check that Alloc nodes are constant-folded in the forward graph
    but kept (not folded) in gradient graphs whose *IncSubtensor
    clients need them.

    Subclasses (e.g. the GPU variant) override these class attributes
    to re-run the same checks with a different dtype/mode/op set.
    """
    dtype = config.floatX
    mode = mode_opt
    shared = staticmethod(theano.shared)
    # One expected alloc op class per subtensor case in the loop below.
    allocs = [tensor.Alloc] * 3

    def test_alloc_constant_folding(self):
        test_params = numpy.asarray(numpy.random.randn(50 * 60),
                                    self.dtype)

        some_vector = vector('some_vector', dtype=self.dtype)
        some_matrix = some_vector.reshape((60, 50))
        variables = self.shared(numpy.ones((50,), dtype=self.dtype))
        idx = tensor.constant(numpy.arange(50))

        # Each pair is (subtensor expression, number of alloc nodes
        # expected to survive in the gradient graph).
        for alloc, (subtensor, n_alloc) in zip(self.allocs, [
                # IncSubtensor
                (some_matrix[:60], 2),
                # AdvancedIncSubtensor1
                (some_matrix[arange(60)], 2),
                # AdvancedIncSubtensor
                (some_matrix[idx, idx], 1)]):
            derp = sum(dot(subtensor, variables))

            fobj = theano.function([some_vector], derp, mode=self.mode)
            grad_derp = theano.grad(derp, some_vector)
            fgrad = theano.function([some_vector], grad_derp,
                                    mode=self.mode)

            # Forward graph: every Alloc should have been folded away.
            topo_obj = fobj.maker.env.toposort()
            assert numpy.sum([isinstance(node.op, alloc)
                              for node in topo_obj]) == 0

            # Gradient graph: the *IncSubtensor clients must prevent
            # folding, so exactly n_alloc Alloc nodes remain.
            topo_grad = fgrad.maker.env.toposort()
            assert numpy.sum([isinstance(node.op, alloc)
                              for node in topo_grad]) == n_alloc

            # Smoke-test both compiled functions on real data.
            fobj(test_params)
            fgrad(test_params)
def test_eye(): def test_eye():
def check(dtype, N, M_=None, k=0): def check(dtype, N, M_=None, k=0):
# Theano does not accept None as a tensor. # Theano does not accept None as a tensor.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论