提交 918c51cb authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2195 from daemonmaker/issue1903

Updated local_alloc_elemwise to remove all allocs when possible and to a...
...@@ -462,6 +462,20 @@ import theano and print the config variable, as in: ...@@ -462,6 +462,20 @@ import theano and print the config variable, as in:
Link arguments to link against a (Fortran) level-3 blas implementation. Link arguments to link against a (Fortran) level-3 blas implementation.
.. attribute:: config.experimental.local_alloc_elemwise_assert
Bool value: either True or False
Default: True
When the local_alloc_optimization is applied, add an assert to highlight
shape errors.
Without such asserts this optimization could hide errors in the user code.
We add the assert only if we can't infer that the shapes are equivalent.
As such this optimization does not always introduce an assert in the graph.
Removing the assert could speed up execution.
.. attribute:: config.cuda.root .. attribute:: config.cuda.root
Default: $CUDA_ROOT or failing that, "/usr/local/cuda" Default: $CUDA_ROOT or failing that, "/usr/local/cuda"
......
...@@ -1408,6 +1408,7 @@ def local_useless_elemwise(node): ...@@ -1408,6 +1408,7 @@ def local_useless_elemwise(node):
return [node.inputs[0]] return [node.inputs[0]]
if node.op.scalar_op == theano.scalar.add and len(node.inputs) == 1: if node.op.scalar_op == theano.scalar.add and len(node.inputs) == 1:
return [node.inputs[0]] return [node.inputs[0]]
if (node.op.scalar_op == theano.scalar.identity if (node.op.scalar_op == theano.scalar.identity
and len(node.inputs) == 1): and len(node.inputs) == 1):
return [node.inputs[0]] return [node.inputs[0]]
...@@ -1529,14 +1530,15 @@ def local_remove_useless_assert(node): ...@@ -1529,14 +1530,15 @@ def local_remove_useless_assert(node):
return [assert_(node.inputs[0], *cond)] return [assert_(node.inputs[0], *cond)]
@register_specialize
@gof.local_optimizer([T.Elemwise]) @gof.local_optimizer([T.Elemwise])
def local_alloc_elemwise(node): def local_alloc_elemwise(node):
""" """
elemwise(alloc(x, shp), ..., y.TensorType(BROADCAST CONDITION)) elemwise(alloc(x, shp), ..., y.TensorType(BROADCAST CONDITION))
-> elemwise(x, y.TensorType(no broadcast flag)) -> elemwise(x, y.TensorType(BROADCAST CONDITION))
elemwise(dimshuffle(alloc(x, shp)),... ,y.TensorType(BROADCAST CONDITION)) elemwise(dimshuffle(alloc(x, shp)),... ,y.TensorType(BROADCAST CONDITION))
-> elemwise(x, y.TensorType(no broadcast flag)) -> elemwise(x.dimshuffle(...), y.TensorType(BROADCAST CONDITION))
BROADCAST CONDITION: the condition is that the one input that are BROADCAST CONDITION: the condition is that the one input that are
not to be optimized to have the same broadcast pattern as the not to be optimized to have the same broadcast pattern as the
...@@ -1548,99 +1550,122 @@ def local_alloc_elemwise(node): ...@@ -1548,99 +1550,122 @@ def local_alloc_elemwise(node):
""" """
if not isinstance(node.op, T.Elemwise): if not isinstance(node.op, T.Elemwise):
return False return False
if len(node.outputs) > 1: if len(node.outputs) > 1:
#This is a supposition this code makes that I'm not sure is always true. # Ensure all outputs have the same broadcast pattern
assert all([list(o.type.broadcastable) == list( # This is a supposition that I'm not sure is always true.
node.outputs[0].type.broadcastable) for o in assert all([o.type.broadcastable ==
node.outputs[0].type.broadcastable for o in
node.outputs[1:]]) node.outputs[1:]])
if not any([list(i.type.broadcastable) == list( # The broadcast pattern of the output must match the broadcast pattern of
node.outputs[0].type.broadcastable) for i in node.inputs]): # at least one of the inputs.
if not any([i.type.broadcastable ==
node.outputs[0].type.broadcastable for i in node.inputs]):
return False return False
if not any([i.owner and (isinstance(i.owner.op, T.Alloc) or \
(isinstance(i.owner.op, T.DimShuffle) and def dimshuffled_alloc(i):
i.owner.inputs[0].owner and \ return (isinstance(i.owner.op, T.DimShuffle) and
isinstance(i.owner.inputs[0].owner.op, T.Alloc))) i.owner.inputs[0].owner and \
isinstance(i.owner.inputs[0].owner.op, T.Alloc))
# At least one input must have an owner that is either a T.Alloc or a
# T.DimShuffle with an owner that is a T.Alloc -- otherwise there is
# nothing to optimize.
if not any([i.owner
and (isinstance(i.owner.op, T.Alloc) or dimshuffled_alloc(i))
for i in node.inputs]): for i in node.inputs]):
return False return False
no_broad_idx = -1
## Search for input that we can use as a baseline for the dimensions.
assert_op_idx = -1
for idx, i in enumerate(node.inputs): for idx, i in enumerate(node.inputs):
if not i.owner: if i.type.broadcastable == node.outputs[0].type.broadcastable:
if list(i.type.broadcastable) == [False, ] * i.type.ndim: # Prefer an input that is not a T.Alloc nor a T.DimShuffle of a
no_broad_idx = idx # T.Alloc so that all allocs can be optimized.
if not (i.owner
and (isinstance(i.owner.op, T.Alloc)
or dimshuffled_alloc(i))):
assert_op_idx = idx
break break
else:
continue
if not any(i.type.broadcastable) and not isinstance(i.owner.op,
T.Alloc):
no_broad_idx = idx
break
elif list(i.type.broadcastable) == list(
node.outputs[0].type.broadcastable) \
and not isinstance(i.owner.op, T.Alloc) \
and not (isinstance(i.owner.op, T.DimShuffle) and
i.owner.inputs[0].owner and \
isinstance(i.owner.inputs[0].owner.op, T.Alloc)):
no_broad_idx = idx
break
assert no_broad_idx >= 0 # It may be the case that only T.Allocs and T.DimShuffle of T.Allocs exist.
assert_op = node.inputs[no_broad_idx] if assert_op_idx < 0:
# We want to optimize as many allocs as possible. When there is more
# than one then do all but one.
# number of inputs with alloc or dimshuffle alloc
l2 = [i for i in node.inputs
if (i.owner and (isinstance(i.owner.op, T.Alloc)
or dimshuffled_alloc(i)))]
# If only 1 alloc or dimshuffle alloc, it is the one we will use for the shape
# So no alloc would be removed.
if len(l2) > 1:
# l contains inputs with alloc or dimshuffle alloc only.
# Its length will always be at least one, as we checked that before
l = [idx for idx, i in enumerate(node.inputs)
if i.type.broadcastable == node.outputs[0].type.broadcastable]
assert_op_idx = l[0] # The first one is as good as any to use.
else:
# Nothing would be optimized!
return False
assert_op = node.inputs[assert_op_idx]
cmp_op = assert_op cmp_op = assert_op
new = [] new_i = []
for i in node.inputs: for i in node.inputs:
# Remove alloc
if (i.owner and isinstance(i.owner.op, T.Alloc) if (i.owner and isinstance(i.owner.op, T.Alloc)
and i.owner.inputs[0].type != i.owner.outputs[0].type): and i.owner.inputs[0].type != i.owner.outputs[0].type):
# when i.owner.inputs[0].type == i.owner.outputs[0].type we # when i.owner.inputs[0].type == i.owner.outputs[0].type we
# will remove that alloc later # will remove that alloc later
assert i.type.ndim == cmp_op.ndim assert i.type.ndim == cmp_op.ndim
if theano.config.experimental.local_alloc_elemwise_assert: if (theano.config.experimental.local_alloc_elemwise_assert
and not node.fgraph.shape_feature.same_shape(i, cmp_op)):
assert_op = assert_(assert_op, assert_op = assert_(assert_op,
*[T.eq(i.shape[idx], cmp_op.shape[idx])\ *[T.eq(i.shape[idx], cmp_op.shape[idx])\
for idx in xrange(i.type.ndim) \ for idx in xrange(i.type.ndim) \
if not i.type.broadcastable[idx]]) if not i.type.broadcastable[idx]])
new.append(i.owner.inputs[0]) new_i.append(i.owner.inputs[0])
elif i.owner and isinstance(i.owner.op, T.DimShuffle) \
and i.owner.inputs[0].owner \ # Remove Alloc in DimShuffle
and isinstance(i.owner.inputs[0].owner.op, T.Alloc): elif i.owner and dimshuffled_alloc(i):
assert i.type.ndim == cmp_op.type.ndim assert i.type.ndim == cmp_op.type.ndim
if theano.config.experimental.local_alloc_elemwise_assert: if (theano.config.experimental.local_alloc_elemwise_assert
and not node.fgraph.shape_feature.same_shape(i, cmp_op)):
assert_op = assert_(assert_op, assert_op = assert_(assert_op,
*[T.eq(i.shape[idx], cmp_op.shape[idx]) *[T.eq(i.shape[idx], cmp_op.shape[idx])
for idx in xrange(i.type.ndim) for idx in xrange(i.type.ndim)
if not i.type.broadcastable[idx]]) if not i.type.broadcastable[idx]])
new.append(i.owner.inputs[0].owner.inputs[0]) new_i.append(i.owner.inputs[0].owner.inputs[0])
else: else:
new.append(i) new_i.append(i)
new[no_broad_idx] = assert_op new_i[assert_op_idx] = assert_op
if theano.config.experimental.local_alloc_elemwise_assert:
assert assert_op.owner.op is assert_ return node.op(*new_i, return_list=True)
return [node.op(*new)]
#TODO, global optimizer that lift the assert to the beginning of the graph. #TODO, global optimizer that lift the assert to the beginning of the graph.
#TODO, when all inputs can be optimized do all except one #TODO, optimize all inputs when possible -- currently when all inputs have
# an alloc all but one is optimized.
theano.configparser.AddConfigVar('experimental.local_alloc_elemwise', theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
"If True enable the experimental optimization local_alloc_elemwise", "DEPRECATED: If True, enable the experimental"
theano.configparser.BoolParam(False), " optimization local_alloc_elemwise."
in_c_key=False) " Generates error if not True. Use"
#This version is faster but not as safe.
" to disable.",
theano.configparser.BoolParam(
True,
is_valid=lambda x: x
),
in_c_key=False)
#This version is faster but not as safe. #This version is faster but not as safe.
theano.configparser.AddConfigVar('experimental.local_alloc_elemwise_assert', theano.configparser.AddConfigVar('experimental.local_alloc_elemwise_assert',
"If False enable the experimental optimization local_alloc_elemwise" "If False enable the experimental optimization local_alloc_elemwise"
" but WITHOUT assert into the graph!", " but WITHOUT assert into the graph!",
theano.configparser.BoolParam(True), theano.configparser.BoolParam(True),
in_c_key=False) in_c_key=False)
if theano.config.experimental.local_alloc_elemwise:
#enabled by default when the lifter of assert is done.
register_specialize(local_alloc_elemwise)
else:
#don't register them in fast_run by default to have them disabled
#by default disable them by default as we are not sure it is
#always a good idea to replace an alloc with multiple op.
compile.optdb['specialize'].register("local_alloc_elemwise",
local_alloc_elemwise)
############################ ############################
# Constant Canonicalization # Constant Canonicalization
......
...@@ -2512,6 +2512,156 @@ def test_local_subtensor_of_dot(): ...@@ -2512,6 +2512,156 @@ def test_local_subtensor_of_dot():
f = theano.function([m1, m2, idx], theano.dot(m1, m2)[1:4,:,idx:,idx], mode=mode) f = theano.function([m1, m2, idx], theano.dot(m1, m2)[1:4,:,idx:,idx], mode=mode)
assert test_equality(f(d1, d2, 1), numpy.dot(d1, d2)[1:4,:,1:,1]) assert test_equality(f(d1, d2, 1), numpy.dot(d1, d2)[1:4,:,1:,1])
class Test_local_alloc_elemwise(unittest.TestCase):
    """Tests for the local_alloc_elemwise optimization.

    The optimization removes T.Alloc (and T.DimShuffle of T.Alloc) inputs
    of an Elemwise node, and inserts an Assert on the shapes only when it
    cannot prove they are equivalent.  Each test compiles the same graph
    in 'FAST_COMPILE' (optimization off) and 'FAST_RUN' (optimization on)
    and counts the Alloc / Assert nodes left in the final graph.
    """

    dtype = config.floatX

    def setUp(self):
        # Symbolic inputs of increasing rank.
        self.vec = T.vector('vec', dtype=theano.config.floatX)
        self.mat = T.matrix('mat', dtype=theano.config.floatX)
        self.tens = T.tensor3('tens', dtype=theano.config.floatX)

        # An alloc with constant shape (shape equality cannot be inferred,
        # so an Assert is expected) and one whose shape is taken from
        # another input (no Assert needed).
        self.alloc_wo_dep = T.alloc(self.vec, 2, 2)
        self.alloc_w_dep = T.alloc(self.vec, *self.mat.shape)

    def _verify_alloc_count(self, f, count):
        # Number of T.Alloc nodes remaining in the compiled graph.
        nodes = f.maker.fgraph.toposort()
        n_alloc = sum(isinstance(node.op, T.Alloc)
                      for node in nodes if node.op is not None)
        assert n_alloc == count

    def _verify_assert_count(self, f, count):
        # Number of Assert nodes inserted by the optimization.
        nodes = f.maker.fgraph.toposort()
        n_assert = sum(isinstance(node.op, T.opt.Assert)
                       for node in nodes if node.op is not None)
        assert n_assert == count

    def test_remove_alloc_wo_dimshuffle(self):
        # Unoptimized: the alloc stays, no assert.
        f = function([self.vec, self.mat], self.alloc_wo_dep + self.mat,
                     mode='FAST_COMPILE')
        self._verify_alloc_count(f, 1)
        self._verify_assert_count(f, 0)

        # Optimized: alloc removed; shapes cannot be proven equal, so an
        # assert guards them at run time.
        f = function([self.vec, self.mat], self.alloc_wo_dep + self.mat,
                     mode='FAST_RUN')
        self._verify_alloc_count(f, 0)
        self._verify_assert_count(f, 1)

        # Unoptimized, shape taken from self.mat: alloc stays, no assert.
        f = function([self.vec, self.mat], self.alloc_w_dep + self.mat,
                     mode='FAST_COMPILE')
        self._verify_alloc_count(f, 1)
        self._verify_assert_count(f, 0)

        # Optimized: alloc removed and the shapes are provably equal, so
        # no assert is inserted.
        f = function([self.vec, self.mat], self.alloc_w_dep + self.mat,
                     mode='FAST_RUN')
        self._verify_alloc_count(f, 0)
        self._verify_assert_count(f, 0)

    def test_remove_alloc_w_dimshuffle(self):
        # Alloc with constant shape, then dimshuffled to rank 3.
        fixed_shape = (T.alloc(self.vec, 2, 2).dimshuffle(0, 1, 'x')
                       + self.tens)
        # Same, but the alloc shape comes from self.tens itself.
        dep_shape = (T.alloc(self.vec,
                             self.tens.shape[0],
                             self.tens.shape[1]).dimshuffle(0, 1, 'x')
                     + self.tens)

        # Unoptimized: alloc stays, no assert.
        f = function([self.vec, self.tens], fixed_shape,
                     mode='FAST_COMPILE')
        self._verify_alloc_count(f, 1)
        self._verify_assert_count(f, 0)

        # Optimized: alloc removed, assert inserted (constant shape not
        # provably equal to self.tens's shape).
        f = function([self.vec, self.tens], fixed_shape, mode='FAST_RUN')
        self._verify_alloc_count(f, 0)
        self._verify_assert_count(f, 1)

        # Unoptimized: alloc stays, no assert.
        f = function([self.vec, self.tens], dep_shape, mode='FAST_COMPILE')
        self._verify_alloc_count(f, 1)
        self._verify_assert_count(f, 0)

        # Optimized: alloc removed, shapes provably equal, no assert.
        f = function([self.vec, self.tens], dep_shape, mode='FAST_RUN')
        self._verify_alloc_count(f, 0)
        self._verify_assert_count(f, 0)

    def test_multi_input_single_alloc(self):
        # Two alloc inputs with compatible constant shapes: one alloc must
        # survive to carry the output shape, the other is removed.
        alloc_vec = T.alloc(self.vec, 5, 5)
        alloc_mat = T.alloc(self.mat, 5, 5, 5)

        f = function([self.vec, self.mat], alloc_vec + alloc_mat,
                     mode='FAST_COMPILE')
        self._verify_alloc_count(f, 2)
        self._verify_assert_count(f, 0)

        f = function([self.vec, self.mat], alloc_vec + alloc_mat,
                     mode='FAST_RUN')
        self._verify_alloc_count(f, 1)
        self._verify_assert_count(f, 0)

        # With a symbolic shape the equality cannot be inferred, so the
        # optimized graph also gets an assert.
        s = T.iscalar('s')
        alloc_vec = T.alloc(self.vec, s, s)
        alloc_mat = T.alloc(self.mat, 5, 5, 5)

        f = function([self.vec, self.mat, s], alloc_vec + alloc_mat,
                     mode='FAST_COMPILE')
        self._verify_alloc_count(f, 2)
        self._verify_assert_count(f, 0)

        f = function([self.vec, self.mat, s], alloc_vec + alloc_mat,
                     mode='FAST_RUN')
        self._verify_alloc_count(f, 1)
        self._verify_assert_count(f, 1)
def test_local_subtensor_of_alloc(): def test_local_subtensor_of_alloc():
# DebugMode should detect if something goes wrong. # DebugMode should detect if something goes wrong.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论