提交 19c0dbcb authored 作者: abergeron's avatar abergeron

Merge pull request #1782 from nouiz/alloc_cst_fold

Alloc cst fold
......@@ -3198,13 +3198,27 @@ class GpuAlloc(GpuOp):
# If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold.
return False
elif (not isinstance(client[0], basestring)
and isinstance(client[0].op, (
tensor.IncSubtensor,
tensor.AdvancedIncSubtensor1,
GpuIncSubtensor,
GpuAdvancedIncSubtensor1
))):
elif (#The following ops work inplace on their input id 0.
client[1] == 0 and
isinstance(client[0].op, (
#Ops that will work inplace on the Alloc. So if they
#get constant_folded, they would copy the
#constant and this is less efficient.
#Not doing the constant folding could also lower
#the peak memory usage, as the "constant" won't
#always exist.
#theano.tensor.subtensor.AdvancedIncSubtensor,
GpuIncSubtensor,
GpuAdvancedIncSubtensor1,
theano.sandbox.cuda.blas.GpuGemm,
theano.sandbox.cuda.blas.GpuGemv,
theano.sandbox.cuda.blas.GpuGer,
))):
return False
#If the client is a transfer, we don't want to fold. We
#let the moving opt finish before deciding what to do.
elif isinstance(client[0].op, HostFromGpu):
return False
return True
......
......@@ -2625,13 +2625,31 @@ class Alloc(gof.Op):
# If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold.
return False
elif (not isinstance(client[0], basestring)
and isinstance(client[0].op, (
elif (#The following ops work inplace on their input id 0.
client[1] == 0 and
isinstance(client[0].op, (
#Ops that will work inplace on the Alloc. So if they
#get constant_folded, they would copy the
#constant and this is less efficient.
#Not doing the constant folding could also lower
#the peak memory usage, as the "constant" won't
#always exist.
theano.tensor.subtensor.IncSubtensor,
theano.tensor.subtensor.AdvancedIncSubtensor1,
theano.tensor.subtensor.AdvancedIncSubtensor,
theano.tensor.blas.Gemv,
theano.tensor.blas_c.CGemv,
theano.tensor.blas.Ger,
theano.tensor.blas_c.CGer,
theano.tensor.blas_scipy.ScipyGer
))):
return False
#If the client is a transfer to the GPU, we don't want to
#fold. We let the Alloc be moved to the GPU, then we
#let the GPU algo decide if it needs to fold it or not.
elif client[0].op.__class__.__name__.lower().startswith("gpu"):
return False
return True
......
......@@ -1928,7 +1928,8 @@ class TestAlloc(unittest.TestCase):
#AdvancedIncSubtensor1
(some_matrix[arange(60)], 2),
#AdvancedIncSubtensor
(some_matrix[idx, idx], 1)]):
(some_matrix[idx, idx], 1)
]):
derp = sum(dot(subtensor, variables))
fobj = theano.function([some_vector], derp, mode=self.mode)
......@@ -1936,14 +1937,18 @@ class TestAlloc(unittest.TestCase):
fgrad = theano.function([some_vector], grad_derp,
mode=self.mode)
topo_obj = fobj.maker.fgraph.toposort()
#<= is needed as the GPU currently doesn't implement
#AdvancedIncSubtensor. Once it does, this can be
#replaced with ==.
assert numpy.sum([isinstance(node.op, alloc)
for node in topo_obj]) == 0
for node in topo_obj]) <= 1
topo_grad = fgrad.maker.fgraph.toposort()
#print subtensor
#theano.printing.debugprint(fgrad)
assert numpy.sum([isinstance(node.op, alloc)
for node in topo_grad]) == n_alloc
for node in topo_grad]) == n_alloc, (
alloc, subtensor, n_alloc, topo_grad)
fobj(test_params)
fgrad(test_params)
......
......@@ -1091,7 +1091,7 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
# Assert that the dot was optimized somehow
self.assertFunctionContains0(f, T.dot)
self.assertFunctionContains1(f, Gemv(False))
self.assertFunctionContains1(f, Gemv(True))
# Assert they produce the same output
assert numpy.allclose(f(), numpy.dot(v.get_value(), w.get_value()))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论