提交 9161c88e authored 作者: Frederic's avatar Frederic

Disable constant folding of [Gpu]Alloc in more cases.

Since we do constant folding earlier, this makes a GpuAlloc test fail. This could also speed up some code, as this allows more ops to work inplace.
上级 e93c61d1
......@@ -3198,13 +3198,28 @@ class GpuAlloc(GpuOp):
# If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold.
return False
elif (not isinstance(client[0], basestring)
and isinstance(client[0].op, (
tensor.IncSubtensor,
tensor.AdvancedIncSubtensor1,
GpuIncSubtensor,
GpuAdvancedIncSubtensor1
))):
elif (not isinstance(client[0], basestring) and
#It is input index 0 of the following op
client[1] == 0 and
isinstance(client[0].op, (
#Ops that will work inplace on the Alloc. So if they
#get constant_folded, they would copy the
#constant and this is less efficient.
#Not doing the constant folding could also lower
#the peak memory usage, as the "constant" won't
#always exist.
#theano.tensor.subtensor.AdvancedIncSubtensor,
GpuIncSubtensor,
GpuAdvancedIncSubtensor1,
theano.sandbox.cuda.blas.GpuGemm,
theano.sandbox.cuda.blas.GpuGemv,
theano.sandbox.cuda.blas.GpuGer,
))):
return False
#If the client is a transfer, we don't want to fold. We
#let the moving opt finish before deciding what to do.
elif isinstance(client[0].op, HostFromGpu):
return False
return True
......
......@@ -2625,13 +2625,32 @@ class Alloc(gof.Op):
# If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold.
return False
elif (not isinstance(client[0], basestring)
and isinstance(client[0].op, (
elif (not isinstance(client[0], basestring) and
#It is input index 0 of the following op
client[1] == 0 and
isinstance(client[0].op, (
#Ops that will work inplace on the Alloc. So if they
#get constant_folded, they would copy the
#constant and this is less efficient.
#Not doing the constant folding could also lower
#the peak memory usage, as the "constant" won't
#always exist.
theano.tensor.subtensor.IncSubtensor,
theano.tensor.subtensor.AdvancedIncSubtensor1,
theano.tensor.subtensor.AdvancedIncSubtensor,
theano.tensor.blas.Gemv,
theano.tensor.blas_c.CGemv,
theano.tensor.blas.Ger,
theano.tensor.blas_c.CGer,
theano.tensor.blas_scipy.ScipyGer
))):
return False
#If the client is a transfer to the GPU, we don't want to
#fold. We let the Alloc be moved to the GPU, then we
#let the GPU algo decide if it needs to fold it or not.
elif client[0].op.__class__.__name__.lower().startswith("gpu"):
return False
return True
......
......@@ -1928,7 +1928,8 @@ class TestAlloc(unittest.TestCase):
#AdvancedIncSubtensor1
(some_matrix[arange(60)], 2),
#AdvancedIncSubtensor
(some_matrix[idx, idx], 1)]):
(some_matrix[idx, idx], 1)
]):
derp = sum(dot(subtensor, variables))
fobj = theano.function([some_vector], derp, mode=self.mode)
......@@ -1936,14 +1937,18 @@ class TestAlloc(unittest.TestCase):
fgrad = theano.function([some_vector], grad_derp,
mode=self.mode)
topo_obj = fobj.maker.fgraph.toposort()
#<= is needed as the GPU currently doesn't implement
#AdvancedIncSubtensor. Once it does, this can be
#replaced with ==.
assert numpy.sum([isinstance(node.op, alloc)
for node in topo_obj]) == 0
for node in topo_obj]) <= 1
topo_grad = fgrad.maker.fgraph.toposort()
#print subtensor
#theano.printing.debugprint(fgrad)
assert numpy.sum([isinstance(node.op, alloc)
for node in topo_grad]) == n_alloc
for node in topo_grad]) == n_alloc, (
alloc, subtensor, n_alloc, topo_grad)
fobj(test_params)
fgrad(test_params)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论