提交 9161c88e authored 作者: Frederic's avatar Frederic

Disable constant folding of [Gpu]Alloc in more cases.

Since we do constant folding earlier, this makes a GpuAlloc test fail. This could also speed up some code, as it allows more ops to work in place.
上级 e93c61d1
...@@ -3198,13 +3198,28 @@ class GpuAlloc(GpuOp): ...@@ -3198,13 +3198,28 @@ class GpuAlloc(GpuOp):
# If the output is a constant, it will have to be deepcopied # If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold. # each time the function is called. So we do not fold.
return False return False
elif (not isinstance(client[0], basestring) elif (not isinstance(client[0], basestring) and
and isinstance(client[0].op, ( #It is the inputs id 0 of the following op
tensor.IncSubtensor, client[1] == 0 and
tensor.AdvancedIncSubtensor1, isinstance(client[0].op, (
GpuIncSubtensor, #Ops that will work inplace on the Alloc. So if they
GpuAdvancedIncSubtensor1 #get constant_folded, they would copy the
))): #constant and this is less efficients.
#Not doing the constant folding could also lower
#the peak memory usage, as then the "constant" won't
#always exist.
#theano.tensor.subtensor.AdvancedIncSubtensor,
GpuIncSubtensor,
GpuAdvancedIncSubtensor1,
theano.sandbox.cuda.blas.GpuGemm,
theano.sandbox.cuda.blas.GpuGemv,
theano.sandbox.cuda.blas.GpuGer,
))):
return False
#If the client is a transfer, we don't want to fold. We
#let the moving opt finish before deciding what to do.
elif isinstance(client[0].op, HostFromGpu):
return False return False
return True return True
......
...@@ -2625,13 +2625,32 @@ class Alloc(gof.Op): ...@@ -2625,13 +2625,32 @@ class Alloc(gof.Op):
# If the output is a constant, it will have to be deepcopied # If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold. # each time the function is called. So we do not fold.
return False return False
elif (not isinstance(client[0], basestring) elif (not isinstance(client[0], basestring) and
and isinstance(client[0].op, ( #It is the inputs id 0 of the following op
client[1] == 0 and
isinstance(client[0].op, (
#Ops that will work in place on the Alloc. So if they
#get constant-folded, they would copy the
#constant and this is less efficient.
#Not doing the constant folding could also lower
#the peak memory usage, as then the "constant" won't
#always exist.
theano.tensor.subtensor.IncSubtensor, theano.tensor.subtensor.IncSubtensor,
theano.tensor.subtensor.AdvancedIncSubtensor1, theano.tensor.subtensor.AdvancedIncSubtensor1,
theano.tensor.subtensor.AdvancedIncSubtensor, theano.tensor.subtensor.AdvancedIncSubtensor,
theano.tensor.blas.Gemv,
theano.tensor.blas_c.CGemv,
theano.tensor.blas.Ger,
theano.tensor.blas_c.CGer,
theano.tensor.blas_scipy.ScipyGer
))): ))):
return False return False
#If the client is a transfer to the GPU, we don't want to
#fold. We let the Alloc be moved to the GPU, then we
#let the GPU algo decide if it needs to fold it or not.
elif client[0].op.__class__.__name__.lower().startswith("gpu"):
return False
return True return True
......
...@@ -1928,7 +1928,8 @@ class TestAlloc(unittest.TestCase): ...@@ -1928,7 +1928,8 @@ class TestAlloc(unittest.TestCase):
#AdvancedIncSubtensor1 #AdvancedIncSubtensor1
(some_matrix[arange(60)], 2), (some_matrix[arange(60)], 2),
#AdvancedIncSubtensor #AdvancedIncSubtensor
(some_matrix[idx, idx], 1)]): (some_matrix[idx, idx], 1)
]):
derp = sum(dot(subtensor, variables)) derp = sum(dot(subtensor, variables))
fobj = theano.function([some_vector], derp, mode=self.mode) fobj = theano.function([some_vector], derp, mode=self.mode)
...@@ -1936,14 +1937,18 @@ class TestAlloc(unittest.TestCase): ...@@ -1936,14 +1937,18 @@ class TestAlloc(unittest.TestCase):
fgrad = theano.function([some_vector], grad_derp, fgrad = theano.function([some_vector], grad_derp,
mode=self.mode) mode=self.mode)
topo_obj = fobj.maker.fgraph.toposort() topo_obj = fobj.maker.fgraph.toposort()
#<= is needed as the GPU currently doesn't implement
#AdvancedIncSubtensor. When it does, this can be
#replaced with ==.
assert numpy.sum([isinstance(node.op, alloc) assert numpy.sum([isinstance(node.op, alloc)
for node in topo_obj]) == 0 for node in topo_obj]) <= 1
topo_grad = fgrad.maker.fgraph.toposort() topo_grad = fgrad.maker.fgraph.toposort()
#print subtensor #print subtensor
#theano.printing.debugprint(fgrad) #theano.printing.debugprint(fgrad)
assert numpy.sum([isinstance(node.op, alloc) assert numpy.sum([isinstance(node.op, alloc)
for node in topo_grad]) == n_alloc for node in topo_grad]) == n_alloc, (
alloc, subtensor, n_alloc, topo_grad)
fobj(test_params) fobj(test_params)
fgrad(test_params) fgrad(test_params)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论