提交 19c0dbcb authored 作者: abergeron's avatar abergeron

Merge pull request #1782 from nouiz/alloc_cst_fold

Alloc cst fold
...@@ -3198,14 +3198,28 @@ class GpuAlloc(GpuOp): ...@@ -3198,14 +3198,28 @@ class GpuAlloc(GpuOp):
# If the output is a constant, it will have to be deepcopied # If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold. # each time the function is called. So we do not fold.
return False return False
elif (not isinstance(client[0], basestring) elif (#The following ops work inplace of their input id 0.
and isinstance(client[0].op, ( client[1] == 0 and
tensor.IncSubtensor, isinstance(client[0].op, (
tensor.AdvancedIncSubtensor1, #Ops that will work inplace on the Alloc. So if they
#get constant_folded, they would copy the
#constant and this is less efficient.
#Not doing the constant folding could also lower
#the peak memory usage, as the "constant" won't
#always exist.
#theano.tensor.subtensor.AdvancedIncSubtensor,
GpuIncSubtensor, GpuIncSubtensor,
GpuAdvancedIncSubtensor1 GpuAdvancedIncSubtensor1,
theano.sandbox.cuda.blas.GpuGemm,
theano.sandbox.cuda.blas.GpuGemv,
theano.sandbox.cuda.blas.GpuGer,
))): ))):
return False return False
#If the client is a transfer, we don't want to fold. We
#let the moving opt finish before deciding what to do.
elif isinstance(client[0].op, HostFromGpu):
return False
return True return True
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
......
...@@ -2625,13 +2625,31 @@ class Alloc(gof.Op): ...@@ -2625,13 +2625,31 @@ class Alloc(gof.Op):
# If the output is a constant, it will have to be deepcopied # If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold. # each time the function is called. So we do not fold.
return False return False
elif (not isinstance(client[0], basestring) elif (#The following ops work inplace of their input id 0.
and isinstance(client[0].op, ( client[1] == 0 and
isinstance(client[0].op, (
#Ops that will work inplace on the Alloc. So if they
#get constant_folded, they would copy the
#constant and this is less efficient.
#Not doing the constant folding could also lower
#the peak memory usage, as the "constant" won't
#always exist.
theano.tensor.subtensor.IncSubtensor, theano.tensor.subtensor.IncSubtensor,
theano.tensor.subtensor.AdvancedIncSubtensor1, theano.tensor.subtensor.AdvancedIncSubtensor1,
theano.tensor.subtensor.AdvancedIncSubtensor, theano.tensor.subtensor.AdvancedIncSubtensor,
theano.tensor.blas.Gemv,
theano.tensor.blas_c.CGemv,
theano.tensor.blas.Ger,
theano.tensor.blas_c.CGer,
theano.tensor.blas_scipy.ScipyGer
))): ))):
return False return False
#If the client is a transfer to the GPU, we don't want to
#fold. We let the Alloc be moved to the GPU, then we
#let the GPU algo decide if it needs to fold it or not.
elif client[0].op.__class__.__name__.lower().startswith("gpu"):
return False
return True return True
......
...@@ -1928,7 +1928,8 @@ class TestAlloc(unittest.TestCase): ...@@ -1928,7 +1928,8 @@ class TestAlloc(unittest.TestCase):
#AdvancedIncSubtensor1 #AdvancedIncSubtensor1
(some_matrix[arange(60)], 2), (some_matrix[arange(60)], 2),
#AdvancedIncSubtensor #AdvancedIncSubtensor
(some_matrix[idx, idx], 1)]): (some_matrix[idx, idx], 1)
]):
derp = sum(dot(subtensor, variables)) derp = sum(dot(subtensor, variables))
fobj = theano.function([some_vector], derp, mode=self.mode) fobj = theano.function([some_vector], derp, mode=self.mode)
...@@ -1936,14 +1937,18 @@ class TestAlloc(unittest.TestCase): ...@@ -1936,14 +1937,18 @@ class TestAlloc(unittest.TestCase):
fgrad = theano.function([some_vector], grad_derp, fgrad = theano.function([some_vector], grad_derp,
mode=self.mode) mode=self.mode)
topo_obj = fobj.maker.fgraph.toposort() topo_obj = fobj.maker.fgraph.toposort()
#<= is needed as the GPU currently doesn't implement
#AdvancedIncSubtensor. Once it does, this can be
#replaced with ==.
assert numpy.sum([isinstance(node.op, alloc) assert numpy.sum([isinstance(node.op, alloc)
for node in topo_obj]) == 0 for node in topo_obj]) <= 1
topo_grad = fgrad.maker.fgraph.toposort() topo_grad = fgrad.maker.fgraph.toposort()
#print subtensor #print subtensor
#theano.printing.debugprint(fgrad) #theano.printing.debugprint(fgrad)
assert numpy.sum([isinstance(node.op, alloc) assert numpy.sum([isinstance(node.op, alloc)
for node in topo_grad]) == n_alloc for node in topo_grad]) == n_alloc, (
alloc, subtensor, n_alloc, topo_grad)
fobj(test_params) fobj(test_params)
fgrad(test_params) fgrad(test_params)
......
...@@ -1091,7 +1091,7 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin): ...@@ -1091,7 +1091,7 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
# Assert that the dot was optimized somehow # Assert that the dot was optimized somehow
self.assertFunctionContains0(f, T.dot) self.assertFunctionContains0(f, T.dot)
self.assertFunctionContains1(f, Gemv(False)) self.assertFunctionContains1(f, Gemv(True))
# Assert they produce the same output # Assert they produce the same output
assert numpy.allclose(f(), numpy.dot(v.get_value(), w.get_value())) assert numpy.allclose(f(), numpy.dot(v.get_value(), w.get_value()))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论