提交 7e04466d authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #4401 from nouiz/gpu_contiguous

Opt CpuContiguous -> GpuContiguous and add CpuContiguous.grad
...@@ -2280,6 +2280,22 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -2280,6 +2280,22 @@ def local_gpu_contiguous_gpu_contiguous(node):
return [inp] return [inp]
@register_opt('fast_compile')
@local_optimizer([GpuFromHost, tensor.extra_ops.CpuContiguous])
def local_gpu_contiguous(node):
    """Replace CpuContiguous with GpuContiguous around GPU transfers.

    Two patterns are rewritten:

    - ``cpu_contiguous(host_from_gpu(x))`` becomes
      ``gpu_contiguous(x)`` wrapped back into a host tensor variable;
    - ``gpu_from_host(cpu_contiguous(x))`` becomes ``gpu_contiguous(x)``.

    Returns the replacement list on a match, ``False`` otherwise.
    """
    op = node.op
    if isinstance(op, tensor.extra_ops.CpuContiguous):
        inp, = node.inputs
        prev = inp.owner
        if prev is not None and isinstance(prev.op, HostFromGpu):
            gpu_inp, = prev.inputs
            # Do the contiguity work on the GPU, then present the result
            # as a host tensor variable again so the graph type is kept.
            return [tensor.as_tensor_variable(gpu_contiguous(gpu_inp))]
    if isinstance(op, GpuFromHost):
        inp, = node.inputs
        prev = inp.owner
        if prev is not None and isinstance(prev.op,
                                           tensor.extra_ops.CpuContiguous):
            contig_inp, = prev.inputs
            # The transfer already moves the data: make it contiguous on
            # the GPU side instead of on the CPU first.
            return [gpu_contiguous(contig_inp)]
    return False
@register_opt() @register_opt()
@local_optimizer([gpu_from_host, tensor.Eye]) @local_optimizer([gpu_from_host, tensor.Eye])
def local_gpu_eye(node): def local_gpu_eye(node):
......
...@@ -108,6 +108,16 @@ def test_local_gpu_contiguous_gpu_contiguous(): ...@@ -108,6 +108,16 @@ def test_local_gpu_contiguous_gpu_contiguous():
if isinstance(node.op, basic_ops.GpuContiguous)]) if isinstance(node.op, basic_ops.GpuContiguous)])
def test_local_gpu_contiguous():
    """Check that cpu_contiguous compiles to exactly one GpuContiguous,
    both when the output stays on the host and when it is moved to the
    GPU explicitly."""
    a = tensor.fmatrix()
    out = tensor.extra_ops.cpu_contiguous(a)
    for graph_out in (out, cuda.gpu_from_host(out)):
        fn = theano.function([a], graph_out, mode=mode_with_gpu)
        topo = fn.maker.fgraph.toposort()
        n_contig = sum(isinstance(node.op, basic_ops.GpuContiguous)
                       for node in topo)
        assert n_contig == 1
        fn([[2.]])
def test_local_assert_no_cpu_op(): def test_local_assert_no_cpu_op():
numpy.random.seed(1) numpy.random.seed(1)
m = numpy.random.uniform(-1, 1, (10, 10)).astype("float32") m = numpy.random.uniform(-1, 1, (10, 10)).astype("float32")
......
...@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayConstant, get_context, ...@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayConstant, get_context,
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
host_from_gpu, GpuToGpu, host_from_gpu, GpuToGpu,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin) GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch, from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
...@@ -343,6 +343,12 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -343,6 +343,12 @@ def local_gpu_contiguous_gpu_contiguous(node):
return [inp] return [inp]
@register_opt('fast_compile')
@op_lifter([tensor.extra_ops.CpuContiguous])
def local_gpu_contiguous(node, context_name):
    """Lift CpuContiguous to its GPU counterpart (GpuContiguous)."""
    # op_lifter only needs the replacement op; it handles the transfers.
    return gpu_contiguous
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
def local_gpureshape(node, context_name): def local_gpureshape(node, context_name):
......
...@@ -64,6 +64,15 @@ def test_local_gpu_contiguous_gpu_contiguous(): ...@@ -64,6 +64,15 @@ def test_local_gpu_contiguous_gpu_contiguous():
if isinstance(node.op, basic_ops.GpuContiguous)]) if isinstance(node.op, basic_ops.GpuContiguous)])
def test_local_gpu_contiguous():
    """cpu_contiguous must be lifted to a single GpuContiguous node on
    the new gpuarray backend."""
    inp = tensor.fmatrix()
    contig = tensor.extra_ops.cpu_contiguous(inp)
    fn = theano.function([inp], contig, mode=mode_with_gpu)
    gpu_nodes = [node for node in fn.maker.fgraph.toposort()
                 if isinstance(node.op, basic_ops.GpuContiguous)]
    assert len(gpu_nodes) == 1
    fn([[2.]])
def test_flatten(): def test_flatten():
m = theano.tensor.fmatrix() m = theano.tensor.fmatrix()
f = theano.function([m], m.flatten(), mode=mode_with_gpu) f = theano.function([m], m.flatten(), mode=mode_with_gpu)
......
...@@ -35,6 +35,9 @@ class CpuContiguous(theano.Op): ...@@ -35,6 +35,9 @@ class CpuContiguous(theano.Op):
assert x.flags['C_CONTIGUOUS'] assert x.flags['C_CONTIGUOUS']
y[0] = x y[0] = x
def grad(self, inputs, dout):
    """Gradient of CpuContiguous: pass the output gradient through.

    Making an array C-contiguous copies values without changing them,
    so the gradient w.r.t. the input is the output gradient itself
    (coerced to a tensor variable).
    """
    dy, = dout
    return [theano.tensor.as_tensor_variable(dy)]
def c_code(self, node, name, inames, onames, sub): def c_code(self, node, name, inames, onames, sub):
x, = inames x, = inames
y, = onames y, = onames
......
...@@ -25,12 +25,16 @@ def test_cpu_contiguous(): ...@@ -25,12 +25,16 @@ def test_cpu_contiguous():
a = T.fmatrix('a') a = T.fmatrix('a')
i = T.iscalar('i') i = T.iscalar('i')
a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32') a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
f = theano.function([a, i], cpu_contiguous(a.reshape((5,4))[::i])) f = theano.function([a, i], cpu_contiguous(a.reshape((5, 4))[::i]))
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([isinstance(node.op, CpuContiguous) for node in topo]) assert any([isinstance(node.op, CpuContiguous) for node in topo])
assert f(a_val, 1).flags['C_CONTIGUOUS'] assert f(a_val, 1).flags['C_CONTIGUOUS']
assert f(a_val, 2).flags['C_CONTIGUOUS'] assert f(a_val, 2).flags['C_CONTIGUOUS']
assert f(a_val, 3).flags['C_CONTIGUOUS'] assert f(a_val, 3).flags['C_CONTIGUOUS']
# Test the grad:
theano.tests.unittest_tools.verify_grad(cpu_contiguous,
[numpy.random.rand(5, 7, 2)])
class TestCumsumOp(utt.InferShapeTester): class TestCumsumOp(utt.InferShapeTester):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论