提交 85f71330 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2482 from craffel/master

Adding GpuSplit op to cuda submodule
......@@ -288,7 +288,7 @@ if cuda_available:
GpuDimShuffle, GpuCAReduce, GpuReshape, GpuContiguous,
GpuSubtensor, GpuIncSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1,
GpuFlatten, GpuShape, GpuAlloc,
GpuFlatten, GpuShape, GpuAlloc, GpuSplit,
GpuJoin, fscalar, fvector, fmatrix, frow, fcol,
ftensor3, ftensor4,
scalar, vector, matrix, row, col,
......
......@@ -3229,6 +3229,16 @@ class GpuJoin(tensor.Join, GpuOp):
gpu_join = GpuJoin()
class GpuSplit(tensor.Split, GpuOp):
    """Split for CudaNdarray inputs.

    Same semantics as ``tensor.Split``, but the outputs are typed as
    CudaNdarray variables so the result stays on the GPU.
    """

    def make_node(self, x, axis, splits):
        # Reuse the CPU Split's argument checking/shape logic, then swap
        # every output for a GPU-typed variable with the same dtype and
        # broadcastable pattern.
        gpu_x = as_cuda_ndarray_variable(x)
        cpu_node = tensor.Split.make_node(self, gpu_x, axis, splits)
        gpu_outputs = []
        for out in cpu_node.outputs:
            out_type = CudaNdarrayType(dtype=out.dtype,
                                       broadcastable=out.type.broadcastable)
            gpu_outputs.append(out_type())
        return Apply(self, [gpu_x] + cpu_node.inputs[1:], gpu_outputs)
class GpuAlloc(GpuOp):
"""Implement Alloc on the gpu.
......
......@@ -24,7 +24,7 @@ from theano.sandbox.cuda.basic_ops import (
GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce, GpuFlatten,
GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape)
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit)
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
......@@ -299,6 +299,25 @@ def local_gpu_elemwise_1(node):
return False
@register_opt()
@local_optimizer([tensor.Split, gpu_from_host])
def local_gpu_split(node):
    """Lift Split onto the GPU.

    Two rewrite patterns are handled:

      split(host_from_gpu(x), ...)     -> host_from_gpu(gpu_split(x, ...))
      gpu_from_host(split(x, ...)[i])  -> gpu_split(gpu_from_host(x), ...)[i]

    Returns the list of replacement variables on a match, or False when
    neither pattern applies.
    """
    op = node.op
    if isinstance(op, tensor.Split):
        split_in = node.inputs[0]
        if split_in.owner and isinstance(split_in.owner.op, HostFromGpu):
            gpu_op = GpuSplit(op.len_splits)
            pieces = gpu_op(gpu_from_host(split_in), *node.inputs[1:])
            # Each GPU piece must be transferred back so the graph's
            # outputs keep their original (host) type.
            return [host_from_gpu(piece) for piece in pieces]
    if isinstance(op, GpuFromHost):
        host_in = node.inputs[0]
        if host_in.owner and isinstance(host_in.owner.op, tensor.Split):
            cpu_split = host_in.owner
            gpu_op = GpuSplit(cpu_split.op.len_splits)
            gpu_pieces = gpu_op(gpu_from_host(cpu_split.inputs[0]),
                                *cpu_split.inputs[1:])
            # Only the output feeding this GpuFromHost is replaced;
            # host_in.index identifies which split piece that is.
            return [gpu_pieces[host_in.index]]
    return False
@register_opt()
@local_optimizer([tensor.DimShuffle, gpu_from_host])
def local_gpu_dimshuffle_0(node):
......
......@@ -291,6 +291,27 @@ def test_local_gpu_subtensor():
assert any([isinstance(node.op, cuda.GpuElemwise) for node in topo])
def test_local_split():
    """Check that local_gpu_split rewrites tensor.Split into GpuSplit
    and that the GPU results match the CPU ones."""
    vec = tensor.fvector()
    sizes = tensor.lvector()
    outs = tensor.split(vec, sizes, n_splits=3, axis=0)
    data = [0, 1, 2, 3, 4, 5]
    lens = [3, 2, 1]

    # CPU compilation must keep the plain tensor.Split op.
    f_cpu = theano.function([vec, sizes], outs, mode=mode_without_gpu)
    cpu_res = f_cpu(data, lens)
    assert any(isinstance(n.op, theano.tensor.Split)
               for n in f_cpu.maker.fgraph.toposort())

    # GPU compilation must have substituted GpuSplit.
    f_gpu = theano.function([vec, sizes], outs, mode=mode_with_gpu)
    gpu_res = f_gpu(data, lens)
    assert any(isinstance(n.op, theano.sandbox.cuda.GpuSplit)
               for n in f_gpu.maker.fgraph.toposort())

    # The two backends must agree piece by piece.
    for cpu_piece, gpu_piece in zip(cpu_res, gpu_res):
        assert (cpu_piece == gpu_piece).all()
def test_print_op():
""" Test that print ops don't block gpu optimization"""
b = tensor.fmatrix()
......
......@@ -3213,7 +3213,7 @@ class Split(Op):
for i in xrange(self.len_splits):
upper_idx = lower_idx + splits[i]
general_key[axis] = slice(lower_idx, upper_idx, None)
outputs[i][0] = x.__getitem__(general_key).copy()
outputs[i][0] = x.__getitem__(tuple(general_key)).copy()
lower_idx = upper_idx
def infer_shape(self, node, in_shapes):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论