提交 71014302，作者：Frédéric Bastien

Merge pull request #1831 from abergeron/gpuarray_joinsplit

GpuArray join/split
...@@ -6,7 +6,7 @@ import theano ...@@ -6,7 +6,7 @@ import theano
from theano import Op, Apply from theano import Op, Apply
from theano import tensor, scalar, config from theano import tensor, scalar, config
from theano.scalar import Scalar from theano.scalar import Scalar
from theano.tensor.basic import Alloc from theano.tensor.basic import Alloc, Join, Split
from theano.gof.python25 import any from theano.gof.python25 import any
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
...@@ -725,6 +725,62 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -725,6 +725,62 @@ class GpuReshape(HideC, tensor.Reshape):
out[0] = x.reshape(tuple(shp)) out[0] = x.reshape(tuple(shp))
class GpuJoin(HideC, Join):
    """Variant of `Join` whose tensor inputs and output are GpuArray variables."""

    def make_node(self, axis, *tensors):
        """Build the Apply node joining `tensors` along `axis`.

        `Join.make_node` is called first so that the axis input and the
        output dtype/broadcastable pattern come from the CPU op; the
        tensors themselves are converted with `as_gpuarray_variable`.
        """
        node = Join.make_node(self, axis, *tensors)
        # Use a list comprehension instead of `list + map(...)`: on Python 3
        # `map` returns an iterator, which cannot be concatenated to a list.
        gpu_tensors = [as_gpuarray_variable(t) for t in tensors]
        out_type = GpuArrayType(broadcastable=node.outputs[0].broadcastable,
                                dtype=node.outputs[0].dtype)
        return Apply(self, [node.inputs[0]] + gpu_tensors, [out_type()])

    def perform(self, node, axis_and_tensors, out_):
        """Concatenate with pygpu and cast to the output dtype of `node`.

        `axis_and_tensors[0]` is the axis; the remaining entries are the
        arrays to concatenate.
        """
        out, = out_
        axis = int(axis_and_tensors[0])
        tensors = axis_and_tensors[1:]
        out[0] = pygpu.concatenate(tensors, axis=axis).astype(
            node.outputs[0].dtype)

    def c_code_cache_version(self):
        return (1,)

    def c_code(self, node, name, inputs, out_, sub):
        """Return the C code that calls `pygpu_concatenate` on the inputs.

        `inputs[0]` is the axis variable; every other input is stored as a
        pointer in a temporary `GpuArray *` array that is freed after the
        call.
        """
        restype = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        # One assignment per tensor input, filling the temporary C array.
        copy_to_list = ["als[%s] = &%s->ga;" % (i, inp)
                        for i, inp in enumerate(inputs[1:])]
        return """
GpuArray **als = (GpuArray **)PyMem_Malloc(sizeof(GpuArray *) * %(n)s);
if (als == NULL) {
PyErr_NoMemory();
%(fail)s
}
%(copy_inputs_to_list)s
Py_XDECREF(%(out)s);
%(out)s = pygpu_concatenate(als, %(n)s, PyInt_AsLong((PyObject *)%(axis)s),
%(restype)s, (PyObject *)&PyGpuArrayType,
pygpu_default_context());
PyMem_Free(als);
if (%(out)s == NULL)
%(fail)s
""" % dict(n=len(inputs[1:]), fail=sub['fail'], out=out_[0],
           axis=inputs[0], copy_inputs_to_list='\n'.join(copy_to_list),
           restype=restype)


gpu_join = GpuJoin()
class GpuSplit(HideC, Split):
    """Variant of `Split` whose data input and outputs are GpuArray variables."""

    def make_node(self, x, axis, splits):
        """Build the Apply node from the one `Split.make_node` produces.

        The CPU node supplies the `axis`/`splits` inputs and the dtype and
        broadcastable pattern of every output; only `x` is converted with
        `as_gpuarray_variable`.
        """
        cpu_node = Split.make_node(self, x, axis, splits)
        gpu_x = as_gpuarray_variable(x)
        gpu_outputs = []
        for cpu_out in cpu_node.outputs:
            out_type = GpuArrayType(dtype=cpu_out.dtype,
                                    broadcastable=cpu_out.broadcastable)
            gpu_outputs.append(out_type())
        return Apply(self, [gpu_x] + cpu_node.inputs[1:], gpu_outputs)
    # we reuse the perform of the CPU op, which is suitable
class GpuEye(GpuKernelBase, Op): class GpuEye(GpuKernelBase, Op):
def __init__(self, dtype=None): def __init__(self, dtype=None):
if dtype is None: if dtype is None:
......
...@@ -21,7 +21,7 @@ from theano.tensor.nnet.conv import ConvOp ...@@ -21,7 +21,7 @@ from theano.tensor.nnet.conv import ConvOp
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import ( from theano.sandbox.gpuarray.basic_ops import (
host_from_gpu, gpu_from_host, HostFromGpu, host_from_gpu, gpu_from_host, HostFromGpu,
gpu_alloc, GpuAlloc, GpuReshape, GpuEye gpu_alloc, GpuAlloc, GpuReshape, GpuEye, gpu_join, GpuJoin,
) )
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
from theano.sandbox.gpuarray.conv import GpuConv from theano.sandbox.gpuarray.conv import GpuConv
...@@ -152,9 +152,27 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua', ...@@ -152,9 +152,27 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
local_cut_gpu_host_gpu, 'fast_run', 'gpuarray') local_cut_gpu_host_gpu, 'fast_run', 'gpuarray')
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpuaalloc2(node):
    """
    Join(axis, Alloc, Alloc, ...) -> Join(axis, GpuAlloc, Alloc, ...)

    Replace an Alloc by HostFromGpu(GpuAlloc) when every client of its
    output is a join whose tensor inputs all come from either a
    host_from_gpu or an alloc node.
    """
    if not isinstance(node.op, tensor.Alloc):
        return None

    accepted_ops = [host_from_gpu, tensor.alloc]
    for client, _idx in node.outputs[0].clients:
        # A client is either the string 'output' or an Apply node.
        if not (client != 'output' and client.op == tensor.join):
            return None
        # inputs[0] of the join is the axis; only the tensors are checked.
        for inp in client.inputs[1:]:
            if not (inp.owner and inp.owner.op in accepted_ops):
                return None

    return [host_from_gpu(gpu_alloc(*node.inputs))]
@register_opt() @register_opt()
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
def local_gpualloc(node): def local_gpuaalloc(node):
new_out = gpu_alloc(*node.inputs) new_out = gpu_alloc(*node.inputs)
# We need to hide new broadcastable dimensions because # We need to hide new broadcastable dimensions because
# ReplaceValidate doesn't like when they change. # ReplaceValidate doesn't like when they change.
...@@ -267,6 +285,26 @@ def local_gpua_specifyShape(node): ...@@ -267,6 +285,26 @@ def local_gpua_specifyShape(node):
return tensor.specify_shape return tensor.specify_shape
@register_opt()
@op_lifter([tensor.Join])
def local_gpua_join(node):
    """Return `gpu_join` as the replacement op for a `tensor.Join` node."""
    return gpu_join
@register_opt()
@local_optimizer([GpuJoin])
def local_gpuajoin_1(node):
    """Replace a GpuJoin of a single tensor by that tensor.

    A node with exactly two inputs has the axis plus one tensor, so the
    node's output is replaced by the tensor input.
    """
    if not isinstance(node.op, GpuJoin):
        return None
    if len(node.inputs) != 2:
        return None
    # join of a single element
    return [node.inputs[1]]
@register_opt()
@op_lifter([tensor.Split])
def local_gpua_split(node):
    """Return a `GpuSplit` with the same `len_splits` as the `Split` op of `node`."""
    # GpuSplit is not among the names this module imports from basic_ops,
    # so import it here to avoid a NameError when a Split node is lifted.
    from theano.sandbox.gpuarray.basic_ops import GpuSplit
    return GpuSplit(node.op.len_splits)
@register_opt() @register_opt()
@op_lifter([tensor.Subtensor]) @op_lifter([tensor.Subtensor])
def local_gpua_subtensor(node): def local_gpua_subtensor(node):
......
...@@ -7,7 +7,9 @@ import theano ...@@ -7,7 +7,9 @@ import theano
import theano.tensor as T import theano.tensor as T
from theano.tensor import TensorType from theano.tensor import TensorType
from theano.tensor.basic import alloc from theano.tensor.basic import alloc
from theano.tensor.tests.test_basic import rand, safe_make_node, T_reshape from theano.tensor.tests.test_basic import (
rand, safe_make_node, T_reshape, T_Join_and_Split
)
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
from numpy.testing.noseclasses import KnownFailureTest from numpy.testing.noseclasses import KnownFailureTest
...@@ -16,6 +18,8 @@ import theano.sandbox.gpuarray ...@@ -16,6 +18,8 @@ import theano.sandbox.gpuarray
if theano.sandbox.gpuarray.pygpu is None: if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed") raise SkipTest("pygpu not installed")
# If you are writing a new test file, don't copy this code, but rather
# import stuff from this file (like mode_with_gpu) to reuse it.
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated: if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not cuda_ndarray.use.device_number: if not cuda_ndarray.use.device_number:
...@@ -38,7 +42,7 @@ from theano.sandbox.gpuarray.basic_ops import ( ...@@ -38,7 +42,7 @@ from theano.sandbox.gpuarray.basic_ops import (
gpu_from_cuda, gpu_from_cuda,
cuda_from_gpu, HostFromGpu, cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape, GpuFromHost, GpuReshape,
GpuEye) gpu_join, GpuJoin, GpuSplit, GpuEye)
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
utt.seed_rng() utt.seed_rng()
...@@ -339,6 +343,46 @@ class G_reshape(T_reshape): ...@@ -339,6 +343,46 @@ class G_reshape(T_reshape):
assert self.op == GpuReshape assert self.op == GpuReshape
class G_Join_and_Split(T_Join_and_Split):
    """Re-run the inherited `T_Join_and_Split` tests with the GpuArray ops."""

    def setUp(self):
        """Point the inherited test attributes at the GPU ops, mode and shared constructor."""
        super(G_Join_and_Split, self).setUp()
        # this is to avoid errors with limited devices
        self.floatX = 'float32'
        self.shared = gpuarray_shared_constructor
        self.mode = mode_with_gpu.excluding('constant_folding')
        self.join_op = GpuJoin
        self.split_op = GpuSplit
        # Use join instead of MakeVector since there is no MakeVector on GPU
        self.make_vector_op = GpuJoin
        debug_modes = ('DebugMode', 'DEBUG_MODE')
        self.hide_error = theano.config.mode not in debug_modes
def test_gpujoin_gpualloc():
    """Check that a join of allocs uses GpuAlloc and gpu_join in GPU mode.

    The same join(zeros_like(a), ones_like(b)) graph is compiled without
    the GPU, with the GPU, and with the GPU plus an added constant; the op
    counts in each compiled graph and the numerical results are compared.
    """
    a = T.fmatrix('a')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    b = T.fmatrix('b')
    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')

    def joined():
        # Build a fresh expression for every compiled function.
        return T.join(0, T.zeros_like(a), T.ones_like(b))

    def count_ops(fn, predicate):
        # Number of nodes in the compiled graph whose op satisfies `predicate`.
        return sum([predicate(node.op)
                    for node in fn.maker.fgraph.toposort()])

    def is_gpu_alloc(op):
        return isinstance(op, GpuAlloc)

    def is_gpu_join(op):
        return op == gpu_join

    f = theano.function([a, b], joined() + 4, mode=mode_without_gpu)
    f_gpu = theano.function([a, b], joined(), mode=mode_with_gpu)
    f_gpu2 = theano.function([a, b], joined() + 4, mode=mode_with_gpu)

    assert count_ops(f, lambda op: op == T.alloc) == 2
    assert count_ops(f, lambda op: op == T.join) == 1
    assert count_ops(f_gpu, is_gpu_alloc) == 2
    assert count_ops(f_gpu, is_gpu_join) == 1
    assert count_ops(f_gpu2, is_gpu_alloc) == 2
    assert count_ops(f_gpu2, is_gpu_join) == 1
    assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
def test_gpueye(): def test_gpueye():
def check(dtype, N, M_=None): def check(dtype, N, M_=None):
# Theano does not accept None as a tensor. # Theano does not accept None as a tensor.
......
...@@ -3448,11 +3448,11 @@ class T_Join_and_Split(unittest.TestCase): ...@@ -3448,11 +3448,11 @@ class T_Join_and_Split(unittest.TestCase):
[a_val, b_val, c_val, d_val, e_val], rng=rng) [a_val, b_val, c_val, d_val, e_val], rng=rng)
# Should raise an error if length of dimension 0 is not 1 # Should raise an error if length of dimension 0 is not 1
bad_val = rng.rand(2, 1, 1, 1, 2, 1).astype(self.floatX) bad_val = rng.rand(2, 1, 1, 1, 2, 1).astype(self.floatX)
self.assertRaises(TypeError, g, bad_val, b_val, c_val, d_val, e_val) self.assertRaises(TypeError, f, bad_val, b_val, c_val, d_val, e_val)
self.assertRaises(TypeError, g, a_val, bad_val, c_val, d_val, e_val) self.assertRaises(TypeError, f, a_val, bad_val, c_val, d_val, e_val)
self.assertRaises(TypeError, g, a_val, b_val, bad_val, d_val, e_val) self.assertRaises(TypeError, f, a_val, b_val, bad_val, d_val, e_val)
self.assertRaises(TypeError, g, a_val, b_val, c_val, bad_val, e_val) self.assertRaises(TypeError, f, a_val, b_val, c_val, bad_val, e_val)
self.assertRaises(TypeError, g, a_val, b_val, c_val, d_val, bad_val) self.assertRaises(TypeError, f, a_val, b_val, c_val, d_val, bad_val)
# Should raise an error if any dimension other than 4 has length != 1 # Should raise an error if any dimension other than 4 has length != 1
bad_a_val = rng.rand(1, 2, 1, 1, 2, 1).astype(self.floatX) bad_a_val = rng.rand(1, 2, 1, 1, 2, 1).astype(self.floatX)
bad_b_val = rng.rand(1, 1, 1, 1, 2, 2).astype(self.floatX) bad_b_val = rng.rand(1, 1, 1, 1, 2, 2).astype(self.floatX)
......
Markdown 格式
0%
您已将 0 人添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论