提交 71014302 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1831 from abergeron/gpuarray_joinsplit

GpuArray join/split
......@@ -6,7 +6,7 @@ import theano
from theano import Op, Apply
from theano import tensor, scalar, config
from theano.scalar import Scalar
from theano.tensor.basic import Alloc
from theano.tensor.basic import Alloc, Join, Split
from theano.gof.python25 import any
from theano.gof.utils import MethodNotDefined
......@@ -725,6 +725,62 @@ class GpuReshape(HideC, tensor.Reshape):
out[0] = x.reshape(tuple(shp))
class GpuJoin(HideC, Join):
    """GPU counterpart of ``Join``: concatenate GPU arrays along an axis.

    The first input is the axis; every remaining input is a tensor to
    concatenate.  The single output is a ``GpuArrayType`` variable with
    the broadcastable pattern and dtype computed by ``Join.make_node``.
    """

    def make_node(self, axis, *tensors):
        """Build the Apply node, moving every joined tensor to the GPU.

        ``Join.make_node`` is used to validate the arguments and to
        infer the output's broadcastable pattern and dtype; its first
        input (the axis) is reused unchanged.
        """
        node = Join.make_node(self, axis, *tensors)

        # Build a real list: ``[...] + map(...)`` only works where map()
        # returns a list and raises TypeError when it returns an iterator.
        gpu_tensors = [as_gpuarray_variable(t) for t in tensors]
        out_type = GpuArrayType(
            broadcastable=node.outputs[0].broadcastable,
            dtype=node.outputs[0].dtype)
        return Apply(self, [node.inputs[0]] + gpu_tensors, [out_type()])

    def perform(self, node, axis_and_tensors, out_):
        """Python implementation: concatenate with pygpu, then cast.

        ``axis_and_tensors[0]`` is the axis, the rest are the arrays.
        The result is cast to the dtype declared on the node's output.
        """
        out, = out_
        axis = int(axis_and_tensors[0])
        tensors = axis_and_tensors[1:]
        out[0] = pygpu.concatenate(tensors, axis=axis).astype(
            node.outputs[0].dtype)

    def c_code_cache_version(self):
        return (1,)

    def c_code(self, node, name, inputs, out_, sub):
        """Emit C code that calls ``pygpu_concatenate`` on the inputs.

        A temporary array of ``GpuArray *`` is allocated, filled with
        the address of each input's ``ga`` member, and freed right after
        the concatenate call, before the result is checked.
        """
        restype = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        # inputs[0] is the axis; the remaining names are the arrays.
        copy_to_list = ["als[%s] = &%s->ga;" % (i, inp)
                        for i, inp in enumerate(inputs[1:])]
        return """
GpuArray **als = (GpuArray **)PyMem_Malloc(sizeof(GpuArray *) * %(n)s);
if (als == NULL) {
PyErr_NoMemory();
%(fail)s
}
%(copy_inputs_to_list)s
Py_XDECREF(%(out)s);
%(out)s = pygpu_concatenate(als, %(n)s, PyInt_AsLong((PyObject *)%(axis)s),
%(restype)s, (PyObject *)&PyGpuArrayType,
pygpu_default_context());
PyMem_Free(als);
if (%(out)s == NULL)
%(fail)s
""" % dict(n=len(inputs[1:]), fail=sub['fail'], out=out_[0],
           axis=inputs[0], copy_inputs_to_list='\n'.join(copy_to_list),
           restype=restype)


# Shared instance used wherever a ready-made GpuJoin op is needed.
gpu_join = GpuJoin()
class GpuSplit(HideC, Split):
    """GPU counterpart of ``Split`` whose outputs are GPU array variables."""

    def make_node(self, x, axis, splits):
        """Build the Apply node with ``x`` on the GPU.

        ``Split.make_node`` supplies the remaining inputs and the
        dtype / broadcastable pattern of each output.
        """
        host_node = Split.make_node(self, x, axis, splits)
        gpu_x = as_gpuarray_variable(x)

        gpu_outputs = []
        for host_out in host_node.outputs:
            out_type = GpuArrayType(dtype=host_out.dtype,
                                    broadcastable=host_out.broadcastable)
            gpu_outputs.append(out_type())

        # Keep every input after the first exactly as Split built it.
        return Apply(self, [gpu_x] + host_node.inputs[1:], gpu_outputs)

    # No perform() override: the one inherited from the CPU op is
    # suitable and is reused as-is.
class GpuEye(GpuKernelBase, Op):
def __init__(self, dtype=None):
if dtype is None:
......
......@@ -21,7 +21,7 @@ from theano.tensor.nnet.conv import ConvOp
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (
host_from_gpu, gpu_from_host, HostFromGpu,
gpu_alloc, GpuAlloc, GpuReshape, GpuEye
gpu_alloc, GpuAlloc, GpuReshape, GpuEye, gpu_join, GpuJoin,
)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
from theano.sandbox.gpuarray.conv import GpuConv
......@@ -152,9 +152,27 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
local_cut_gpu_host_gpu, 'fast_run', 'gpuarray')
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpuaalloc2(node):
    """
    Join(axis, Alloc, Alloc, ...) -> Join(axis, GpuAlloc, Alloc, ...)

    Moves an alloc that is an input to join to the gpu.

    The rewrite only applies when every client of the alloc's output is
    a join whose joined inputs are all produced by ``host_from_gpu`` or
    ``tensor.alloc``.
    """
    if not isinstance(node.op, tensor.Alloc):
        return None

    accepted_producers = [host_from_gpu, tensor.alloc]

    def is_liftable_join(client):
        # A client is either an Apply node or the string 'output'.
        is_join = client != 'output' and client.op == tensor.join
        if not is_join:
            return False
        # client.inputs[0] is skipped; only the joined tensors matter.
        for joined in client.inputs[1:]:
            if not joined.owner:
                return False
            if joined.owner.op not in accepted_producers:
                return False
        return True

    for client, _idx in node.outputs[0].clients:
        if not is_liftable_join(client):
            return None

    return [host_from_gpu(gpu_alloc(*node.inputs))]
@register_opt()
@op_lifter([tensor.Alloc])
def local_gpualloc(node):
def local_gpuaalloc(node):
new_out = gpu_alloc(*node.inputs)
# We need to hide new broadcastable dimensions because
# ReplaceValidate doesn't like when they change.
......@@ -267,6 +285,26 @@ def local_gpua_specifyShape(node):
return tensor.specify_shape
@register_opt()
@op_lifter([tensor.Join])
def local_gpua_join(node):
    """Return the shared ``gpu_join`` op as the replacement for a Join."""
    replacement_op = gpu_join
    return replacement_op
@register_opt()
@local_optimizer([GpuJoin])
def local_gpuajoin_1(node):
    """Replace a GpuJoin of a single element by that element."""
    # Two inputs in total means only one input follows inputs[0].
    joins_single_element = (isinstance(node.op, GpuJoin) and
                            len(node.inputs) == 2)
    if not joins_single_element:
        return None
    return [node.inputs[1]]
@register_opt()
@op_lifter([tensor.Split])
def local_gpua_split(node):
    """Return a ``GpuSplit`` with the same ``len_splits`` as the host op."""
    # GpuSplit is not among the names this module imports from
    # basic_ops at the top of the file, so bring it into scope here
    # to avoid a NameError when this optimizer fires.
    from theano.sandbox.gpuarray.basic_ops import GpuSplit
    return GpuSplit(node.op.len_splits)
@register_opt()
@op_lifter([tensor.Subtensor])
def local_gpua_subtensor(node):
......
......@@ -7,7 +7,9 @@ import theano
import theano.tensor as T
from theano.tensor import TensorType
from theano.tensor.basic import alloc
from theano.tensor.tests.test_basic import rand, safe_make_node, T_reshape
from theano.tensor.tests.test_basic import (
rand, safe_make_node, T_reshape, T_Join_and_Split
)
from theano.tests.unittest_tools import SkipTest
from numpy.testing.noseclasses import KnownFailureTest
......@@ -16,6 +18,8 @@ import theano.sandbox.gpuarray
if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed")
# If you are writing a new test file, don't copy this code, but rather
# import stuff from this file (like mode_with_gpu) to reuse it.
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not cuda_ndarray.use.device_number:
......@@ -38,7 +42,7 @@ from theano.sandbox.gpuarray.basic_ops import (
gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape,
GpuEye)
gpu_join, GpuJoin, GpuSplit, GpuEye)
from theano.tests import unittest_tools as utt
utt.seed_rng()
......@@ -339,6 +343,46 @@ class G_reshape(T_reshape):
assert self.op == GpuReshape
class G_Join_and_Split(T_Join_and_Split):
    """Run the ``T_Join_and_Split`` test suite against the GPU ops."""

    def setUp(self):
        super(G_Join_and_Split, self).setUp()

        # Compile on the GPU, without constant folding.
        self.mode = mode_with_gpu.excluding('constant_folding')

        # Ops the inherited tests should look for.
        self.join_op = GpuJoin
        self.split_op = GpuSplit
        # There is no MakeVector on GPU, so a join is used instead.
        self.make_vector_op = GpuJoin

        # float32 avoids errors on devices with limited capabilities.
        self.floatX = 'float32'

        debug_modes = ['DebugMode', 'DEBUG_MODE']
        self.hide_error = theano.config.mode not in debug_modes
        self.shared = gpuarray_shared_constructor
def test_gpujoin_gpualloc():
    """Check that a join of allocs is compiled to GpuJoin of GpuAllocs.

    Three functions are compiled: a CPU reference, a GPU function whose
    output is the join itself, and a GPU function that adds 4 to the
    join.  Op counts are checked in each graph and the CPU result is
    compared against the second GPU function.
    """
    a = T.fmatrix('a')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    b = T.fmatrix('b')
    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')

    def joined_allocs():
        # A fresh expression is built for each compiled function.
        return T.join(0, T.zeros_like(a), T.ones_like(b))

    def count_ops(fn, predicate):
        ops = [apply_node.op for apply_node in fn.maker.fgraph.toposort()]
        return sum([predicate(op) for op in ops])

    f = theano.function([a, b], joined_allocs() + 4,
                        mode=mode_without_gpu)
    f_gpu = theano.function([a, b], joined_allocs(),
                            mode=mode_with_gpu)
    f_gpu2 = theano.function([a, b], joined_allocs() + 4,
                             mode=mode_with_gpu)

    # CPU graph: two host allocs feeding one host join.
    assert count_ops(f, lambda op: op == T.alloc) == 2
    assert count_ops(f, lambda op: op == T.join) == 1

    # GPU graphs: both allocs and the join are the GPU versions.
    assert count_ops(f_gpu, lambda op: isinstance(op, GpuAlloc)) == 2
    assert count_ops(f_gpu, lambda op: op == gpu_join) == 1
    assert count_ops(f_gpu2, lambda op: isinstance(op, GpuAlloc)) == 2
    assert count_ops(f_gpu2, lambda op: op == gpu_join) == 1

    assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
def test_gpueye():
def check(dtype, N, M_=None):
# Theano does not accept None as a tensor.
......
......@@ -3448,11 +3448,11 @@ class T_Join_and_Split(unittest.TestCase):
[a_val, b_val, c_val, d_val, e_val], rng=rng)
# Should raise an error if length of dimension 0 is not 1
bad_val = rng.rand(2, 1, 1, 1, 2, 1).astype(self.floatX)
self.assertRaises(TypeError, g, bad_val, b_val, c_val, d_val, e_val)
self.assertRaises(TypeError, g, a_val, bad_val, c_val, d_val, e_val)
self.assertRaises(TypeError, g, a_val, b_val, bad_val, d_val, e_val)
self.assertRaises(TypeError, g, a_val, b_val, c_val, bad_val, e_val)
self.assertRaises(TypeError, g, a_val, b_val, c_val, d_val, bad_val)
self.assertRaises(TypeError, f, bad_val, b_val, c_val, d_val, e_val)
self.assertRaises(TypeError, f, a_val, bad_val, c_val, d_val, e_val)
self.assertRaises(TypeError, f, a_val, b_val, bad_val, d_val, e_val)
self.assertRaises(TypeError, f, a_val, b_val, c_val, bad_val, e_val)
self.assertRaises(TypeError, f, a_val, b_val, c_val, d_val, bad_val)
# Should raise an error if any dimension other than 4 has length != 1
bad_a_val = rng.rand(1, 2, 1, 1, 2, 1).astype(self.floatX)
bad_b_val = rng.rand(1, 1, 1, 1, 2, 2).astype(self.floatX)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论