提交 7af47dd8 authored 作者: abergeron's avatar abergeron

Merge pull request #1925 from nouiz/gpuarray

GpuContiguous, tests, better opencl support
......@@ -5,6 +5,7 @@ import numpy
import theano
from theano import Op, Apply
from theano import tensor, scalar, config
from theano.gradient import grad_undefined
from theano.scalar import Scalar
from theano.tensor.basic import Alloc, Join, Split
......@@ -516,7 +517,7 @@ class CudaFromGpu(Op):
return [gpu_from_cuda(gz)]
def R_op(self, inputs, eval_points):
from theano.sandbox.cuda import CudaNdArrayType
from theano.sandbox.cuda import CudaNdarrayType
ev, = eval_points
if (isinstance(ev, CudaNdarrayType)):
return [gpu_from_cuda(ev)]
......@@ -750,6 +751,73 @@ class GpuAlloc(HideC, Alloc):
gpu_alloc = GpuAlloc()
class GpuContiguous(Op):
    """
    Return a C-contiguous version of its input.

    The input is passed through as a view when it is already
    C-contiguous; otherwise a C-ordered copy of it is made on the GPU.
    """
    # When no copy is needed, output 0 is a view of input 0
    # (the c_code just INCREFs the input in that case).
    view_map = {0: [0]}

    def __eq__(self, other):
        # The op has no parameters, so all instances are equivalent.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def grad(self, inputs, dout):
        # A contiguous copy is the identity w.r.t. values, so the
        # gradient flows straight through (as a gpuarray variable).
        dout, = dout
        dout = as_gpuarray_variable(dout)
        return [dout]

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, input):
        input = as_gpuarray_variable(input)
        return Apply(self, [input], [input.type()])

    def c_headers(self):
        return ['<numpy_compat.h>']

    def c_code_cache_version(self):
        return (3,)

    def c_code(self, node, name, inp, out, sub):
        input, = inp
        z, = out
        fail = sub['fail']
        # `code` (not `str`) so we do not shadow the builtin.
        code = """
        {
            if (GpuArray_IS_C_CONTIGUOUS(&(%(input)s->ga))){
                Py_XDECREF(%(z)s);
                %(z)s = %(input)s;
                Py_INCREF(%(z)s);

            } else if ((NULL == %(z)s)""" % locals()
        # Reuse the previously-allocated output only when it already has
        # the right shape and is itself C-contiguous; otherwise fall
        # through to a fresh pygpu_copy below.
        for i in xrange(len(node.inputs[0].type.broadcastable)):
            code += "\n|| (PyGpuArray_DIMS(%(input)s)[%(i)s] != PyGpuArray_DIMS(%(z)s)[%(i)s])" % locals()
        code += """
                || !GpuArray_IS_C_CONTIGUOUS(&(%(z)s->ga)))
            {
                Py_XDECREF(%(z)s);
                %(z)s = pygpu_copy(%(input)s, GA_C_ORDER);
                if (!%(z)s)
                {
                    %(fail)s;
                }
            }else if(pygpu_move(%(z)s, %(input)s) == -1) {
                %(fail)s;
            }
        }
        """ % locals()
        return code

gpu_contiguous = GpuContiguous()
class GpuReshape(HideC, tensor.Reshape):
"""
Implement Reshape on the gpu.
......@@ -769,7 +837,6 @@ class GpuReshape(HideC, tensor.Reshape):
raise ValueError('shape argument to GpuReshape.perform'
' has incorrect length %i'
', should be %i' % (len(shp), self.ndim), shp)
s = shp.prod()
if shp.prod() != x.size:
# We need to do check here to raise the same error as NumPy.
......@@ -872,7 +939,8 @@ class GpuEye(GpuKernelBase, Op):
return [out_shape]
def grad(self, inp, grads):
return [grad_undefined(self, i, inp[i]) for i in xrange(3)]
return [grad_undefined(self, i, inp[i])
for i in xrange(3)]
def __eq__(self, other):
return type(self) == type(other) and self.dtype == other.dtype
......
......@@ -31,7 +31,8 @@ from theano.sandbox.gpuarray.nnet import (
GpuSoftmaxWithBias, GpuSoftmax
)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda)
GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY)
from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
......@@ -366,15 +367,25 @@ def local_gpua_advanced_incsubtensor(node):
def local_gpua_careduce(node):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)):
dev = theano.sandbox.gpuarray.init_dev.device
if dev.startswith('opencl'):
op = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code.
return
else:
op = GpuCAReduceCuda
x, = node.inputs
greduce = GpuCAReduceCuda(
greduce = op(
node.op.scalar_op, axis=node.op.axis,
dtype=getattr(node.op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None))
gvar = greduce(x)
# We need to have the make node called, otherwise the mask can
# be None
if gvar.owner.op.supports_c_code([gpu_from_host(x)]):
if (op is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([gpu_from_host(x)])):
return greduce
else:
# Try to make a simpler pattern based on reshaping
......@@ -407,7 +418,7 @@ def local_gpua_careduce(node):
for idx, m in enumerate(new_mask):
if m == 1:
new_axis.append(idx)
greduce = GpuCAReduceCuda(
greduce = op(
node.op.scalar_op,
axis=new_axis, reduce_mask=new_mask,
dtype=getattr(node.op, 'dtype', None),
......
......@@ -42,7 +42,8 @@ from theano.sandbox.gpuarray.basic_ops import (
gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape,
gpu_join, GpuJoin, GpuSplit, GpuEye)
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.tests import unittest_tools as utt
utt.seed_rng()
......@@ -73,6 +74,7 @@ def may_fail(msg, EClass):
return wrapper
return test_decorator
def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
on_unused_input='raise', name=None):
if mode is None:
......@@ -93,6 +95,7 @@ def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs):
except TypeError:
continue
def rand_gpuarray(*shape, **kwargs):
r = rng.rand(*shape) * 2 - 1
dtype = kwargs.pop('dtype', theano.config.floatX)
......@@ -208,10 +211,10 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
def test_transfer_cpu_gpu():
a = T.fmatrix('a')
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
av = numpy.asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av)
f = theano.function([a], gpu_from_host(a))
fv = f(av)
assert GpuArrayType.values_eq(fv, gv)
......@@ -231,8 +234,8 @@ def test_transfer_strided():
av = numpy.asarray(rng.rand(5, 8), dtype='float32')
gv = gpuarray.array(av)
av = av[:,::2]
gv = gv[:,::2]
av = av[:, ::2]
gv = gv[:, ::2]
f = theano.function([a], gpu_from_host(a))
fv = f(av)
......@@ -247,7 +250,7 @@ def test_transfer_strided():
"that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu():
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
if cuda_ndarray.cuda_available is False:
raise SkipTest("Can't test interaction with cuda if cuda not present")
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
c = cuda_ndarray.CudaNdarrayType((False, False))('c')
......@@ -255,8 +258,8 @@ def test_transfer_cuda_gpu():
av = theano._asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av)
cv = cuda_ndarray.CudaNdarray(av)
gvs = gv[:,::-2]
cvs = cv[:,::-2]
gvs = gv[:, ::-2]
cvs = cv[:, ::-2]
f = theano.function([c], gpu_from_cuda(c))
fv = f(cv)
......@@ -324,6 +327,19 @@ def test_shape():
assert isinstance(topo[0].op, T.Shape)
def test_gpu_contiguous():
    """gpu_contiguous must return a C-contiguous array for any stride."""
    a = T.fmatrix('a')
    i = T.iscalar('i')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    f = theano.function([a, i], gpu_contiguous(a[::i]),
                        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    # The subtensor must run on the GPU so that gpu_contiguous sees a
    # (possibly) strided GPU input.
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    # Step 1: input already contiguous, the op can return a view.
    assert f(a_val, 1).flags.c_contiguous
    # Step 2: strided input, a copy is needed.
    assert f(a_val, 2).flags.c_contiguous
    # Deliberately repeated: the second identical call can reuse the
    # output storage of the first, exercising the op's reuse branch.
    assert f(a_val, 2).flags.c_contiguous
class G_reshape(T_reshape):
def shortDescription(self):
return None
......@@ -335,11 +351,11 @@ class G_reshape(T_reshape):
mode=mode_with_gpu,
# avoid errors with limited devices
# dtype='float32',
ignore_topo=(HostFromGpu, GpuFromHost,
theano.compile.DeepCopyOp,
theano.sandbox.gpuarray.elemwise.GpuElemwise,
theano.tensor.opt.Shape_i,
theano.tensor.opt.MakeVector))
ignore_topo=(HostFromGpu, GpuFromHost,
theano.compile.DeepCopyOp,
theano.sandbox.gpuarray.elemwise.GpuElemwise,
theano.tensor.opt.Shape_i,
theano.tensor.opt.MakeVector))
assert self.op == GpuReshape
......@@ -429,7 +445,8 @@ def test_hostfromgpu_shape_i():
"""
m = mode_with_gpu.including('local_dot_to_dot22',
'local_dot22_to_dot22scalar','specialize')
'local_dot22_to_dot22scalar',
'specialize')
a = T.fmatrix('a')
ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()
av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
......
import theano
from theano import scalar, gof
from theano.gof.python25 import all, any
from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
test_CAReduce, T_reduce_dtype)
......@@ -19,17 +21,32 @@ class test_gpu_Broadcast(test_Broadcast):
type = GpuArrayType
cop = GpuElemwise
ctype = GpuArrayType
# The order is important
linkers = [gof.PerformLinker, gof.CLinker]
    def setUp(self):
        # Restrict to the Python (Perform) linker when not running on a
        # cuda device; the CLinker is only kept for cuda.
        # NOTE(review): assumes init_dev.device looks like 'cuda...' /
        # 'opencl...' -- confirm against init_dev.
        dev = theano.sandbox.gpuarray.init_dev.device
        if not dev.startswith('cuda'):
            self.linkers = [gof.PerformLinker]
def rand_val(self, shp):
return rand_gpuarray(*shp, **dict(cls=gpuarray))
# no c_code() yet
#cop = GpuElemwise
#ctype = GpuArrayType
def rand_cval(self, shp):
return rand_gpuarray(*shp, **dict(cls=gpuarray))
    def test_c(self):
        # The C implementation of these ops is cuda-only: skip on any
        # other gpuarray device (e.g. opencl).
        dev = theano.sandbox.gpuarray.init_dev.device
        if not dev.startswith('cuda'):
            raise SkipTest("Cuda specific tests")
        super(test_gpu_Broadcast, self).test_c()
    def test_c_inplace(self):
        # Same guard as test_c: the in-place C path is cuda-only.
        dev = theano.sandbox.gpuarray.init_dev.device
        if not dev.startswith('cuda'):
            raise SkipTest("Cuda specific tests")
        super(test_gpu_Broadcast, self).test_c_inplace()
class test_GpuDimShuffle(test_DimShuffle):
op = GpuDimShuffle
......@@ -149,7 +166,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
# ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
# ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
# ((5,4,3,10,11),[1,2]),
]
]
op = GpuCAReduceCuda
reds = [scalar.add, scalar.mul,
scalar.maximum, scalar.minimum]
......@@ -161,6 +178,12 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
def test_perform_nan(self):
return
    def setUp(self):
        # GpuCAReduceCuda is cuda-specific: skip every test of this
        # class on non-cuda gpuarray devices.
        super(test_GpuCAReduceCuda, self).setUp()
        dev = theano.sandbox.gpuarray.init_dev.device
        if not dev.startswith('cuda'):
            raise SkipTest("Cuda specific tests")
class T_gpureduce_dtype(T_reduce_dtype):
mode = mode_with_gpu.excluding('local_cut_useless_reduce')
......@@ -172,6 +195,11 @@ class T_gpureduce_dtype(T_reduce_dtype):
'uint8', 'uint16', 'uint32', 'uint64',
'float32', 'float64']
    def setUp(self):
        # These dtype-reduction tests rely on cuda-specific reduction
        # code: skip on non-cuda gpuarray devices.
        dev = theano.sandbox.gpuarray.init_dev.device
        if not dev.startswith('cuda'):
            raise SkipTest("Cuda specific tests")
def speed_reduce10():
import numpy
......
......@@ -7,7 +7,8 @@ import theano.sandbox.gpuarray
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (
GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu)
from theano.sandbox.gpuarray.elemwise import GpuCAReduceCuda, GpuElemwise
from theano.sandbox.gpuarray.elemwise import (
GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise)
from theano.sandbox.gpuarray.tests.test_basic_ops import (
rand_gpuarray, mode_with_gpu, mode_without_gpu
)
......@@ -50,17 +51,26 @@ def test_flatten():
def test_reduce():
for method in ['sum', 'prod', 'max', 'min']:
dev = theano.sandbox.gpuarray.init_dev.device
for method, param in [('sum', dict(acc_dtype='float32')),
('prod', dict(acc_dtype='float32')),
('max', {}), ('min', {})]:
m = theano.tensor.fmatrix()
f = theano.function([m], getattr(m, method)(axis=0),
f = theano.function([m], getattr(m, method)(axis=0,
**param),
mode=mode_with_gpu)
val = numpy.random.rand(10, 11).astype("float32")
res = f(val)
utt.assert_allclose(res, getattr(val, method)(axis=0))
assert res.shape == (11,)
topo = f.maker.fgraph.toposort()
assert GpuCAReduceCuda in [type(node.op)
for node in topo], topo
ops = [type(node.op) for node in topo]
if dev.startswith('opencl') and method in ["max", "min"]:
assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops)
else:
assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops
def test_local_gpualloc_memset_0():
......
......@@ -33,3 +33,10 @@ def test_values_eq_approx():
b = a.copy()
b[0] = -numpy.asarray(b[0])
assert not GpuArrayType.values_eq_approx(a, b)
def test_specify_shape():
    """specify_shape must accept a GPU vector of the declared length."""
    in_var = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
    out = theano.tensor.specify_shape(in_var, [20])
    fn = theano.function([in_var], out)
    vec = rand_gpuarray(20, dtype='float32')
    fn(vec)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论