Commit 7af47dd8 authored by abergeron

Merge pull request #1925 from nouiz/gpuarray

GpuContiguous, tests, better opencl support
...@@ -5,6 +5,7 @@ import numpy ...@@ -5,6 +5,7 @@ import numpy
import theano import theano
from theano import Op, Apply from theano import Op, Apply
from theano import tensor, scalar, config from theano import tensor, scalar, config
from theano.gradient import grad_undefined
from theano.scalar import Scalar from theano.scalar import Scalar
from theano.tensor.basic import Alloc, Join, Split from theano.tensor.basic import Alloc, Join, Split
...@@ -516,7 +517,7 @@ class CudaFromGpu(Op): ...@@ -516,7 +517,7 @@ class CudaFromGpu(Op):
return [gpu_from_cuda(gz)] return [gpu_from_cuda(gz)]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
from theano.sandbox.cuda import CudaNdArrayType from theano.sandbox.cuda import CudaNdarrayType
ev, = eval_points ev, = eval_points
if (isinstance(ev, CudaNdarrayType)): if (isinstance(ev, CudaNdarrayType)):
return [gpu_from_cuda(ev)] return [gpu_from_cuda(ev)]
...@@ -750,6 +751,73 @@ class GpuAlloc(HideC, Alloc): ...@@ -750,6 +751,73 @@ class GpuAlloc(HideC, Alloc):
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
class GpuContiguous(Op):
    """
    Return a C-contiguous version of its GPU input.

    If the input is already C contiguous it is returned as-is (this op
    declares a view on its input); otherwise a C-ordered copy is made
    on the GPU via ``pygpu_copy``.
    """
    # Output 0 may alias input 0 (the already-contiguous fast path).
    view_map = {0: [0]}

    def __eq__(self, other):
        # The op has no parameters: all instances are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def grad(self, inputs, dout):
        """The op is at most a copy, so the gradient passes through
        unchanged; it is only moved to the GPU if needed."""
        x, = inputs
        gz, = dout
        gz = as_gpuarray_variable(gz)
        return [gz]

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, input):
        input = as_gpuarray_variable(input)
        # Same type as the input: dtype and broadcast pattern are kept.
        return Apply(self, [input], [input.type()])

    def c_headers(self):
        return ['<numpy_compat.h>']

    def c_code_cache_version(self):
        return (3,)

    def c_code(self, node, name, inp, out, sub):
        input, = inp
        z, = out
        fail = sub['fail']
        # Local renamed from `str` to avoid shadowing the builtin.
        code = """
        {
            if (GpuArray_IS_C_CONTIGUOUS(&(%(input)s->ga))){
                Py_XDECREF(%(z)s);
                %(z)s = %(input)s;
                Py_INCREF(%(z)s);

            } else if ((NULL == %(z)s)""" % locals()
        # Reuse the output buffer only if it has the right shape and is
        # itself C contiguous; otherwise allocate a fresh copy.
        for i in xrange(len(node.inputs[0].type.broadcastable)):
            code += "\n|| (PyGpuArray_DIMS(%(input)s)[%(i)s] != PyGpuArray_DIMS(%(z)s)[%(i)s])" % locals()
        code += """
                || !GpuArray_IS_C_CONTIGUOUS(&(%(z)s->ga)))
            {
                Py_XDECREF(%(z)s);
                %(z)s = pygpu_copy(%(input)s, GA_C_ORDER);
                if (!%(z)s)
                {
                    %(fail)s;
                }
            }else if(pygpu_move(%(z)s, %(input)s) == -1) {
                %(fail)s;
            }
        }
        """ % locals()
        return code


gpu_contiguous = GpuContiguous()
class GpuReshape(HideC, tensor.Reshape): class GpuReshape(HideC, tensor.Reshape):
""" """
Implement Reshape on the gpu. Implement Reshape on the gpu.
...@@ -769,7 +837,6 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -769,7 +837,6 @@ class GpuReshape(HideC, tensor.Reshape):
raise ValueError('shape argument to GpuReshape.perform' raise ValueError('shape argument to GpuReshape.perform'
' has incorrect length %i' ' has incorrect length %i'
', should be %i' % (len(shp), self.ndim), shp) ', should be %i' % (len(shp), self.ndim), shp)
s = shp.prod()
if shp.prod() != x.size: if shp.prod() != x.size:
# We need to do check here to raise the same error as NumPy. # We need to do check here to raise the same error as NumPy.
...@@ -872,7 +939,8 @@ class GpuEye(GpuKernelBase, Op): ...@@ -872,7 +939,8 @@ class GpuEye(GpuKernelBase, Op):
return [out_shape] return [out_shape]
def grad(self, inp, grads): def grad(self, inp, grads):
return [grad_undefined(self, i, inp[i]) for i in xrange(3)] return [grad_undefined(self, i, inp[i])
for i in xrange(3)]
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) and self.dtype == other.dtype return type(self) == type(other) and self.dtype == other.dtype
......
...@@ -31,7 +31,8 @@ from theano.sandbox.gpuarray.nnet import ( ...@@ -31,7 +31,8 @@ from theano.sandbox.gpuarray.nnet import (
GpuSoftmaxWithBias, GpuSoftmax GpuSoftmaxWithBias, GpuSoftmax
) )
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda) GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY)
from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor, from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
...@@ -366,15 +367,25 @@ def local_gpua_advanced_incsubtensor(node): ...@@ -366,15 +367,25 @@ def local_gpua_advanced_incsubtensor(node):
def local_gpua_careduce(node): def local_gpua_careduce(node):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul, if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)): scalar.Maximum, scalar.Minimum)):
dev = theano.sandbox.gpuarray.init_dev.device
if dev.startswith('opencl'):
op = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code.
return
else:
op = GpuCAReduceCuda
x, = node.inputs x, = node.inputs
greduce = GpuCAReduceCuda(
greduce = op(
node.op.scalar_op, axis=node.op.axis, node.op.scalar_op, axis=node.op.axis,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(node.op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(node.op, 'acc_dtype', None))
gvar = greduce(x) gvar = greduce(x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if gvar.owner.op.supports_c_code([gpu_from_host(x)]): if (op is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([gpu_from_host(x)])):
return greduce return greduce
else: else:
# Try to make a simpler pattern based on reshaping # Try to make a simpler pattern based on reshaping
...@@ -407,7 +418,7 @@ def local_gpua_careduce(node): ...@@ -407,7 +418,7 @@ def local_gpua_careduce(node):
for idx, m in enumerate(new_mask): for idx, m in enumerate(new_mask):
if m == 1: if m == 1:
new_axis.append(idx) new_axis.append(idx)
greduce = GpuCAReduceCuda( greduce = op(
node.op.scalar_op, node.op.scalar_op,
axis=new_axis, reduce_mask=new_mask, axis=new_axis, reduce_mask=new_mask,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(node.op, 'dtype', None),
......
...@@ -42,7 +42,8 @@ from theano.sandbox.gpuarray.basic_ops import ( ...@@ -42,7 +42,8 @@ from theano.sandbox.gpuarray.basic_ops import (
gpu_from_cuda, gpu_from_cuda,
cuda_from_gpu, HostFromGpu, cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape, GpuFromHost, GpuReshape,
gpu_join, GpuJoin, GpuSplit, GpuEye) gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
utt.seed_rng() utt.seed_rng()
...@@ -73,6 +74,7 @@ def may_fail(msg, EClass): ...@@ -73,6 +74,7 @@ def may_fail(msg, EClass):
return wrapper return wrapper
return test_decorator return test_decorator
def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False, def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
on_unused_input='raise', name=None): on_unused_input='raise', name=None):
if mode is None: if mode is None:
...@@ -93,6 +95,7 @@ def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs): ...@@ -93,6 +95,7 @@ def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs):
except TypeError: except TypeError:
continue continue
def rand_gpuarray(*shape, **kwargs): def rand_gpuarray(*shape, **kwargs):
r = rng.rand(*shape) * 2 - 1 r = rng.rand(*shape) * 2 - 1
dtype = kwargs.pop('dtype', theano.config.floatX) dtype = kwargs.pop('dtype', theano.config.floatX)
...@@ -208,10 +211,10 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu, ...@@ -208,10 +211,10 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
def test_transfer_cpu_gpu(): def test_transfer_cpu_gpu():
a = T.fmatrix('a') a = T.fmatrix('a')
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g') g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
av = numpy.asarray(rng.rand(5, 4), dtype='float32') av = numpy.asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av) gv = gpuarray.array(av)
f = theano.function([a], gpu_from_host(a)) f = theano.function([a], gpu_from_host(a))
fv = f(av) fv = f(av)
assert GpuArrayType.values_eq(fv, gv) assert GpuArrayType.values_eq(fv, gv)
...@@ -231,8 +234,8 @@ def test_transfer_strided(): ...@@ -231,8 +234,8 @@ def test_transfer_strided():
av = numpy.asarray(rng.rand(5, 8), dtype='float32') av = numpy.asarray(rng.rand(5, 8), dtype='float32')
gv = gpuarray.array(av) gv = gpuarray.array(av)
av = av[:,::2] av = av[:, ::2]
gv = gv[:,::2] gv = gv[:, ::2]
f = theano.function([a], gpu_from_host(a)) f = theano.function([a], gpu_from_host(a))
fv = f(av) fv = f(av)
...@@ -247,7 +250,7 @@ def test_transfer_strided(): ...@@ -247,7 +250,7 @@ def test_transfer_strided():
"that the tests will be run this way", ValueError) "that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu(): def test_transfer_cuda_gpu():
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest("Can't test interaction with cuda if cuda not present") raise SkipTest("Can't test interaction with cuda if cuda not present")
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g') g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
c = cuda_ndarray.CudaNdarrayType((False, False))('c') c = cuda_ndarray.CudaNdarrayType((False, False))('c')
...@@ -255,8 +258,8 @@ def test_transfer_cuda_gpu(): ...@@ -255,8 +258,8 @@ def test_transfer_cuda_gpu():
av = theano._asarray(rng.rand(5, 4), dtype='float32') av = theano._asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av) gv = gpuarray.array(av)
cv = cuda_ndarray.CudaNdarray(av) cv = cuda_ndarray.CudaNdarray(av)
gvs = gv[:,::-2] gvs = gv[:, ::-2]
cvs = cv[:,::-2] cvs = cv[:, ::-2]
f = theano.function([c], gpu_from_cuda(c)) f = theano.function([c], gpu_from_cuda(c))
fv = f(cv) fv = f(cv)
...@@ -324,6 +327,19 @@ def test_shape(): ...@@ -324,6 +327,19 @@ def test_shape():
assert isinstance(topo[0].op, T.Shape) assert isinstance(topo[0].op, T.Shape)
def test_gpu_contiguous():
    """gpu_contiguous must yield a C-contiguous result both when the
    input already is contiguous (step 1) and when it is strided
    (step 2)."""
    a = T.fmatrix('a')
    i = T.iscalar('i')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    # a[::i] is strided for i > 1, so both code paths of the op run.
    f = theano.function([a, i], gpu_contiguous(a[::i]),
                        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    # The subtensor must stay on the GPU for the test to be meaningful.
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert f(a_val, 1).flags.c_contiguous
    # The original asserted the i == 2 case twice; once is enough.
    assert f(a_val, 2).flags.c_contiguous
class G_reshape(T_reshape): class G_reshape(T_reshape):
def shortDescription(self): def shortDescription(self):
return None return None
...@@ -335,11 +351,11 @@ class G_reshape(T_reshape): ...@@ -335,11 +351,11 @@ class G_reshape(T_reshape):
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
# dtype='float32', # dtype='float32',
ignore_topo=(HostFromGpu, GpuFromHost, ignore_topo=(HostFromGpu, GpuFromHost,
theano.compile.DeepCopyOp, theano.compile.DeepCopyOp,
theano.sandbox.gpuarray.elemwise.GpuElemwise, theano.sandbox.gpuarray.elemwise.GpuElemwise,
theano.tensor.opt.Shape_i, theano.tensor.opt.Shape_i,
theano.tensor.opt.MakeVector)) theano.tensor.opt.MakeVector))
assert self.op == GpuReshape assert self.op == GpuReshape
...@@ -429,7 +445,8 @@ def test_hostfromgpu_shape_i(): ...@@ -429,7 +445,8 @@ def test_hostfromgpu_shape_i():
""" """
m = mode_with_gpu.including('local_dot_to_dot22', m = mode_with_gpu.including('local_dot_to_dot22',
'local_dot22_to_dot22scalar','specialize') 'local_dot22_to_dot22scalar',
'specialize')
a = T.fmatrix('a') a = T.fmatrix('a')
ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))() ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()
av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32') av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
......
import theano
from theano import scalar, gof from theano import scalar, gof
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle, from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
test_CAReduce, T_reduce_dtype) test_CAReduce, T_reduce_dtype)
...@@ -19,17 +21,32 @@ class test_gpu_Broadcast(test_Broadcast): ...@@ -19,17 +21,32 @@ class test_gpu_Broadcast(test_Broadcast):
type = GpuArrayType type = GpuArrayType
cop = GpuElemwise cop = GpuElemwise
ctype = GpuArrayType ctype = GpuArrayType
# The order is important
linkers = [gof.PerformLinker, gof.CLinker]
def setUp(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
self.linkers = [gof.PerformLinker]
def rand_val(self, shp): def rand_val(self, shp):
return rand_gpuarray(*shp, **dict(cls=gpuarray)) return rand_gpuarray(*shp, **dict(cls=gpuarray))
# no c_code() yet
#cop = GpuElemwise
#ctype = GpuArrayType
def rand_cval(self, shp): def rand_cval(self, shp):
return rand_gpuarray(*shp, **dict(cls=gpuarray)) return rand_gpuarray(*shp, **dict(cls=gpuarray))
def test_c(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests")
super(test_gpu_Broadcast, self).test_c()
def test_c_inplace(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests")
super(test_gpu_Broadcast, self).test_c_inplace()
class test_GpuDimShuffle(test_DimShuffle): class test_GpuDimShuffle(test_DimShuffle):
op = GpuDimShuffle op = GpuDimShuffle
...@@ -149,7 +166,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -149,7 +166,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
# ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001 # ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
# ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111 # ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
# ((5,4,3,10,11),[1,2]), # ((5,4,3,10,11),[1,2]),
] ]
op = GpuCAReduceCuda op = GpuCAReduceCuda
reds = [scalar.add, scalar.mul, reds = [scalar.add, scalar.mul,
scalar.maximum, scalar.minimum] scalar.maximum, scalar.minimum]
...@@ -161,6 +178,12 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -161,6 +178,12 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
def test_perform_nan(self): def test_perform_nan(self):
return return
def setUp(self):
super(test_GpuCAReduceCuda, self).setUp()
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests")
class T_gpureduce_dtype(T_reduce_dtype): class T_gpureduce_dtype(T_reduce_dtype):
mode = mode_with_gpu.excluding('local_cut_useless_reduce') mode = mode_with_gpu.excluding('local_cut_useless_reduce')
...@@ -172,6 +195,11 @@ class T_gpureduce_dtype(T_reduce_dtype): ...@@ -172,6 +195,11 @@ class T_gpureduce_dtype(T_reduce_dtype):
'uint8', 'uint16', 'uint32', 'uint64', 'uint8', 'uint16', 'uint32', 'uint64',
'float32', 'float64'] 'float32', 'float64']
    def setUp(self):
        # These dtype tests are backed by GpuCAReduceCuda, so they only
        # run on CUDA devices.
        # NOTE(review): unlike test_GpuCAReduceCuda.setUp this does not
        # call the parent's setUp -- confirm T_reduce_dtype needs none.
        dev = theano.sandbox.gpuarray.init_dev.device
        if not dev.startswith('cuda'):
            raise SkipTest("Cuda specific tests")
def speed_reduce10(): def speed_reduce10():
import numpy import numpy
......
...@@ -7,7 +7,8 @@ import theano.sandbox.gpuarray ...@@ -7,7 +7,8 @@ import theano.sandbox.gpuarray
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import ( from theano.sandbox.gpuarray.basic_ops import (
GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu) GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu)
from theano.sandbox.gpuarray.elemwise import GpuCAReduceCuda, GpuElemwise from theano.sandbox.gpuarray.elemwise import (
GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise)
from theano.sandbox.gpuarray.tests.test_basic_ops import ( from theano.sandbox.gpuarray.tests.test_basic_ops import (
rand_gpuarray, mode_with_gpu, mode_without_gpu rand_gpuarray, mode_with_gpu, mode_without_gpu
) )
...@@ -50,17 +51,26 @@ def test_flatten(): ...@@ -50,17 +51,26 @@ def test_flatten():
def test_reduce(): def test_reduce():
for method in ['sum', 'prod', 'max', 'min']: dev = theano.sandbox.gpuarray.init_dev.device
for method, param in [('sum', dict(acc_dtype='float32')),
('prod', dict(acc_dtype='float32')),
('max', {}), ('min', {})]:
m = theano.tensor.fmatrix() m = theano.tensor.fmatrix()
f = theano.function([m], getattr(m, method)(axis=0), f = theano.function([m], getattr(m, method)(axis=0,
**param),
mode=mode_with_gpu) mode=mode_with_gpu)
val = numpy.random.rand(10, 11).astype("float32") val = numpy.random.rand(10, 11).astype("float32")
res = f(val) res = f(val)
utt.assert_allclose(res, getattr(val, method)(axis=0)) utt.assert_allclose(res, getattr(val, method)(axis=0))
assert res.shape == (11,) assert res.shape == (11,)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert GpuCAReduceCuda in [type(node.op) ops = [type(node.op) for node in topo]
for node in topo], topo
if dev.startswith('opencl') and method in ["max", "min"]:
assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops)
else:
assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops
def test_local_gpualloc_memset_0(): def test_local_gpualloc_memset_0():
......
...@@ -33,3 +33,10 @@ def test_values_eq_approx(): ...@@ -33,3 +33,10 @@ def test_values_eq_approx():
b = a.copy() b = a.copy()
b[0] = -numpy.asarray(b[0]) b[0] = -numpy.asarray(b[0])
assert not GpuArrayType.values_eq_approx(a, b) assert not GpuArrayType.values_eq_approx(a, b)
def test_specify_shape():
    """specify_shape must accept a GpuArray input of a matching shape
    and return it without error."""
    a = rand_gpuarray(20, dtype='float32')
    g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
    f = theano.function([g], theano.tensor.specify_shape(g, [20]))
    # The original discarded the result; check the declared shape holds.
    res = f(a)
    assert tuple(res.shape) == (20,)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论