提交 2faeb62c authored 作者: lamblin's avatar lamblin

Merge pull request #1437 from nouiz/gpu_iadd

Gpu iadd for 6d tensor
......@@ -1389,6 +1389,45 @@ __global__ void k_ielem_4(const int d0, const int d1, const int d2, const int d3
}
}
// In-place elementwise kernel for 6-dimensional tensors: updates `a`
// in place from `b` (a op= b), where the operation is chosen at
// compile time by the `operator_num` template parameter (IADD, IDIV,
// or plain element copy for CPY).
//
// Launch layout: the three outer dimensions (d0..d2) are walked with
// block-stride loops over blockIdx.{x,y,z} and the three inner
// dimensions (d3..d5) with thread-stride loops over
// threadIdx.{x,y,z}, so any grid/block configuration covers the full
// tensor. sA*/sB* are per-dimension strides in float elements (the
// pointers are indexed directly with i*stride), so arbitrary
// (e.g. negative or broadcast) layouts are supported.
template <int operator_num>
__global__ void k_ielem_6(const int d0, const int d1,
        const int d2, const int d3,
        const int d4, const int d5,
        float* a, const int sA0, const int sA1,
        const int sA2, const int sA3,
        const int sA4, const int sA5,
        const float* b, const int sB0, const int sB1,
        const int sB2, const int sB3,
        const int sB4, const int sB5
){
    for (int j0 = blockIdx.x; j0 < d0; j0 += gridDim.x) {
        for (int j1 = blockIdx.y; j1 < d1; j1 += gridDim.y) {
            for (int j2 = blockIdx.z; j2 < d2; j2 += gridDim.z) {
                // Partial offsets for the three outer dimensions are
                // hoisted out of the thread-level loops.
                const int offA = j0*sA0 + j1*sA1 + j2*sA2;
                const int offB = j0*sB0 + j1*sB1 + j2*sB2;
                for (int j3 = threadIdx.x; j3 < d3; j3 += blockDim.x) {
                    for (int j4 = threadIdx.y; j4 < d4; j4 += blockDim.y) {
                        for (int j5 = threadIdx.z; j5 < d5; j5 += blockDim.z) {
                            float* pa =
                                a + offA + j3*sA3 + j4*sA4 + j5*sA5;
                            const float* pb =
                                b + offB + j3*sB3 + j4*sB4 + j5*sB5;
                            // operator_num is a compile-time constant,
                            // so this switch costs nothing at runtime.
                            switch (operator_num) {
                            case IADD:
                                *pa += *pb;
                                break;
                            case IDIV:
                                *pa /= *pb;
                                break;
                            case CPY:
                                *pa = *pb;
                                break;
                            }
                        }
                    }
                }
            }
        }
    }
}
/*
CudaNdarray_inplace_elemwise
Compute elemwise, working inplace on A.
......@@ -1415,19 +1454,31 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
const int, const int,
const float*, const int, const int,
const int, const int);
void (*k6)(const int, const int,
const int, const int,
const int, const int,
float*, const int, const int,
const int, const int,
const int, const int,
const float*, const int, const int,
const int, const int,
const int, const int);
switch (fct_nb)
{
case IADD:
k3 = k_ielem_3<IADD>;
k4 = k_ielem_4<IADD>;
k6 = k_ielem_6<IADD>;
break;
case IDIV:
k3 = k_ielem_3<IDIV>;
k4 = k_ielem_4<IDIV>;
k6 = k_ielem_6<IDIV>;
break;
case CPY:
k3 = k_ielem_3<CPY>;
k4 = k_ielem_4<CPY>;
k6 = k_ielem_6<CPY>;
break;
default:
assert (0);
......@@ -1769,6 +1820,61 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
}
}
break;
case 6:
{
    // Grid covers the three outermost dimensions. blockIdx.x is
    // capped at NUM_VECTOR_OP_BLOCKS up front; the y and z extents
    // are then halved until the total block count fits under the same
    // cap. k_ielem_6's block-stride loops pick up any remainder, so
    // shrinking the grid never drops elements.
    dim3 n_blocks(
            std::min(
                CudaNdarray_HOST_DIMS(self)[0],
                NUM_VECTOR_OP_BLOCKS),
            CudaNdarray_HOST_DIMS(self)[1],
            CudaNdarray_HOST_DIMS(self)[2]
            );
    while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
        n_blocks.y /= 2;
    while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
        n_blocks.z /= 2;
    // Threads only span dimension 3; blockDim.y and blockDim.z stay
    // at 1, so each thread iterates dimensions 4 and 5 sequentially
    // via the kernel's thread-stride loops. This is correct, though
    // spreading threads over dims 4/5 as well might improve
    // parallelism for shapes where dim 3 is small.
    dim3 n_threads(
            std::min(
                CudaNdarray_HOST_DIMS(self)[3],
                NUM_VECTOR_OP_THREADS_PER_BLOCK)
            );
    k6<<<n_blocks, n_threads>>>(
            CudaNdarray_HOST_DIMS(self)[0],
            CudaNdarray_HOST_DIMS(self)[1],
            CudaNdarray_HOST_DIMS(self)[2],
            CudaNdarray_HOST_DIMS(self)[3],
            CudaNdarray_HOST_DIMS(self)[4],
            CudaNdarray_HOST_DIMS(self)[5],
            CudaNdarray_DEV_DATA(self),
            CudaNdarray_HOST_STRIDES(self)[0],
            CudaNdarray_HOST_STRIDES(self)[1],
            CudaNdarray_HOST_STRIDES(self)[2],
            CudaNdarray_HOST_STRIDES(self)[3],
            CudaNdarray_HOST_STRIDES(self)[4],
            CudaNdarray_HOST_STRIDES(self)[5],
            CudaNdarray_DEV_DATA(other),
            other_strides[0],
            other_strides[1],
            other_strides[2],
            other_strides[3],
            other_strides[4],
            other_strides[5]);
    CNDA_THREAD_SYNC;
    // Kernel launches don't return errors directly; fetch the
    // deferred status after the sync.
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        // Report the 6d kernel by name (the original message said
        // "k4", copy-pasted from the 4d case).
        PyErr_Format(
                PyExc_RuntimeError,
                "Cuda error: %s: %s.\n",
                "k6",
                cudaGetErrorString(err));
        Py_XDECREF(new_other);
        return -1;
    }
}
break;
default:
{
PyErr_Format(
......
......@@ -10,7 +10,7 @@ import theano.sandbox.cuda as cuda_ndarray
from theano.tensor.basic import _allclose
from theano.tests import unittest_tools as utt
if cuda_ndarray.cuda_available == False:
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda disabled')
......@@ -29,14 +29,16 @@ def advantage(cpu_dt, gpu_dt):
else:
return cpu_dt / gpu_dt
def test_host_to_device():
#print >>sys.stdout, 'starting test_host_to_dev'
for shape in ((), (3,), (2,3), (3,4,5,6)):
for shape in ((), (3,), (2, 3), (3, 4, 5, 6)):
a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
c = numpy.asarray(b)
assert numpy.all(a == c)
def test_add_iadd_idiv():
for shapes in (
[(5,5),(5,1)],
......@@ -51,8 +53,18 @@ def test_add_iadd_idiv():
(3,34,35,36,37),
(33,34,3,36,37),
(33,34,35,36,3),
(0,0,0,0,0,0),
(3,34,35,36,37,2),
(33,34,3,36,37,2),
(33,34,35,36,3,2),
(3,4,5,6,7,1025),
(3,4,5,6,1025,7),
(3,4,5,1025,6,7),
(3,4,1025,5,6,7),
(3,1025,4,5,6,7),
(1025,3,4,5,6,7),
):
if isinstance(shapes,tuple):
if isinstance(shapes, tuple):
shape = shapes
shape2 = shapes
a0 = theano._asarray(numpy.random.rand(*shape), dtype='float32')
......@@ -91,7 +103,7 @@ def test_add_iadd_idiv():
#should raise not implemented.
a0 = a0_orig.copy()
b0 = cuda_ndarray.CudaNdarray(a0)
if len(shape)==0:
if len(shape) == 0:
continue
elif len(shape) == 1:
_b = b1[::-1]
......@@ -103,6 +115,10 @@ def test_add_iadd_idiv():
_b = b1[::, ::, ::, ::-1]
elif len(shape) == 5:
_b = b1[::, ::, ::, ::, ::-1]
elif len(shape) == 6:
_b = b1[::, ::, ::, ::, ::, ::-1]
else:
raise Exception("You need to modify this case!")
# TODO: b0[...,::-1] don't work
if shape == shape2:
......@@ -141,11 +157,15 @@ def test_add_iadd_idiv():
b0 /= _b
a0 /= a1[..., ::-1]
assert numpy.allclose(a0, numpy.asarray(b0))
assert numpy.allclose(a0, ((a0_orig+a1)/a1+a1[..., ::-1])/a1[..., ::-1])
assert numpy.allclose(a0, ((a0_orig + a1) / a1 +
a1[..., ::-1]) / a1[..., ::-1])
def test_exp():
#print >>sys.stdout, 'starting test_exp'
for shape in ((), (3,), (2,3), (1,10000000),(10,1000000), (100,100000),(1000,10000),(10000,1000)):
for shape in ((), (3,), (2, 3),
(1, 10000000), (10, 1000000),
(100, 100000), (1000, 10000), (10000, 1000)):
a0 = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a1 = a0.copy()
b0 = cuda_ndarray.CudaNdarray(a0)
......@@ -181,90 +201,108 @@ def test_copy():
assert numpy.allclose(a, numpy.asarray(b))
assert numpy.allclose(a, numpy.asarray(c))
assert numpy.allclose(a, numpy.asarray(d))
b+=b
b += b
assert numpy.allclose(a+a, numpy.asarray(b))
assert numpy.allclose(a+a, numpy.asarray(c))
assert numpy.allclose(a, numpy.asarray(d))
def test_nvcc_bug():
"""
The fct k_elemwise_unary_rowmajor_copy(used by cuda.copy()) in cuda_ndarray.cu
is not well compiled with nvcc 3.0 and 3.1 beta. We found a workaround, so it
sould work correctly. Without the workaround, this test fail.
"""
shape = (5,4)
shape = (5, 4)
aa = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a = aa[::,::-1]
a = aa[::, ::-1]
b = cuda_ndarray.CudaNdarray(aa)[::,::-1]
b = cuda_ndarray.CudaNdarray(aa)[::, ::-1]
c = copy.copy(b)
d = copy.deepcopy(b)
assert numpy.allclose(a, numpy.asarray(b))
assert numpy.allclose(a, numpy.asarray(c))
assert numpy.allclose(a, numpy.asarray(d))
b+=b
b += b
assert numpy.allclose(a+a, numpy.asarray(b))
assert numpy.allclose(a+a, numpy.asarray(c))
assert numpy.allclose(a, numpy.asarray(d))
class test_DimShuffle(unittest.TestCase):
def test_dimshuffle(self):
utt.seed_rng()
rng = numpy.random.RandomState(utt.fetch_seed())
# 2d -> 0d
a = theano._asarray(rng.randn(1,1), dtype='float32')
a = theano._asarray(rng.randn(1, 1), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(numpy.transpose(a), cuda_ndarray.dimshuffle(b,()))
assert numpy.allclose(numpy.transpose(a),
cuda_ndarray.dimshuffle(b, ()))
# Test when we drop a axis that don't have shape 1
a = theano._asarray(rng.randn(2,1), dtype='float32')
a = theano._asarray(rng.randn(2, 1), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
self.assertRaises(ValueError, cuda_ndarray.dimshuffle, b,())
self.assertRaises(ValueError, cuda_ndarray.dimshuffle, b, ())
# Test that we can't take a dimensions multiple time
a = theano._asarray(rng.randn(2,1), dtype='float32')
a = theano._asarray(rng.randn(2, 1), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
self.assertRaises(ValueError, cuda_ndarray.dimshuffle, b,(1,1))
self.assertRaises(ValueError, cuda_ndarray.dimshuffle, b, (1, 1))
# 1d
a = theano._asarray(rng.randn(3,), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(numpy.transpose(a), cuda_ndarray.dimshuffle(b,(0,)))
assert numpy.allclose(a[None,:,None], cuda_ndarray.dimshuffle(b,(-1,0,-1)))
assert numpy.allclose(numpy.transpose(a),
cuda_ndarray.dimshuffle(b, (0,)))
assert numpy.allclose(a[None, :, None],
cuda_ndarray.dimshuffle(b, (-1, 0, -1)))
# 2d
a = theano._asarray(rng.randn(3,11), dtype='float32')
a = theano._asarray(rng.randn(3, 11), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(numpy.transpose(a), cuda_ndarray.dimshuffle(b,(1,0)))
assert numpy.allclose(numpy.transpose(a)[None,:,None,:,None], cuda_ndarray.dimshuffle(b,(-1,1,-1,0,-1)))
assert numpy.allclose(numpy.transpose(a),
cuda_ndarray.dimshuffle(b, (1, 0)))
assert numpy.allclose(numpy.transpose(a)[None, :, None, :, None],
cuda_ndarray.dimshuffle(b, (-1, 1, -1, 0, -1)))
# 2d -> 1d
a = theano._asarray(rng.randn(1,11), dtype='float32')
a = theano._asarray(rng.randn(1, 11), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(a[:,], cuda_ndarray.dimshuffle(b,(1,)))
a = theano._asarray(rng.randn(11,1), dtype='float32')
assert numpy.allclose(a[:],
cuda_ndarray.dimshuffle(b, (1,)))
a = theano._asarray(rng.randn(11, 1), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(a.reshape((11,)), cuda_ndarray.dimshuffle(b,(0,)))
assert numpy.allclose(a.reshape((11,)),
cuda_ndarray.dimshuffle(b, (0,)))
# 3d
a = theano._asarray(rng.randn(3,4,5), dtype='float32')
a = theano._asarray(rng.randn(3, 4, 5), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(a, cuda_ndarray.dimshuffle(b,(0,1,2)))
assert numpy.allclose(numpy.swapaxes(a,0,1), cuda_ndarray.dimshuffle(b,(1,0,2)))
assert numpy.allclose(numpy.swapaxes(a,0,2), cuda_ndarray.dimshuffle(b,(2,1,0)))
assert numpy.allclose(numpy.swapaxes(a,1,2), cuda_ndarray.dimshuffle(b,(0,2,1)))
assert numpy.allclose(numpy.swapaxes(a,1,2)[None,:,None,:,:,None], cuda_ndarray.dimshuffle(b,(-1,0,-1,2,1,-1)))
assert numpy.allclose(a, cuda_ndarray.dimshuffle(b, (0, 1, 2)))
assert numpy.allclose(numpy.swapaxes(a, 0, 1),
cuda_ndarray.dimshuffle(b, (1, 0, 2)))
assert numpy.allclose(numpy.swapaxes(a, 0, 2),
cuda_ndarray.dimshuffle(b, (2, 1, 0)))
assert numpy.allclose(numpy.swapaxes(a, 1, 2),
cuda_ndarray.dimshuffle(b, (0, 2, 1)))
assert numpy.allclose(numpy.swapaxes(a, 1, 2)[None, :, None, :, :, None],
cuda_ndarray.dimshuffle(b, (-1, 0, -1, 2, 1, -1)))
# 4d
a = theano._asarray(rng.randn(3,11,4,5), dtype='float32')
a = theano._asarray(rng.randn(3, 11, 4, 5), dtype='float32')
b = cuda_ndarray.CudaNdarray(a)
assert numpy.allclose(numpy.swapaxes(a,0,1), cuda_ndarray.dimshuffle(b,(1,0,2,3)))
assert numpy.allclose(numpy.swapaxes(a,0,2), cuda_ndarray.dimshuffle(b,(2,1,0,3)))
assert numpy.allclose(numpy.swapaxes(a,0,3), cuda_ndarray.dimshuffle(b,(3,1,2,0)))
assert numpy.allclose(numpy.swapaxes(a,0,3), cuda_ndarray.dimshuffle(b,(3,1,2,0)))
assert numpy.allclose(numpy.swapaxes(a,0,3)[None,:,None,:,:,:], cuda_ndarray.dimshuffle(b,(-1,3,-1,1,2,0)))
assert numpy.allclose(numpy.swapaxes(a, 0, 1),
cuda_ndarray.dimshuffle(b, (1, 0, 2, 3)))
assert numpy.allclose(numpy.swapaxes(a, 0, 2),
cuda_ndarray.dimshuffle(b, (2, 1, 0, 3)))
assert numpy.allclose(numpy.swapaxes(a, 0, 3),
cuda_ndarray.dimshuffle(b, (3, 1, 2, 0)))
assert numpy.allclose(numpy.swapaxes(a, 0, 3),
cuda_ndarray.dimshuffle(b, (3, 1, 2, 0)))
assert numpy.allclose(numpy.swapaxes(a, 0, 3)[None, :, None, :, :, :],
cuda_ndarray.dimshuffle(b, (-1, 3, -1, 1, 2, 0)))
def test_dot():
......@@ -281,66 +319,72 @@ def test_dot():
assert _allclose(numpy.dot(a0, a1), cuda_ndarray.dot(b0, b1))
a1 = theano._asarray(rng.randn(6, 7), dtype='float32')
b1 = cuda_ndarray.CudaNdarray(a1)
numpy_version = numpy.dot(a0, a1.T)
transposed = cuda_ndarray.dimshuffle(b1,(1,0))
cuda_version = cuda_ndarray.dot(b0, transposed)
transposed = cuda_ndarray.dimshuffle(b1, (1, 0))
cuda_version = cuda_ndarray.dot(b0, transposed)
assert _allclose(numpy_version, cuda_version)
a1 = theano._asarray(rng.randn(7, 6), dtype='float32')
b1 = cuda_ndarray.CudaNdarray(a1)
a0 = theano._asarray(rng.randn(7, 4), dtype='float32')
b0 = cuda_ndarray.CudaNdarray(a0)
assert _allclose(numpy.dot(a0.T, a1),
cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0,(1,0)), b1))
cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0, (1, 0)), b1))
a1 = theano._asarray(rng.randn(6, 7), dtype='float32')
b1 = cuda_ndarray.CudaNdarray(a1)
assert _allclose(numpy.dot(a0.T, a1.T),
cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0,(1,0)),
cuda_ndarray.dimshuffle(b1,(1,0))))
cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0, (1, 0)),
cuda_ndarray.dimshuffle(b1, (1, 0))))
def test_sum():
shape = (2,3)
a0 = theano._asarray(numpy.arange(shape[0]*shape[1]).reshape(shape), dtype='float32')
shape = (2, 3)
a0 = theano._asarray(numpy.arange(shape[0] * shape[1]).reshape(shape),
dtype='float32')
b0 = cuda_ndarray.CudaNdarray(a0)
assert numpy.allclose(a0.sum(), numpy.asarray(b0.reduce_sum([1,1])))
assert numpy.allclose(a0.sum(),
numpy.asarray(b0.reduce_sum([1, 1])))
a0sum = a0.sum(axis=0)
b0sum = b0.reduce_sum([1,0])
b0sum = b0.reduce_sum([1, 0])
#print 'asum\n',a0sum
#print 'bsum\n',numpy.asarray(b0sum)
assert numpy.allclose(a0.sum(axis=0), numpy.asarray(b0.reduce_sum([1,0])))
assert numpy.allclose(a0.sum(axis=1), numpy.asarray(b0.reduce_sum([0,1])))
assert numpy.allclose(a0, numpy.asarray(b0.reduce_sum([0,0])))
assert numpy.allclose(a0.sum(axis=0),
numpy.asarray(b0.reduce_sum([1, 0])))
assert numpy.allclose(a0.sum(axis=1),
numpy.asarray(b0.reduce_sum([0, 1])))
assert numpy.allclose(a0, numpy.asarray(b0.reduce_sum([0, 0])))
shape = (3,4,5,6,7,8)
a0 = theano._asarray(numpy.arange(3*4*5*6*7*8).reshape(shape), dtype='float32')
shape = (3, 4, 5, 6, 7, 8)
a0 = theano._asarray(numpy.arange(3 * 4 * 5 * 6 * 7 * 8).reshape(shape),
dtype='float32')
b0 = cuda_ndarray.CudaNdarray(a0)
assert numpy.allclose(a0.sum(axis=5).sum(axis=3).sum(axis=0), numpy.asarray(b0.reduce_sum([1,0,0,1,0,1])))
assert numpy.allclose(a0.sum(axis=5).sum(axis=3).sum(axis=0),
numpy.asarray(b0.reduce_sum([1, 0, 0, 1, 0, 1])))
shape = (16,2048)
a0 = theano._asarray(numpy.arange(16*2048).reshape(shape), dtype='float32')
shape = (16, 2048)
a0 = theano._asarray(numpy.arange(16 * 2048).reshape(shape),
dtype='float32')
b0 = cuda_ndarray.CudaNdarray(a0)
assert numpy.allclose(a0.sum(axis=0), numpy.asarray(b0.reduce_sum([1,0])))
assert numpy.allclose(a0.sum(axis=0), numpy.asarray(b0.reduce_sum([1, 0])))
shape = (16,10)
shape = (16, 10)
a0 = theano._asarray(numpy.arange(160).reshape(shape), dtype='float32')
b0 = cuda_ndarray.CudaNdarray(a0)
assert numpy.allclose(a0.sum(), numpy.asarray(b0.reduce_sum([1,1])))
assert numpy.allclose(a0.sum(), numpy.asarray(b0.reduce_sum([1, 1])))
def test_reshape():
shapelist = [
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论