提交 85b0821d authored 作者: James Bergstra's avatar James Bergstra

merge

......@@ -1414,10 +1414,12 @@ class DebugMode(Mode):
check_c_code=None,
check_py_code=None,
check_isfinite=None,
require_matching_strides=None):
require_matching_strides=None,
linker=None):
"""Initialize member variables.
If any of these arguments (except optimizer) is not None, it overrides the class default.
The linker arguments is not used. It is set their to allow Mode.requiring() and some other fct to work with DebugMode too.
"""
super(DebugMode, self).__init__(
optimizer=optimizer,
......
......@@ -12,8 +12,6 @@ from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
import opt
import cuda_ndarray
import theano.compile.sandbox
import os
import theano.config as config
from theano.compile import optdb
......
......@@ -583,7 +583,7 @@ class GpuSum(Op):
def _k_init(self, *args):
return """
const int threadCount = blockDim.x * blockDim.y * blockDim.y;
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
......@@ -835,6 +835,38 @@ class GpuSum(Op):
}
""" % locals()
def c_code_reduce_1111(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[2],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
//get as many y threads as we can fit
while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
{
if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1])
break;
n_threads.y += 1;
}
n_threads.y -= 1;
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
{
if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
break;
n_threads.z += 1;
}
n_threads.z -= 1;
dim3 n_blocks(1,1,1);
%(makecall)s
}
""" % locals()
def c_code_reduce_1011(self, sio, node, name, x, z, fail):
print >> sio, """
{
......@@ -892,7 +924,7 @@ class GpuSum(Op):
def c_code_cache_version(self):
#return ()
return (7,)
return (8,)
def c_support_code_apply(self, node, nodename):
......@@ -900,6 +932,7 @@ class GpuSum(Op):
if self.reduce_mask == (1,):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
print >> sio, """
static __global__ void kernel_reduce_sum_1_%(nodename)s(
const unsigned int d0,
......@@ -921,36 +954,13 @@ class GpuSum(Op):
float Ai = A[i0 * sA0];
mysum += Ai;
}
buf[threadNum] = mysum;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[0] = buf[0];
}
}
}
%(reducebuf)s
}
""" %locals()
if self.reduce_mask == (1,1):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
print >> sio, """
static __global__ void kernel_reduce_sum_11_%(nodename)s(
const int d0,
......@@ -976,31 +986,7 @@ class GpuSum(Op):
mysum += Ai;
}
}
buf[threadNum] = mysum;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[0] = buf[0];
}
}
}
%(reducebuf)s
}
""" %locals()
if self.reduce_mask == (1,0):
......@@ -1010,6 +996,7 @@ class GpuSum(Op):
#TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column).
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]')
print >> sio, """
static __global__ void kernel_reduce_sum_10_%(nodename)s(
const int d0,
......@@ -1032,31 +1019,7 @@ class GpuSum(Op):
float Ai = A[i0 * sA0 + blockIdx.x * sA1];
mysum += Ai;
}
buf[threadNum] = mysum;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[blockIdx.x * sZ0] = buf[0];
}
}
}
%(reducebuf)s
}
""" %locals()
if self.reduce_mask == (1,1,0):
......@@ -1146,6 +1109,7 @@ class GpuSum(Op):
if self.reduce_mask == (0,0,1):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
print >> sio, """
static __global__ void kernel_reduce_sum_001_%(nodename)s(
const int d0,
......@@ -1172,36 +1136,36 @@ class GpuSum(Op):
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
}
buf[threadNum] = mysum;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
%(reducebuf)s
}
}
}
""" %locals()
if self.reduce_mask == (1,1,1,1):
reducebuf = self._k_reduce_buf('Z[0]')
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
%(decl)s
{
%(init)s
mysum = 0;
for (int i0 = 0; i0 < d0; i0++)
for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)
{
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[i0 * sZ0 + i1 * sZ1] = buf[0];
}
}
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
}
}
}
%(reducebuf)s
}
""" %locals()
if self.reduce_mask == (1,0,1,1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]')
print >> sio, """
static __global__ void kernel_reduce_sum_1011_%(nodename)s(
const unsigned int d0,
......@@ -1232,31 +1196,7 @@ class GpuSum(Op):
}
}
}
buf[threadNum] = mysum;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[blockIdx.x*sZ0] = buf[0];
}
}
}
%(reducebuf)s
}
""" %locals()
return sio.getvalue()
......
"""
This file implement 3 different version of the elemwise op on the gpu. Only NaiveAlgo is used and it is not very naive now.
The elemwise fct are also used with scalar operation! So it can happen that ndim is 0 as with all scalar type.
"""
import StringIO, sys
import numpy
from theano import Op, Type, Apply, Variable, Constant
......
import sys, time
from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc
from theano import shared
from theano.compile.pfunc import pfunc
from theano import tensor
import numpy
import theano
import theano.tensor as T
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
......@@ -13,6 +16,7 @@ except ImportError:
raise SkipTest('Optional package cuda_ndarray not available')
import theano.sandbox.cuda as tcn
import cuda_ndarray as cuda
import theano.compile.mode
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
......@@ -20,6 +24,63 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def tes_use():
tcn.use()
def test_sum():
"""
test sum pattern 1, 11, 10, 100, 110, 001, 111, 1011, 1111
TODO: test with broadcast
"""
for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
a = tensor.TensorType('float32',(False,)*len(shape))()
b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = numpy.asarray(val,dtype='float32')
f = theano.function([a],b, mode=mode_with_gpu)
f2 = theano.function([a],b)
assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
assert numpy.allclose(f2(val),f(val))
#test with broadcast
for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
shape = numpy.asarray(shape)*2
a = tensor.TensorType('float32',(False,)*len(shape))()
a2 = tcn.CudaNdarrayType((False,)*len(shape))()
b = T.Sum(pattern)(a)
b2 = T.Sum(pattern)(a2)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = numpy.asarray(val,dtype='float32')
val2 = cuda.CudaNdarray(val)
if len(shape)==1:
val = val[::2]
val2 = val2[::2]
elif len(shape)==2:
val = val[::2,::2]
val2 = val2[::2,::2]
elif len(shape)==3:
val = val[::2,::2,::2]
val2 = val2[::2,::2,::2]
elif len(shape)==4:
val = val[::2,::2,::2,::2]
val2 = val2[::2,::2,::2,::2]
f = theano.function([a],b)
f2 = theano.function([a2],b2, mode=mode_with_gpu)
assert tcn.GpuSum in [x.op.__class__ for x in f2.maker.env.toposort()]
assert T.Sum in [x.op.__class__ for x in f.maker.env.toposort()]
assert numpy.allclose(f2(val2),f(val))
def test_elemwise0():
a = tcn.shared_constructor(numpy.random.rand(4,4), 'a')
......
......@@ -2,7 +2,7 @@ import numpy
from theano import Op, Type, Apply, Variable, Constant
from theano import tensor
from theano.compile.sandbox.sharedvalue import shared, SharedVariable, shared_constructor
from theano.compile import shared, SharedVariable, shared_constructor
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.type_support import filter as type_support_filter
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论