提交 85b0821d authored 作者: James Bergstra's avatar James Bergstra

merge

...@@ -1414,10 +1414,12 @@ class DebugMode(Mode): ...@@ -1414,10 +1414,12 @@ class DebugMode(Mode):
check_c_code=None, check_c_code=None,
check_py_code=None, check_py_code=None,
check_isfinite=None, check_isfinite=None,
require_matching_strides=None): require_matching_strides=None,
linker=None):
"""Initialize member variables. """Initialize member variables.
If any of these arguments (except optimizer) is not None, it overrides the class default. If any of these arguments (except optimizer) is not None, it overrides the class default.
The linker arguments is not used. It is set their to allow Mode.requiring() and some other fct to work with DebugMode too.
""" """
super(DebugMode, self).__init__( super(DebugMode, self).__init__(
optimizer=optimizer, optimizer=optimizer,
......
...@@ -12,8 +12,6 @@ from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise, ...@@ -12,8 +12,6 @@ from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
import opt import opt
import cuda_ndarray import cuda_ndarray
import theano.compile.sandbox
import os import os
import theano.config as config import theano.config as config
from theano.compile import optdb from theano.compile import optdb
......
...@@ -583,7 +583,7 @@ class GpuSum(Op): ...@@ -583,7 +583,7 @@ class GpuSum(Op):
def _k_init(self, *args): def _k_init(self, *args):
return """ return """
const int threadCount = blockDim.x * blockDim.y * blockDim.y; const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[]; extern __shared__ float buf[];
float mysum = 0.0f; float mysum = 0.0f;
...@@ -835,6 +835,38 @@ class GpuSum(Op): ...@@ -835,6 +835,38 @@ class GpuSum(Op):
} }
""" % locals() """ % locals()
def c_code_reduce_1111(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[2],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
//get as many y threads as we can fit
while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
{
if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1])
break;
n_threads.y += 1;
}
n_threads.y -= 1;
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
{
if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
break;
n_threads.z += 1;
}
n_threads.z -= 1;
dim3 n_blocks(1,1,1);
%(makecall)s
}
""" % locals()
def c_code_reduce_1011(self, sio, node, name, x, z, fail): def c_code_reduce_1011(self, sio, node, name, x, z, fail):
print >> sio, """ print >> sio, """
{ {
...@@ -892,7 +924,7 @@ class GpuSum(Op): ...@@ -892,7 +924,7 @@ class GpuSum(Op):
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (7,) return (8,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
...@@ -900,6 +932,7 @@ class GpuSum(Op): ...@@ -900,6 +932,7 @@ class GpuSum(Op):
if self.reduce_mask == (1,): if self.reduce_mask == (1,):
#this kernel is ok for up to a few thousand elements, but #this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_1_%(nodename)s( static __global__ void kernel_reduce_sum_1_%(nodename)s(
const unsigned int d0, const unsigned int d0,
...@@ -921,36 +954,13 @@ class GpuSum(Op): ...@@ -921,36 +954,13 @@ class GpuSum(Op):
float Ai = A[i0 * sA0]; float Ai = A[i0 * sA0];
mysum += Ai; mysum += Ai;
} }
buf[threadNum] = mysum; %(reducebuf)s
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[0] = buf[0];
}
}
}
} }
""" %locals() """ %locals()
if self.reduce_mask == (1,1): if self.reduce_mask == (1,1):
#this kernel is ok for up to a few thousand elements, but #this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_11_%(nodename)s( static __global__ void kernel_reduce_sum_11_%(nodename)s(
const int d0, const int d0,
...@@ -976,31 +986,7 @@ class GpuSum(Op): ...@@ -976,31 +986,7 @@ class GpuSum(Op):
mysum += Ai; mysum += Ai;
} }
} }
buf[threadNum] = mysum; %(reducebuf)s
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[0] = buf[0];
}
}
}
} }
""" %locals() """ %locals()
if self.reduce_mask == (1,0): if self.reduce_mask == (1,0):
...@@ -1010,6 +996,7 @@ class GpuSum(Op): ...@@ -1010,6 +996,7 @@ class GpuSum(Op):
#TODO: This kernel is pretty inefficient in terms of reading, because if A is #TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous # c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column). # memory (a segment of a column).
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_10_%(nodename)s( static __global__ void kernel_reduce_sum_10_%(nodename)s(
const int d0, const int d0,
...@@ -1032,31 +1019,7 @@ class GpuSum(Op): ...@@ -1032,31 +1019,7 @@ class GpuSum(Op):
float Ai = A[i0 * sA0 + blockIdx.x * sA1]; float Ai = A[i0 * sA0 + blockIdx.x * sA1];
mysum += Ai; mysum += Ai;
} }
buf[threadNum] = mysum; %(reducebuf)s
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[blockIdx.x * sZ0] = buf[0];
}
}
}
} }
""" %locals() """ %locals()
if self.reduce_mask == (1,1,0): if self.reduce_mask == (1,1,0):
...@@ -1146,6 +1109,7 @@ class GpuSum(Op): ...@@ -1146,6 +1109,7 @@ class GpuSum(Op):
if self.reduce_mask == (0,0,1): if self.reduce_mask == (0,0,1):
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_001_%(nodename)s( static __global__ void kernel_reduce_sum_001_%(nodename)s(
const int d0, const int d0,
...@@ -1172,36 +1136,36 @@ class GpuSum(Op): ...@@ -1172,36 +1136,36 @@ class GpuSum(Op):
{ {
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
} }
buf[threadNum] = mysum; %(reducebuf)s
__syncthreads(); }
}
// rest of function is handled by one warp }
if (threadNum < warpSize) """ %locals()
if self.reduce_mask == (1,1,1,1):
reducebuf = self._k_reduce_buf('Z[0]')
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
%(decl)s
{ {
for (int i = threadNum + warpSize; i < threadCount; i += warpSize) %(init)s
mysum = 0;
for (int i0 = 0; i0 < d0; i0++)
for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)
{ {
mysum += buf[i]; for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
}
buf[threadNum] = mysum;
if (threadNum < 16)
{ {
//reduce so that threadNum 0 has the sum of everything for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{ {
Z[i0 * sZ0 + i1 * sZ1] = buf[0]; mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
}
}
} }
} }
} }
%(reducebuf)s
} }
""" %locals() """ %locals()
if self.reduce_mask == (1,0,1,1): if self.reduce_mask == (1,0,1,1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_1011_%(nodename)s( static __global__ void kernel_reduce_sum_1011_%(nodename)s(
const unsigned int d0, const unsigned int d0,
...@@ -1232,31 +1196,7 @@ class GpuSum(Op): ...@@ -1232,31 +1196,7 @@ class GpuSum(Op):
} }
} }
} }
buf[threadNum] = mysum; %(reducebuf)s
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[blockIdx.x*sZ0] = buf[0];
}
}
}
} }
""" %locals() """ %locals()
return sio.getvalue() return sio.getvalue()
......
"""
This file implement 3 different version of the elemwise op on the gpu. Only NaiveAlgo is used and it is not very naive now.
The elemwise fct are also used with scalar operation! So it can happen that ndim is 0 as with all scalar type.
"""
import StringIO, sys import StringIO, sys
import numpy import numpy
from theano import Op, Type, Apply, Variable, Constant from theano import Op, Type, Apply, Variable, Constant
......
import sys, time import sys, time
from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc from theano import shared
from theano.compile.pfunc import pfunc
from theano import tensor from theano import tensor
import numpy import numpy
import theano
import theano.tensor as T
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
...@@ -13,6 +16,7 @@ except ImportError: ...@@ -13,6 +16,7 @@ except ImportError:
raise SkipTest('Optional package cuda_ndarray not available') raise SkipTest('Optional package cuda_ndarray not available')
import theano.sandbox.cuda as tcn import theano.sandbox.cuda as tcn
import cuda_ndarray as cuda
import theano.compile.mode import theano.compile.mode
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
...@@ -20,6 +24,63 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') ...@@ -20,6 +24,63 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def tes_use(): def tes_use():
tcn.use() tcn.use()
def test_sum():
"""
test sum pattern 1, 11, 10, 100, 110, 001, 111, 1011, 1111
TODO: test with broadcast
"""
for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
a = tensor.TensorType('float32',(False,)*len(shape))()
b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = numpy.asarray(val,dtype='float32')
f = theano.function([a],b, mode=mode_with_gpu)
f2 = theano.function([a],b)
assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
assert numpy.allclose(f2(val),f(val))
#test with broadcast
for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
shape = numpy.asarray(shape)*2
a = tensor.TensorType('float32',(False,)*len(shape))()
a2 = tcn.CudaNdarrayType((False,)*len(shape))()
b = T.Sum(pattern)(a)
b2 = T.Sum(pattern)(a2)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = numpy.asarray(val,dtype='float32')
val2 = cuda.CudaNdarray(val)
if len(shape)==1:
val = val[::2]
val2 = val2[::2]
elif len(shape)==2:
val = val[::2,::2]
val2 = val2[::2,::2]
elif len(shape)==3:
val = val[::2,::2,::2]
val2 = val2[::2,::2,::2]
elif len(shape)==4:
val = val[::2,::2,::2,::2]
val2 = val2[::2,::2,::2,::2]
f = theano.function([a],b)
f2 = theano.function([a2],b2, mode=mode_with_gpu)
assert tcn.GpuSum in [x.op.__class__ for x in f2.maker.env.toposort()]
assert T.Sum in [x.op.__class__ for x in f.maker.env.toposort()]
assert numpy.allclose(f2(val2),f(val))
def test_elemwise0(): def test_elemwise0():
a = tcn.shared_constructor(numpy.random.rand(4,4), 'a') a = tcn.shared_constructor(numpy.random.rand(4,4), 'a')
......
...@@ -2,7 +2,7 @@ import numpy ...@@ -2,7 +2,7 @@ import numpy
from theano import Op, Type, Apply, Variable, Constant from theano import Op, Type, Apply, Variable, Constant
from theano import tensor from theano import tensor
from theano.compile.sandbox.sharedvalue import shared, SharedVariable, shared_constructor from theano.compile import shared, SharedVariable, shared_constructor
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.type_support import filter as type_support_filter from theano.sandbox.cuda.type_support import filter as type_support_filter
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论