提交 91fd3c4f authored 作者: James Bergstra's avatar James Bergstra

merge

""" Helper routines for generating gpu kernels for nvcc.
"""
def nvcc_kernel(name, params, body):
"""Return the c code of a kernel function.
:param params: the parameters to the function as one or more strings
:param body: the [nested] list of statements for the body of the function. These will be
separated by ';' characters.
"""
paramstr = ', '.join(params)
def flatbody():
for b in body:
if isinstance(b, (list, tuple)):
for bb in b:
yield bb
else:
yield b
bodystr = ';\n'.join(flatbody())
return """__global__ void %(name)s (%(paramstr)s)
{
%(bodystr)s;
}
""" %locals()
def code_version(version):
"""decorator to support version-based cache mechanism"""
if not isinstance(version, tuple):
raise TypeError('version must be tuple', version)
def deco(f):
f.code_version = version
return f
return deco
UNVERSIONED = ()
@code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn):
"""
Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer
:param pos: index of executing thread
:param count: number of executing threads
:param manner_fn: a function that accepts strings of arguments a and b, and returns c code
for their reduction. (Example: return "%(a)s + %(b)s" for a sum reduction).
:postcondition:
This function leaves the answer in position 0 of the buffer. The rest of the buffer is
trashed by this function.
:note: buf should be in gpu shared memory, we access it many times.
"""
loop_line = manner_fn("%s[%s]"%(buf,pos), "%s[i]" %(buf))
r_16 = manner_fn("%s[%s]" %(buf, pos), "%s[%s+16]" %(buf, pos))
r_8 = manner_fn("%s[%s]" %(buf, pos), "%s[%s+8]" %(buf, pos))
r_4 = manner_fn("%s[%s]" %(buf, pos), "%s[%s+4]" %(buf, pos))
r_2 = manner_fn("%s[%s]" %(buf, pos), "%s[%s+2]" %(buf, pos))
r_1 = manner_fn("%s[%s]" %(buf, pos), "%s[%s+1]" %(buf, pos))
return """
{
// This function trashes buf[1..N], leaving the max in buf[0].
if (%(pos)s < warpSize)
{
for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
{
%(buf)s[%(pos)s] = %(loop_line)s;
}
if (%(pos)s < 16)
{
//reduce so that %(pos)s 0 has the sum of everything
if(%(pos)s + 16 < %(N)s)
%(buf)s[%(pos)s] = %(r_16)s;
if(%(pos)s + 8 < %(N)s)
%(buf)s[%(pos)s] = %(r_8)s;
if(%(pos)s + 4 < %(N)s)
%(buf)s[%(pos)s] = %(r_4)s;
if(%(pos)s + 2 < %(N)s)
%(buf)s[%(pos)s] = %(r_2)s;
if(%(pos)s + 1 < %(N)s)
%(buf)s[%(pos)s] = %(r_1)s;
}
}
}
""" % locals()
@code_version(inline_reduce.code_version)
def inline_reduce_max(N, buf, pos, count):
return inline_reduce(N, buf, pos, count, lambda a, b: "max(%s, %s)"%(a,b))
@code_version(inline_reduce.code_version)
def inline_reduce_sum(N, buf, pos, count):
return inline_reduce(N, buf, pos, count, lambda a, b: "%s + %s"%(a,b))
@code_version(inline_reduce.code_version)
def inline_reduce_min(N, buf, pos, count):
return inline_reduce(N, buf, pos, count, lambda a, b: "min(%s, %s)"%(a,b))
@code_version(inline_reduce.code_version)
def inline_reduce_prod(N, buf, pos, count):
return inline_reduce(N, buf, pos, count, lambda a, b: "%s * %s"%(a,b))
@code_version((1,) + inline_reduce_max.code_version + inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount):
"""
:Precondition: buf and buf2 contain two identical copies of the input to softmax
:Postcondition: buf contains the softmax, buf2 contains un-normalized softmax
:note: buf and buf2 should be in gpu shared memory, we access it many times.
"""
return [
#get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()',
'float row_max = '+buf+'[0]',
'__syncthreads()',
buf+'['+threadPos+'] = exp('+buf2+'['+threadPos+'] - row_max)',
buf2+'['+threadPos+'] = '+buf+'['+threadPos+']',
'__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()',
'float row_sum = '+buf+'[0]',
'__syncthreads()',
# divide each exp() result by the sum to complete the job.
buf+'['+threadPos+'] = '+buf2+'['+threadPos+'] / row_sum'
]
......@@ -5,6 +5,8 @@ import StringIO
import cuda_ndarray
from .type import CudaNdarrayType
from .kernel_codegen import nvcc_kernel, inline_reduce_max, inline_reduce_sum, inline_softmax
class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
nin=3
nout=3
......@@ -287,3 +289,81 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
}
""" % locals()
class GpuSoftmax (Op):
"""Writeme"""
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, x):
return Apply(self, [x],[x.type()])
def c_code_cache_version(self):
#return ()
return (1,) + inline_softmax.code_version
def c_code(self, node, nodename, (x,), (z,), sub):
fail = sub['fail']
return """
if (%(x)s->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s;
}
if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1]))
{
Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == %(z)s)
|| CudaNdarray_alloc_contiguous(%(z)s, 2, CudaNdarray_HOST_DIMS(%(x)s)))
{
Py_XDECREF(%(z)s);
%(z)s = NULL;
%(fail)s;
}
}
{
kSoftmax_%(nodename)s
<<<
// todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(z)s) //guarantee c contig
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err));
%(fail)s;
}
}
assert(%(z)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
return nvcc_kernel("kSoftmax_%s"%nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'float * sm'],
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
])
......@@ -7,7 +7,8 @@ from theano_cuda_ndarray.blas import gpu_dot22, gpu_gemm, GpuConv
from theano_cuda_ndarray.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from theano_cuda_ndarray.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx)
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax)
from theano.compile import optdb
#optdb.print_summary() # this shows what is currently registered (in a so-far crude way...)
......@@ -285,6 +286,16 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
return [host_from_gpu(gpu_dx)]
return False
@register_opt()
@local_optimizer([])
def local_gpu_softmax(node):
if isinstance(node.op, tensor.nnet.Softmax):
x, = node.inputs
if x.owner and x.owner.op == host_from_gpu:
gpu_x, = x.owner.inputs
gpu_sm = GpuSoftmax()(gpu_x)
return [host_from_gpu(gpu_sm)]
return False
#### Convolution, maxpooling
import theano.sandbox.conv
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论