提交 91b67a6d authored 作者: Frederic Bastien's avatar Frederic Bastien

added GpuSoftmaxWithBias to make a real speed up to the tutorial logistic_cg

上级 80fc79ec
...@@ -368,3 +368,103 @@ class GpuSoftmax (Op): ...@@ -368,3 +368,103 @@ class GpuSoftmax (Op):
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'), inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]" "sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
]) ])
class GpuSoftmaxWithBias (Op):
    """GPU fused implementation of ``softmax(x + b)`` for a 2D input.

    Takes a matrix ``x`` and a bias vector ``b`` (length equal to the
    number of columns of ``x``) and computes the row-wise softmax of
    ``x + b`` in a single kernel, avoiding a separate elemwise addition.
    The output has the same type and shape as ``x``.
    """
    nin = 2
    nout = 1

    def __eq__(self, other):
        # The op has no parameters, so any two instances are equivalent.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x, b):
        return Apply(self, [x, b], [x.type()])

    def infer_shape(self, node, shape):
        # Output shape equals the shape of x; b only constrains x's columns.
        return [shape[0]]

    def c_code_cache_version(self):
        # Bumped to 2: the kernel-launch error message was corrected.
        # inline_softmax.code_version is included so cached modules are
        # rebuilt whenever the shared kernel-generation helper changes.
        return (2,) + inline_softmax.code_version

    def c_code(self, node, nodename, inp, out, sub):
        # Positional unpacking instead of Python-2 tuple parameters keeps
        # the signature portable; Theano calls c_code positionally.
        x, b = inp
        z, = out
        fail = sub['fail']
        return """
        if (%(x)s->nd != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error input");
            %(fail)s;
        }
        if (%(b)s->nd != 1)
        {
            PyErr_SetString(PyExc_ValueError, "rank error for the bias");
            %(fail)s;
        }
        if ((CudaNdarray_HOST_DIMS(%(x)s)[1] != CudaNdarray_HOST_DIMS(%(b)s)[0]))
        {
            PyErr_Format(PyExc_ValueError, "number of columns in x (%%ld) does not match length of b (%%ld)",
                         (long int)CudaNdarray_HOST_DIMS(%(x)s)[1], (long int)CudaNdarray_HOST_DIMS(%(b)s)[0]);
            %(fail)s;
        }
        // (Re)allocate the output only when it is missing or mis-shaped.
        if ((NULL == %(z)s)
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = (CudaNdarray*)CudaNdarray_new_null();
            if ((NULL == %(z)s)
                || CudaNdarray_alloc_contiguous(%(z)s, 2, CudaNdarray_HOST_DIMS(%(x)s)))
            {
                Py_XDECREF(%(z)s);
                %(z)s = NULL;
                %(fail)s;
            }
        }
        {
            // One block per row, one thread per column; shared memory holds
            // two float rows (see c_support_code_apply).
            kSoftmaxWithBias_%(nodename)s
                <<<
                    // todo: cap these at the card limits, implement loops in kernel
                    CudaNdarray_HOST_DIMS(%(x)s)[0],
                    CudaNdarray_HOST_DIMS(%(x)s)[1],
                    CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
                >>>(
                    CudaNdarray_HOST_DIMS(%(x)s)[0],
                    CudaNdarray_HOST_DIMS(%(x)s)[1],
                    CudaNdarray_DEV_DATA(%(x)s),
                    CudaNdarray_HOST_STRIDES(%(x)s)[0],
                    CudaNdarray_HOST_STRIDES(%(x)s)[1],
                    CudaNdarray_DEV_DATA(%(b)s),
                    CudaNdarray_HOST_STRIDES(%(b)s)[0],
                    CudaNdarray_DEV_DATA(%(z)s) //guarantee c contig
                    );
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
            {
                // Fixed: previously reported "kSoftmax_..." although the
                // launched kernel is kSoftmaxWithBias_... .
                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmaxWithBias_%(nodename)s", cudaGetErrorString(err));
                %(fail)s;
            }
        }
        assert(%(z)s);
        """ % locals()

    def c_support_code_apply(self, node, nodename):
        # Kernel: each block handles one row; dynamic shared memory stores
        # the biased row (buf) plus scratch for the reduction (buf2).
        return nvcc_kernel("kSoftmaxWithBias_%s"%nodename,
                params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
                    'const float * b', 'const int sb0',
                    'float * sm'],
                body=[
                    "extern __shared__ float buf[]",
                    "float * buf2 = buf + N",
                    "buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]",
                    "buf[threadIdx.x] += b[threadIdx.x * sb0]",
                    "buf2[threadIdx.x] = buf[threadIdx.x]",
                    "__syncthreads()",
                    inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
                    "sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
                    ])
...@@ -11,7 +11,7 @@ from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactor ...@@ -11,7 +11,7 @@ from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactor
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax) GpuSoftmax, GpuSoftmaxWithBias)
from theano.compile import optdb from theano.compile import optdb
#optdb.print_summary() # this shows what is currently registered (in a so-far crude way...) #optdb.print_summary() # this shows what is currently registered (in a so-far crude way...)
...@@ -386,6 +386,18 @@ def local_gpu_softmax(node): ...@@ -386,6 +386,18 @@ def local_gpu_softmax(node):
return [host_from_gpu(gpu_sm)] return [host_from_gpu(gpu_sm)]
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_softmax_with_bias(node):
    """Replace a host SoftmaxWithBias by its GPU counterpart.

    Fires when either input is a host copy of a GPU variable; the
    result is transferred back to the host so the surrounding graph
    is unchanged.
    """
    if not isinstance(node.op, tensor.nnet.SoftmaxWithBias):
        return False
    x, b = node.inputs

    def comes_from_gpu(v):
        # True when v is produced by a host_from_gpu transfer.
        return v.owner and v.owner.op == host_from_gpu

    if comes_from_gpu(x) or comes_from_gpu(b):
        gpu_out = GpuSoftmaxWithBias()(gpu_from_host(x), gpu_from_host(b))
        return [host_from_gpu(gpu_out)]
    return False
#### Convolution, maxpooling #### Convolution, maxpooling
from theano.tensor.nnet import conv from theano.tensor.nnet import conv
@register_opt() @register_opt()
......
...@@ -16,8 +16,10 @@ from theano.sandbox.cuda.type import CudaNdarrayType ...@@ -16,8 +16,10 @@ from theano.sandbox.cuda.type import CudaNdarrayType
if theano.config.mode=='FAST_COMPILE': if theano.config.mode=='FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
...@@ -49,3 +51,13 @@ def test_int_pow(): ...@@ -49,3 +51,13 @@ def test_int_pow():
#theano.printing.debugprint(f) #theano.printing.debugprint(f)
def test_softmax_with_bias():
    """GpuSoftmaxWithBias is introduced on GPU and matches the CPU result."""
    x = tensor.fmatrix()
    b = tensor.fvector()
    # Build the symbolic output once and compile it under both modes.
    out = tensor.nnet.nnet.SoftmaxWithBias()(x, b)
    f = theano.function([x, b], out, mode=mode_with_gpu)
    f2 = theano.function([x, b], out, mode=mode_without_gpu)
    # Search the whole graph rather than relying on a hard-coded toposort
    # index, which breaks whenever an unrelated optimization changes it.
    assert any(isinstance(node.op, cuda.nnet.GpuSoftmaxWithBias)
               for node in f.maker.env.toposort())
    # fmatrix/fvector are float32; numpy.random.rand returns float64,
    # so cast explicitly to avoid a dtype mismatch at call time.
    xv = numpy.random.rand(7, 8).astype('float32')
    bv = numpy.random.rand(8).astype('float32')
    assert numpy.allclose(f(xv, bv), f2(xv, bv))
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论