提交 ae8ce3c6 authored 作者: Pierre Luc Carrier's avatar Pierre Luc Carrier

Correct various bugs with ops GpuSoftmax and GpuSoftmaxWithBias. Current version…

Correct various bugs with ops GpuSoftmax and GpuSoftmaxWithBias. Current version passes tests with float32
上级 6ef5d264
""" Helper routines for generating gpu kernels for nvcc.
"""
def nvcc_kernel(name, params, body):
    """Return the C code of a ``__global__`` kernel function.

    :param name: name of the generated kernel function
    :param params: the parameters to the function as one or more strings
    :param body: the [nested] list of statements for the body of the
        function. These will be separated by ';' characters.

    :return: a string containing the full kernel definition.
    """
    paramstr = ', '.join(params)

    def flatbody(stmts):
        # Recursively flatten arbitrarily nested lists/tuples of
        # statements.  (The docstring promised nested lists, but the
        # previous implementation only flattened a single level.)
        for stmt in stmts:
            if isinstance(stmt, (list, tuple)):
                for sub in flatbody(stmt):
                    yield sub
            else:
                yield stmt

    bodystr = ';\n'.join(flatbody(body))
    return """__global__ void %(name)s (%(paramstr)s)
    {
        %(bodystr)s;
    }
    """ % locals()
def code_version(version):
    """Decorator attaching a cache-version tuple to a code generator.

    The tuple is stored on the decorated function as ``code_version`` so
    that callers can invalidate compiled-code caches when the generated
    code changes.
    """
    if not isinstance(version, tuple):
        raise TypeError('version must be tuple', version)

    def attach(fn):
        fn.code_version = version
        return fn

    return attach
# Empty version tuple for generators with no cacheable version --
# presumably used to opt a generator out of the version-based cache
# enabled by ``code_version``; confirm against callers.
UNVERSIONED = ()
@code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn):
    """Return C++ code for a function that reduces a contiguous buffer.

    All arguments except ``manner_fn`` are Python strings that are
    spliced verbatim into the generated CUDA code.

    :param N: length of the buffer
    :param buf: buffer pointer
    :param pos: index of executing thread
    :param count: number of executing threads (currently unused by the
        generated code, which relies on ``warpSize`` instead)
    :param manner_fn: a function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).

    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.

    :note: buf should be in gpu shared memory, we access it many times.
    """
    # Pairwise combination used in the strided accumulation loop:
    # buf[pos] <- manner_fn(buf[pos], buf[i]).
    loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
    # Combination steps for the final intra-warp tree reduction
    # (offsets 16, 8, 4, 2, 1).
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
    r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
    r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
    # NOTE(review): the generated tree reduction has no __syncthreads()
    # between steps; it appears to assume the participating threads
    # belong to a single warp executing in lock-step -- confirm this
    # holds on the targeted GPU architectures.
    return """
    {
        // This function trashes buf[1..warpSize],
        // leaving the reduction result in buf[0].
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the sum of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
@code_version(inline_reduce.code_version)
def inline_reduce_max(N, buf, pos, count):
    """C++ code reducing ``buf`` with the ``max`` operator (see inline_reduce)."""
    def take_max(a, b):
        return "max(%s, %s)" % (a, b)

    return inline_reduce(N, buf, pos, count, take_max)
@code_version(inline_reduce.code_version)
def inline_reduce_sum(N, buf, pos, count):
    """C++ code reducing ``buf`` with addition (see inline_reduce)."""
    def add(a, b):
        return "%s + %s" % (a, b)

    return inline_reduce(N, buf, pos, count, add)
@code_version(inline_reduce.code_version)
def inline_reduce_min(N, buf, pos, count):
    """C++ code reducing ``buf`` with the ``min`` operator (see inline_reduce)."""
    def take_min(a, b):
        return "min(%s, %s)" % (a, b)

    return inline_reduce(N, buf, pos, count, take_min)
@code_version(inline_reduce.code_version)
def inline_reduce_prod(N, buf, pos, count):
    """C++ code reducing ``buf`` with multiplication (see inline_reduce)."""
    def multiply(a, b):
        return "%s * %s" % (a, b)

    return inline_reduce(N, buf, pos, count, multiply)
@code_version((2,) + inline_reduce_max.code_version +
              inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount):
    """Generate the list of C statements computing a softmax over a row.

    :param N: length of the buffer
    :param buf: first shared-memory buffer (receives the softmax)
    :param buf2: second shared-memory buffer (receives the un-normalized
        softmax)
    :param threadPos: index of executing thread
    :param threadCount: number of executing threads

    :Precondition: buf and buf2 contain two identical copies of the input
        to softmax
    :Postcondition: buf contains the softmax, buf2 contains un-normalized
        softmax

    :note: buf and buf2 should be in gpu shared memory, we access it many
        times
    :note2: We use __i as an int variable in a loop
    """
    sync = '__syncthreads()'
    loop_head = 'for(int __i=%s; __i<%s; __i+=%s){' % (threadPos, N,
                                                       threadCount)
    stmts = [
        # Reduce buf to the row maximum (trashes every slot but buf[0]).
        inline_reduce_max(N, buf, threadPos, threadCount),
        sync,
        'float row_max = %s[0]' % buf,
        sync,
        # Exponentiate, shifted by the row max, into both buffers.
        loop_head,
        '%s[__i] = exp(%s[__i] - row_max)' % (buf, buf2),
        '%s[__i] = %s[__i]' % (buf2, buf),
        '}',
        sync,
        # Reduce buf to the sum of the exponentials.
        inline_reduce_sum(N, buf, threadPos, threadCount),
        sync,
        'float row_sum = %s[0]' % buf,
        sync,
        # Normalize: divide each exp() result by the row sum.
        loop_head,
        '%s[__i] = %s[__i] / row_sum' % (buf, buf2),
        '}',
        sync,
    ]
    return stmts
@code_version((2,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                               manner_fn, manner_init,
                               b='', stride_b=''):
    """Return C++ code for a function that reduces a strided row.

    Each of the ``count`` threads first accumulates every ``count``-th
    element of ``x`` (optionally adding the bias ``b``) into a private
    register, the partial results are written to ``buf``, and a final
    intra-warp tree reduction leaves the answer in ``buf[0]``.

    All arguments except ``manner_fn``/``manner_init`` are Python strings
    spliced verbatim into the generated CUDA code.

    :param N: length of the row
    :param buf: buffer pointer of size warpSize * sizeof(float)
    :param x: pointer to the row to reduce
    :param stride_x: stride between consecutive elements of x
    :param pos: index of executing thread
    :param count: number of executing threads
    :param manner_fn: a function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).
    :param manner_init: a function that accepts strings of arguments a
        and return c code for its initialization
    :param b: Optional, pointer to the bias
    :param stride_b: Optional, the stride of b if b is provided

    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.

    :note: buf should be in gpu shared memory, we access it many times.
    """
    # Per-thread initialization and accumulation expressions, with or
    # without the optional bias term.
    if b:
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s] +"
                           " %(b)s[%(pos)s * %(stride_b)s]" % locals())
        loop_line = manner_fn("red",
                              manner_init("%(x)s[i * %(stride_x)s] + "
                                          "%(b)s[i * %(stride_b)s]" %
                                          locals()))
    else:
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
        loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
                                                 locals()))
    # Combination of two shared-memory slots during the warp reduction.
    loop_line2 = manner_fn("%s[%s]" % (buf, pos),
                           "%s[i]" % buf)
    # Combination steps for the final intra-warp tree reduction.
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
    r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
    r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
    # Bug fix: the partial result was stored through a hard-coded "buf"
    # name instead of %(buf)s, which generated broken code for any
    # caller passing a differently named buffer.
    # NOTE(review): the tree-reduction guards compare against %(N)s even
    # though only min(count, warpSize) partials are live in buf, and the
    # init expression reads x[pos * stride_x] even when pos >= N --
    # presumably callers guarantee count <= N; confirm.
    return """
    {
        // This function trashes buf[1..n_threads],
        // leaving the reduction result in buf[0].
        float red = %(init)s;
        #pragma unroll 16
        for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
            red = %(loop_line)s;
        }
        %(buf)s[%(pos)s] = red;
        __syncthreads();
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line2)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the reduction of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
                                   b='', stride_b=''):
    """Strided max-reduction of row ``x`` into ``buf`` (see
    inline_reduce_fixed_shared)."""
    def combine(a, b):
        return "max(%s, %s)" % (a, b)

    def seed(a):
        return a

    return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                                      combine, seed, b, stride_b)
@code_version((2,) + inline_reduce_fixed_shared.code_version +
              inline_reduce_fixed_shared_max.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x,
                                sm, sm_stride,
                                threadPos, threadCount,
                                b='', stride_b=''):
    """Generate the list of C statements computing a softmax over a row,
    using only a warpSize-sized shared buffer.

    :param N: length of the buffer, at least warpSize (32).
    :param buf: a shared memory buffer of size warpSize * sizeof(float)
    :param x: a ptr to the gpu memory where the row is stored
    :param stride_x: the stride between each element in x
    :param sm: a ptr to the gpu memory to store the result
    :param sm_stride: the stride between each sm element
    :param threadPos: index of executing thread
    :param threadCount: number of executing threads
    :param b: Optional, pointer to the bias
    :param stride_b: Optional, the stride of b if b is provided
    :Precondition: buf is empty
    :Postcondition: buf[0] contains the row sum of exponentials
        (the softmax denominator); sm contains the softmax of the row.
    :note: buf should be in gpu shared memory, we access it many times.
    :note2: We use tx as an int variable in a loop
    """
    ret = [
        # Get the row max (trashing all of buf but buf[0]).
        inline_reduce_fixed_shared_max(N, buf, x, stride_x,
                                       threadPos, threadCount, b, stride_b),
        '__syncthreads()',
        'float row_max = ' + buf + '[0]',
        '__syncthreads()',
        # Sum exp(x - row_max) over the row (again leaving buf[0]).
        inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount,
                                   lambda a, b: "%s + %s" % (a, b),
                                   lambda a: "exp(%s - row_max)" % a,
                                   b, stride_b),
        '__syncthreads()',
        'float row_sum = ' + buf + '[0]',
        '__syncthreads()',
        # Bug fix: use the threadPos/N/threadCount parameters instead of
        # hard-coded threadIdx.x/N/blockDim.x, so callers passing other
        # expressions get correct code.
        "for (int tx = %s; tx < %s; tx += %s){" % (threadPos, N,
                                                   threadCount),
    ]
    # Write the normalized softmax values to global memory.
    if b:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            " exp(%(x)s[tx * %(stride_x)s] +"
            " %(b)s[tx * %(stride_b)s] - row_max)"
            " / row_sum" % locals()]
    else:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            "exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals()]
    ret += [
        "}",
        '__syncthreads()',
    ]
    return ret
import numpy import numpy
from theano import Op, Apply from theano import Op, Apply, config
from theano.compat.six import StringIO from theano.compat.six import StringIO
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
...@@ -14,6 +13,10 @@ except ImportError: ...@@ -14,6 +13,10 @@ except ImportError:
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
...@@ -456,16 +459,17 @@ class GpuSoftmax (Op): ...@@ -456,16 +459,17 @@ class GpuSoftmax (Op):
return self.__class__.__name__ return self.__class__.__name__
def make_node(self, x): def make_node(self, x):
x = as_gpuarray_variable(x)
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return shape return shape
def c_code_cache_version(self): def c_code_cache_version(self):
return (9,) + inline_softmax.code_version return (10,) + inline_softmax.code_version
def c_headers(self): def c_headers(self):
return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>'] return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>']
def c_compiler(self): def c_compiler(self):
return NVCC_compiler return NVCC_compiler
...@@ -474,11 +478,16 @@ class GpuSoftmax (Op): ...@@ -474,11 +478,16 @@ class GpuSoftmax (Op):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'] return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
dtype = self.dtype dtype_x = node.inputs[0].dtype
typecode = pygpu.gpuarray.dtype_to_typecode(dtype) dtype_z = node.outputs[0].dtype
typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
x, = inp x, = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
else:
cnda_thread_sync = ""
return """ return """
if (PyGpuArray_NDIM(%(x)s) != 2) if (PyGpuArray_NDIM(%(x)s) != 2)
{ {
...@@ -496,17 +505,17 @@ class GpuSoftmax (Op): ...@@ -496,17 +505,17 @@ class GpuSoftmax (Op):
%(typecode)s, %(typecode)s,
GA_C_ORDER, GA_C_ORDER,
pygpu_default_context(), Py_None); pygpu_default_context(), Py_None);
if (!%(nll)s) { if (!%(z)s) {
%(fail)s %(fail)s
} }
} }
{ {
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0], int n_blocks = std::min((int)PyGpuArray_DIMS(%(x)s)[0],
32 * 1024); 32 * 1024);
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], 512); int n_threads = std::min((int)PyGpuArray_DIMS(%(x)s)[1], 512);
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] * int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(dtype)s); 2 * sizeof(npy_%(dtype_x)s);
if (PyGpuArray_DIMS(%(x)s)[0] > 0) if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{ {
...@@ -523,38 +532,42 @@ class GpuSoftmax (Op): ...@@ -523,38 +532,42 @@ class GpuSoftmax (Op):
PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
(dtype_%(x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) + (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset), %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0], PyGpuArray_STRIDES(%(x)s)[0] / 4,
PyGpuArray_STRIDES(%(x)s)[1], PyGpuArray_STRIDES(%(x)s)[1] / 4,
(dtype_%(z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) + (npy_%(dtype_z)s*)(
%(z)s->ga.offset); ((char *)cuda_get_ptr(%(z)s->ga.data)) +
PyGpuArray_STRIDES(%(z)s)[0], %(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[1] PyGpuArray_STRIDES(%(z)s)[0] / 4,
PyGpuArray_STRIDES(%(z)s)[1] / 4
); );
}else{ }else{
kSoftmax_fixed_shared%(nodename)s kSoftmax_fixed_shared%(nodename)s
<<< <<<
n_blocks, n_blocks,
n_threads, n_threads,
n_threads * sizeof(npy_%(dtype)s) n_threads * sizeof(npy_%(dtype_x)s)
>>>( >>>(
PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
(dtype_%(x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) + (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset), %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0], PyGpuArray_STRIDES(%(x)s)[0] / 4,
PyGpuArray_STRIDES(%(x)s)[1], PyGpuArray_STRIDES(%(x)s)[1] / 4,
(dtype_%(z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) + (npy_%(dtype_z)s*)(
%(z)s->ga.offset); ((char *)cuda_get_ptr(%(z)s->ga.data)) +
PyGpuArray_STRIDES(%(z)s)[0], %(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[1] PyGpuArray_STRIDES(%(z)s)[0] / 4,
PyGpuArray_STRIDES(%(z)s)[1] / 4
); );
} }
CNDA_THREAD_SYNC; %(cnda_thread_sync)s
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
...@@ -572,6 +585,7 @@ class GpuSoftmax (Op): ...@@ -572,6 +585,7 @@ class GpuSoftmax (Op):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
dtype = node.inputs[0].dtype
ret1 = nvcc_kernel("kSoftmax_%s" % nodename, ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1', 'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1',
...@@ -612,12 +626,12 @@ class GpuSoftmax (Op): ...@@ -612,12 +626,12 @@ class GpuSoftmax (Op):
"}", "}",
]) ])
ret3 = "CUdeviceptr (*cuda_get_ptr)(gpudata *g);" ret3 = "CUdeviceptr (*cuda_get_ptr)(gpudata *g);"
return ret1 + "\n" + ret2 + "\n"= ret3 return (ret1 + "\n" + ret2 + "\n" + ret3) % locals()
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (GpuOp): class GpuSoftmaxWithBias (Op):
""" """
Implement SoftmaxWithBias on the gpu. Implement SoftmaxWithBias on the gpu.
""" """
...@@ -634,17 +648,18 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -634,17 +648,18 @@ class GpuSoftmaxWithBias (GpuOp):
return self.__class__.__name__ return self.__class__.__name__
def make_node(self, x, b): def make_node(self, x, b):
x = as_gpuarray_variable(x)
b = as_gpuarray_variable(b)
return Apply(self, [x, b], [x.type()]) return Apply(self, [x, b], [x.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
#return () return (9,) + inline_softmax.code_version
return (8,) + inline_softmax.code_version
def c_headers(self): def c_headers(self):
return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>'] return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>']
def c_compiler(self): def c_compiler(self):
return NVCC_compiler return NVCC_compiler
...@@ -653,11 +668,17 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -653,11 +668,17 @@ class GpuSoftmaxWithBias (GpuOp):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'] return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
dtype = self.dtype dtype_x = node.inputs[0].dtype
typecode = pygpu.gpuarray.dtype_to_typecode(dtype) dtype_b = node.inputs[1].dtype
dtype_z = node.outputs[0].dtype
typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
x, b = inp x, b = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
else:
cnda_thread_sync = ""
return """ return """
if (PyGpuArray_NDIM(%(x)s) != 2) if (PyGpuArray_NDIM(%(x)s) != 2)
{ {
...@@ -690,16 +711,16 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -690,16 +711,16 @@ class GpuSoftmaxWithBias (GpuOp):
%(typecode)s, %(typecode)s,
GA_C_ORDER, GA_C_ORDER,
pygpu_default_context(), Py_None); pygpu_default_context(), Py_None);
if (!%(nll)s) { if (!%(z)s) {
%(fail)s %(fail)s
} }
} }
{ {
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0],32*1024); int n_blocks = std::min((int)PyGpuArray_DIMS(%(x)s)[0], 32*1024);
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], 512); int n_threads = std::min((int)PyGpuArray_DIMS(%(x)s)[1], 512);
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] * int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(dtype)s); 2 * sizeof(npy_%(dtype_x)s);
if (PyGpuArray_DIMS(%(x)s)[0] > 0) if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{ {
if(n_shared_bytes < (32 * 1024 - 500)){ if(n_shared_bytes < (32 * 1024 - 500)){
...@@ -712,46 +733,50 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -712,46 +733,50 @@ class GpuSoftmaxWithBias (GpuOp):
PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
(dtype_%(x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) + (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset), %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0], PyGpuArray_STRIDES(%(x)s)[0] / 4,
PyGpuArray_STRIDES(%(x)s)[1], PyGpuArray_STRIDES(%(x)s)[1] / 4,
(dtype_%(b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) + (npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset), %(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0], PyGpuArray_STRIDES(%(b)s)[0] / 4,
(dtype_%(z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) + (npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset), %(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0], PyGpuArray_STRIDES(%(z)s)[0] / 4,
PyGpuArray_STRIDES(%(z)s)[1] PyGpuArray_STRIDES(%(z)s)[1] / 4
); );
}else{ }else{
kSoftmaxWithBias_fixed_shared%(nodename)s kSoftmaxWithBias_fixed_shared%(nodename)s
<<< <<<
n_blocks, n_blocks,
n_threads, n_threads,
n_threads * sizeof(npy_%(dtype)s) n_threads * sizeof(npy_%(dtype_x)s)
>>>( >>>(
PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
(dtype_%(x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) + (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset), %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0], PyGpuArray_STRIDES(%(x)s)[0] / 4,
PyGpuArray_STRIDES(%(x)s)[1], PyGpuArray_STRIDES(%(x)s)[1] / 4,
(dtype_%(b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) + (npy_%(dtype_b)s*)(
((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset), %(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0], PyGpuArray_STRIDES(%(b)s)[0] / 4,
(dtype_%(z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) + (npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset), %(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0], PyGpuArray_STRIDES(%(z)s)[0] / 4,
PyGpuArray_STRIDES(%(z)s)[1] PyGpuArray_STRIDES(%(z)s)[1] / 4
); );
} }
CNDA_THREAD_SYNC; %(cnda_thread_sync)s
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
...@@ -767,7 +792,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -767,7 +792,7 @@ class GpuSoftmaxWithBias (GpuOp):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
dtype = self.dtype dtype = node.inputs[0].dtype
ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename, ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1', 'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1',
...@@ -815,6 +840,6 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -815,6 +840,6 @@ class GpuSoftmaxWithBias (GpuOp):
"}", "}",
]) ])
ret3 = "CUdeviceptr (*cuda_get_ptr)(gpudata *g);" ret3 = "CUdeviceptr (*cuda_get_ptr)(gpudata *g);"
return ret1 + "\n" + ret2 + "\n"= ret3 return (ret1 + "\n" + ret2 + "\n" + ret3) % locals()
gpu_softmax_with_bias = GpuSoftmaxWithBias() gpu_softmax_with_bias = GpuSoftmaxWithBias()
\ No newline at end of file
...@@ -344,12 +344,12 @@ def local_gpua_crossentropysoftmax1hotwithbiasdx(node): ...@@ -344,12 +344,12 @@ def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
return GpuCrossentropySoftmax1HotWithBiasDx() return GpuCrossentropySoftmax1HotWithBiasDx()
@register_opt() @register_opt()
@op_lifter([tensor.nnet.GpuSoftmax]) @op_lifter([tensor.nnet.Softmax])
def local_gpua_softmax(node): def local_gpua_softmax(node):
return GpuSoftmax() return GpuSoftmax()
@register_opt() @register_opt()
@op_lifter([tensor.nnet.GpuSoftmaxWithBias]) @op_lifter([tensor.nnet.SoftmaxWithBias])
def local_gpua_softmaxwithbias(node): def local_gpua_softmaxwithbias(node):
return GpuSoftmaxWithBias() return GpuSoftmaxWithBias()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论