Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
ae8ce3c6
提交
ae8ce3c6
authored
2月 07, 2014
作者:
Pierre Luc Carrier
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Correct various bugs with ops GpuSoftmax and GpuSoftmaxWithBias. Current version…
Correct various bugs with ops GpuSoftmax and GpuSoftmaxWithBias. Current version passes tests with float32
上级
6ef5d264
显示空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
401 行增加
和
59 行删除
+401
-59
kernel_codegen.py
theano/sandbox/gpuarray/kernel_codegen.py
+316
-0
nnet.py
theano/sandbox/gpuarray/nnet.py
+83
-57
opt.py
theano/sandbox/gpuarray/opt.py
+2
-2
没有找到文件。
theano/sandbox/gpuarray/kernel_codegen.py
0 → 100644
浏览文件 @
ae8ce3c6
""" Helper routines for generating gpu kernels for nvcc.
"""
def nvcc_kernel(name, params, body):
    """Return the C code of a kernel function.

    :param name: name given to the generated ``__global__`` function
    :param params: the parameters to the function as one or more strings
    :param body: the [nested] list of statements for the body of the
        function. These will be separated by ';' characters.
    """
    paramstr = ', '.join(params)
    # Flatten one level of nesting: items of `body` may themselves be
    # lists/tuples of statements (e.g. the output of inline_softmax).
    stmts = []
    for item in body:
        if isinstance(item, (list, tuple)):
            stmts.extend(item)
        else:
            stmts.append(item)
    bodystr = ';\n'.join(stmts)
    return """__global__ void %(name)s (%(paramstr)s)
    {
        %(bodystr)s;
    }
    """ % locals()
def code_version(version):
    """Decorator to support the version-based cache mechanism.

    :param version: a tuple identifying the version of the generated code;
        it is attached to the decorated function as ``code_version``.
    :raises TypeError: if `version` is not a tuple.
    """
    if not isinstance(version, tuple):
        raise TypeError('version must be tuple', version)

    def attach(fn):
        # Expose the version so cache keys can incorporate it.
        fn.code_version = version
        return fn
    return attach
# Sentinel version tuple for code generators that opt out of the
# version-based cache mechanism.
UNVERSIONED = ()
@code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn):
    """Return C++ code for a function that reduces a contiguous buffer.

    :param N: length of the buffer
    :param buf: buffer pointer
    :param pos: index of executing thread
    :param count: number of executing threads (not referenced by the
        generated code; kept for interface parity with callers)
    :param manner_fn: a function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).

    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.

    :note: buf should be in gpu shared memory, we access it many times.
    """
    here = "%s[%s]" % (buf, pos)
    # Combine expression used while striding over the tail of the buffer.
    loop_line = manner_fn(here, "%s[i]" % buf)
    # Combine expressions for the power-of-two tree reduction inside a warp.
    r_16, r_8, r_4, r_2, r_1 = [
        manner_fn(here, "%s[%s+%d]" % (buf, pos, step))
        for step in (16, 8, 4, 2, 1)]
    return """
    {
        // This function trashes buf[1..warpSize],
        // leaving the reduction result in buf[0].
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the sum of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
@code_version(inline_reduce.code_version)
def inline_reduce_max(N, buf, pos, count):
    """Return C++ code reducing a contiguous buffer with ``max()``."""
    def combine(a, b):
        return "max(%s, %s)" % (a, b)
    return inline_reduce(N, buf, pos, count, combine)
@code_version(inline_reduce.code_version)
def inline_reduce_sum(N, buf, pos, count):
    """Return C++ code reducing a contiguous buffer with ``+``."""
    def combine(a, b):
        return "%s + %s" % (a, b)
    return inline_reduce(N, buf, pos, count, combine)
@code_version(inline_reduce.code_version)
def inline_reduce_min(N, buf, pos, count):
    """Return C++ code reducing a contiguous buffer with ``min()``."""
    def combine(a, b):
        return "min(%s, %s)" % (a, b)
    return inline_reduce(N, buf, pos, count, combine)
@code_version(inline_reduce.code_version)
def inline_reduce_prod(N, buf, pos, count):
    """Return C++ code reducing a contiguous buffer with ``*``."""
    def combine(a, b):
        return "%s * %s" % (a, b)
    return inline_reduce(N, buf, pos, count, combine)
@code_version((2,) +
              inline_reduce_max.code_version +
              inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount):
    """Return a list of C statements computing a row softmax in shared memory.

    :param N: length of the buffer
    :param threadPos: index of executing thread
    :param threadCount: number of executing threads

    :Precondition: buf and buf2 contain two identical copies of the input
        to softmax
    :Postcondition: buf contains the softmax, buf2 contains un-normalized
        softmax

    :note: buf and buf2 should be in gpu shared memory, we access it many
        times
    :note2: We use __i as an int variable in a loop
    """
    # Loop header shared by the exp() pass and the normalization pass.
    loop_head = ('for(int __i=%s; __i<%s; __i+=%s){'
                 % (threadPos, N, threadCount))
    return [
        # get max of buf (trashing all but buf[0])
        inline_reduce_max(N, buf, threadPos, threadCount),
        '__syncthreads()',
        'float row_max = %s[0]' % buf,
        '__syncthreads()',
        loop_head,
        '%s[__i] = exp(%s[__i] - row_max)' % (buf, buf2),
        '%s[__i] = %s[__i]' % (buf2, buf),
        '}',
        '__syncthreads()',
        inline_reduce_sum(N, buf, threadPos, threadCount),
        '__syncthreads()',
        'float row_sum = %s[0]' % buf,
        '__syncthreads()',
        # divide each exp() result by the sum to complete the job.
        loop_head,
        '%s[__i] = %s[__i] / row_sum' % (buf, buf2),
        '}',
        '__syncthreads()',
        ]
@code_version((2,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                               manner_fn, manner_init,
                               b='', stride_b=''):
    """Return C++ code for a function that reduces a contiguous buffer.

    :param N: length of the row of data being reduced
    :param buf: buffer pointer of size warpSize * sizeof(float)
    :param x: pointer to the row of data being reduced
    :param stride_x: stride between consecutive elements of x
    :param pos: index of executing thread
    :param count: number of executing threads
    :param manner_fn: a function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).
    :param manner_init: a function that accepts strings of arguments a
        and return c code for its initialization
    :param b: Optional, pointer to the bias
    :param stride_b: Optional, the stride of b if b is provided

    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.

    :note: buf should be in gpu shared memory, we access it many times.
    """
    if b:
        # Each thread folds x[i] + b[i] into its private accumulator `red`.
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s] +"
                           " %(b)s[%(pos)s * %(stride_b)s]" % locals())
        loop_line = manner_fn("red",
                              manner_init("%(x)s[i * %(stride_x)s] + "
                                          "%(b)s[i * %(stride_b)s]" %
                                          locals()))
    else:
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
        loop_line = manner_fn("red",
                              manner_init("%(x)s[i * %(stride_x)s]" %
                                          locals()))
    here = "%s[%s]" % (buf, pos)
    # Combine expression folding other warps' partials stored in buf.
    loop_line2 = manner_fn(here, "%s[i]" % buf)
    # Combine expressions for the power-of-two tree reduction inside a warp.
    r_16, r_8, r_4, r_2, r_1 = [
        manner_fn(here, "%s[%s+%d]" % (buf, pos, step))
        for step in (16, 8, 4, 2, 1)]
    # BUG FIX: the per-thread partial used to be stored through the
    # hard-coded identifier `buf` instead of %(buf)s, which generated
    # broken code for any caller whose shared buffer is not literally
    # named "buf".  All stores now go through %(buf)s (version bumped).
    return """
    {
        // This function trashes buf[1..n_threads],
        // leaving the reduction result in buf[0].
        float red = %(init)s;
        #pragma unroll 16
        for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
            red = %(loop_line)s;
        }
        %(buf)s[%(pos)s] = red;
        __syncthreads();
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line2)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the reduction of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
                                   b='', stride_b=''):
    """Return C++ code for a max-reduction over a row of global memory,
    using a fixed-size shared buffer (see inline_reduce_fixed_shared)."""
    def combine(a, b):
        return "max(%s, %s)" % (a, b)

    def passthrough(a):
        # max needs no transformation of the loaded value.
        return a
    return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                                      combine, passthrough, b, stride_b)
@code_version((2,) +
              inline_reduce_max.code_version +
              inline_reduce_sum.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x, sm, sm_stride,
                                threadPos, threadCount,
                                b='', stride_b=''):
    """Return a list of C statements computing a row softmax with a
    fixed-size shared-memory buffer.

    :param N: length of the buffer, at least warpSize (32)
    :param buf: a shared memory buffer of size warpSize * sizeof(float)
    :param x: a ptr to the gpu memory where the row is stored
    :param stride_x: the stride between each element in x
    :param sm: a ptr to the gpu memory to store the result
    :param sm_stride: the stride between each sm element
    :param threadPos: index of executing thread
    :param threadCount: number of executing threads
    :param b: Optional, pointer to the bias
    :param stride_b: Optional, the stride of b if b is provided

    :Precondition: buf is empty
    :Postcondition: sm contains the softmax of the row; buf is trashed
        (buf[0] holds the last reduction result)

    :note: buf should be in gpu shared memory, we access it many times.
    :note2: We use tx as an int variable in a loop.
    """
    ret = [
        # get max of the row (trashing all but buf[0])
        inline_reduce_fixed_shared_max(N, buf, x, stride_x,
                                       threadPos, threadCount,
                                       b, stride_b),
        '__syncthreads()',
        'float row_max = ' + buf + '[0]',
        '__syncthreads()',
        # sum of exp(x - row_max), left in buf[0]
        inline_reduce_fixed_shared(N, buf, x, stride_x,
                                   threadPos, threadCount,
                                   lambda a, b: "%s + %s" % (a, b),
                                   lambda a: "exp(%s - row_max)" % a,
                                   b, stride_b),
        '__syncthreads()',
        'float row_sum = ' + buf + '[0]',
        '__syncthreads()',
        # BUG FIX: this loop used to hard-code the identifiers
        # threadIdx.x / N / blockDim.x; use the caller-supplied
        # expressions so the generated code matches the kernel's actual
        # parameter names (version bumped).
        "for (int tx = %(threadPos)s; tx < %(N)s; tx += %(threadCount)s){"
        % locals(),
        ]
    # Write each normalized softmax value back to global memory.
    if b:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            " exp(%(x)s[tx * %(stride_x)s] +"
            " %(b)s[tx * %(stride_b)s] - row_max)"
            " / row_sum" % locals()]
    else:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            "exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals()]
    ret += [
        "}",
        '__syncthreads()',
        ]
    return ret
theano/sandbox/gpuarray/nnet.py
浏览文件 @
ae8ce3c6
import
numpy
import
numpy
from
theano
import
Op
,
Apply
from
theano
import
Op
,
Apply
,
config
from
theano.compat.six
import
StringIO
from
theano.compat.six
import
StringIO
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
...
@@ -14,6 +13,10 @@ except ImportError:
...
@@ -14,6 +13,10 @@ except ImportError:
from
theano.sandbox.gpuarray.basic_ops
import
as_gpuarray_variable
from
theano.sandbox.gpuarray.basic_ops
import
as_gpuarray_variable
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.kernel_codegen
import
(
nvcc_kernel
,
inline_softmax
,
inline_softmax_fixed_shared
)
class
GpuCrossentropySoftmaxArgmax1HotWithBias
(
Op
):
class
GpuCrossentropySoftmaxArgmax1HotWithBias
(
Op
):
...
@@ -456,16 +459,17 @@ class GpuSoftmax (Op):
...
@@ -456,16 +459,17 @@ class GpuSoftmax (Op):
return
self
.
__class__
.
__name__
return
self
.
__class__
.
__name__
def
make_node
(
self
,
x
):
def
make_node
(
self
,
x
):
x
=
as_gpuarray_variable
(
x
)
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
def
infer_shape
(
self
,
node
,
shape
):
def
infer_shape
(
self
,
node
,
shape
):
return
shape
return
shape
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
9
,)
+
inline_softmax
.
code_version
return
(
10
,)
+
inline_softmax
.
code_version
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<compyte/extension.h>'
,
'<
compyte/
numpy_compat.h>'
]
return
[
'cuda.h'
,
'<compyte/extension.h>'
,
'<numpy_compat.h>'
]
def
c_compiler
(
self
):
def
c_compiler
(
self
):
return
NVCC_compiler
return
NVCC_compiler
...
@@ -474,11 +478,16 @@ class GpuSoftmax (Op):
...
@@ -474,11 +478,16 @@ class GpuSoftmax (Op):
return
[
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'
]
return
[
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
dtype
=
self
.
dtype
dtype_x
=
node
.
inputs
[
0
]
.
dtype
typecode
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
dtype
)
dtype_z
=
node
.
outputs
[
0
]
.
dtype
typecode
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
outputs
[
0
]
.
dtype
)
x
,
=
inp
x
,
=
inp
z
,
=
out
z
,
=
out
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
if
config
.
gpuarray
.
sync
:
cnda_thread_sync
=
"GpuArray_sync(&
%(zz)
s->ga);"
%
dict
(
zz
=
zz
)
else
:
cnda_thread_sync
=
""
return
"""
return
"""
if (PyGpuArray_NDIM(
%(x)
s) != 2)
if (PyGpuArray_NDIM(
%(x)
s) != 2)
{
{
...
@@ -496,17 +505,17 @@ class GpuSoftmax (Op):
...
@@ -496,17 +505,17 @@ class GpuSoftmax (Op):
%(typecode)
s,
%(typecode)
s,
GA_C_ORDER,
GA_C_ORDER,
pygpu_default_context(), Py_None);
pygpu_default_context(), Py_None);
if (!
%(
nll
)
s) {
if (!
%(
z
)
s) {
%(fail)
s
%(fail)
s
}
}
}
}
{
{
int n_blocks = std::min(PyGpuArray_DIMS(
%(x)
s)[0],
int n_blocks = std::min(
(int)
PyGpuArray_DIMS(
%(x)
s)[0],
32 * 1024);
32 * 1024);
//TODO, detect the maximum number of thread per block.
//TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(
%(x)
s)[1], 512);
int n_threads = std::min(
(int)
PyGpuArray_DIMS(
%(x)
s)[1], 512);
int n_shared_bytes = PyGpuArray_DIMS(
%(x)
s)[1] *
int n_shared_bytes = PyGpuArray_DIMS(
%(x)
s)[1] *
2 * sizeof(npy_
%(dtype)
s);
2 * sizeof(npy_
%(dtype
_x
)
s);
if (PyGpuArray_DIMS(
%(x)
s)[0] > 0)
if (PyGpuArray_DIMS(
%(x)
s)[0] > 0)
{
{
...
@@ -523,38 +532,42 @@ class GpuSoftmax (Op):
...
@@ -523,38 +532,42 @@ class GpuSoftmax (Op):
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[1],
PyGpuArray_DIMS(
%(x)
s)[1],
(dtype_
%(x)
s*)(((char *)cuda_get_ptr(
%(x)
s->ga.data)) +
(npy_
%(dtype_x)
s*)(
((char *)cuda_get_ptr(
%(x)
s->ga.data)) +
%(x)
s->ga.offset),
%(x)
s->ga.offset),
PyGpuArray_STRIDES(
%(x)
s)[0],
PyGpuArray_STRIDES(
%(x)
s)[0]
/ 4
,
PyGpuArray_STRIDES(
%(x)
s)[1],
PyGpuArray_STRIDES(
%(x)
s)[1]
/ 4
,
(dtype_
%(z)
s*)(((char *)cuda_get_ptr(
%(z)
s->ga.data)) +
(npy_
%(dtype_z)
s*)(
%(z)
s->ga.offset);
((char *)cuda_get_ptr(
%(z)
s->ga.data)) +
PyGpuArray_STRIDES(
%(z)
s)[0],
%(z)
s->ga.offset),
PyGpuArray_STRIDES(
%(z)
s)[1]
PyGpuArray_STRIDES(
%(z)
s)[0] / 4,
PyGpuArray_STRIDES(
%(z)
s)[1] / 4
);
);
}else{
}else{
kSoftmax_fixed_shared
%(nodename)
s
kSoftmax_fixed_shared
%(nodename)
s
<<<
<<<
n_blocks,
n_blocks,
n_threads,
n_threads,
n_threads * sizeof(npy_
%(dtype)
s)
n_threads * sizeof(npy_
%(dtype
_x
)
s)
>>>(
>>>(
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[1],
PyGpuArray_DIMS(
%(x)
s)[1],
(dtype_
%(x)
s*)(((char *)cuda_get_ptr(
%(x)
s->ga.data)) +
(npy_
%(dtype_x)
s*)(
((char *)cuda_get_ptr(
%(x)
s->ga.data)) +
%(x)
s->ga.offset),
%(x)
s->ga.offset),
PyGpuArray_STRIDES(
%(x)
s)[0],
PyGpuArray_STRIDES(
%(x)
s)[0]
/ 4
,
PyGpuArray_STRIDES(
%(x)
s)[1],
PyGpuArray_STRIDES(
%(x)
s)[1]
/ 4
,
(dtype_
%(z)
s*)(((char *)cuda_get_ptr(
%(z)
s->ga.data)) +
(npy_
%(dtype_z)
s*)(
%(z)
s->ga.offset);
((char *)cuda_get_ptr(
%(z)
s->ga.data)) +
PyGpuArray_STRIDES(
%(z)
s)[0],
%(z)
s->ga.offset),
PyGpuArray_STRIDES(
%(z)
s)[1]
PyGpuArray_STRIDES(
%(z)
s)[0] / 4,
PyGpuArray_STRIDES(
%(z)
s)[1] / 4
);
);
}
}
CNDA_THREAD_SYNC;
%(cnda_thread_sync)
s
cudaError_t err = cudaGetLastError();
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
if( cudaSuccess != err)
{
{
...
@@ -572,6 +585,7 @@ class GpuSoftmax (Op):
...
@@ -572,6 +585,7 @@ class GpuSoftmax (Op):
"""
%
locals
()
"""
%
locals
()
def
c_support_code_apply
(
self
,
node
,
nodename
):
def
c_support_code_apply
(
self
,
node
,
nodename
):
dtype
=
node
.
inputs
[
0
]
.
dtype
ret1
=
nvcc_kernel
(
"kSoftmax_
%
s"
%
nodename
,
ret1
=
nvcc_kernel
(
"kSoftmax_
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
params
=
[
'int M'
,
'int N'
,
'const npy_
%(dtype)
s * x'
,
'const int sx0'
,
'const int sx1'
,
'const npy_
%(dtype)
s * x'
,
'const int sx0'
,
'const int sx1'
,
...
@@ -612,12 +626,12 @@ class GpuSoftmax (Op):
...
@@ -612,12 +626,12 @@ class GpuSoftmax (Op):
"}"
,
"}"
,
])
])
ret3
=
"CUdeviceptr (*cuda_get_ptr)(gpudata *g);"
ret3
=
"CUdeviceptr (*cuda_get_ptr)(gpudata *g);"
return
ret1
+
"
\n
"
+
ret2
+
"
\n
"
=
ret3
return
(
ret1
+
"
\n
"
+
ret2
+
"
\n
"
+
ret3
)
%
locals
()
gpu_softmax
=
GpuSoftmax
()
gpu_softmax
=
GpuSoftmax
()
class
GpuSoftmaxWithBias
(
Gpu
Op
):
class
GpuSoftmaxWithBias
(
Op
):
"""
"""
Implement SoftmaxWithBias on the gpu.
Implement SoftmaxWithBias on the gpu.
"""
"""
...
@@ -634,17 +648,18 @@ class GpuSoftmaxWithBias (GpuOp):
...
@@ -634,17 +648,18 @@ class GpuSoftmaxWithBias (GpuOp):
return
self
.
__class__
.
__name__
return
self
.
__class__
.
__name__
def
make_node
(
self
,
x
,
b
):
def
make_node
(
self
,
x
,
b
):
x
=
as_gpuarray_variable
(
x
)
b
=
as_gpuarray_variable
(
b
)
return
Apply
(
self
,
[
x
,
b
],
[
x
.
type
()])
return
Apply
(
self
,
[
x
,
b
],
[
x
.
type
()])
def
infer_shape
(
self
,
node
,
shape
):
def
infer_shape
(
self
,
node
,
shape
):
return
[
shape
[
0
]]
return
[
shape
[
0
]]
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
#return ()
return
(
9
,)
+
inline_softmax
.
code_version
return
(
8
,)
+
inline_softmax
.
code_version
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<compyte/extension.h>'
,
'<
compyte/
numpy_compat.h>'
]
return
[
'cuda.h'
,
'<compyte/extension.h>'
,
'<numpy_compat.h>'
]
def
c_compiler
(
self
):
def
c_compiler
(
self
):
return
NVCC_compiler
return
NVCC_compiler
...
@@ -653,11 +668,17 @@ class GpuSoftmaxWithBias (GpuOp):
...
@@ -653,11 +668,17 @@ class GpuSoftmaxWithBias (GpuOp):
return
[
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'
]
return
[
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
dtype
=
self
.
dtype
dtype_x
=
node
.
inputs
[
0
]
.
dtype
typecode
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
dtype
)
dtype_b
=
node
.
inputs
[
1
]
.
dtype
dtype_z
=
node
.
outputs
[
0
]
.
dtype
typecode
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
outputs
[
0
]
.
dtype
)
x
,
b
=
inp
x
,
b
=
inp
z
,
=
out
z
,
=
out
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
if
config
.
gpuarray
.
sync
:
cnda_thread_sync
=
"GpuArray_sync(&
%(zz)
s->ga);"
%
dict
(
zz
=
zz
)
else
:
cnda_thread_sync
=
""
return
"""
return
"""
if (PyGpuArray_NDIM(
%(x)
s) != 2)
if (PyGpuArray_NDIM(
%(x)
s) != 2)
{
{
...
@@ -690,16 +711,16 @@ class GpuSoftmaxWithBias (GpuOp):
...
@@ -690,16 +711,16 @@ class GpuSoftmaxWithBias (GpuOp):
%(typecode)
s,
%(typecode)
s,
GA_C_ORDER,
GA_C_ORDER,
pygpu_default_context(), Py_None);
pygpu_default_context(), Py_None);
if (!
%(
nll
)
s) {
if (!
%(
z
)
s) {
%(fail)
s
%(fail)
s
}
}
}
}
{
{
int n_blocks = std::min(
PyGpuArray_DIMS(
%(x)
s)[0],
32*1024);
int n_blocks = std::min(
(int)PyGpuArray_DIMS(
%(x)
s)[0],
32*1024);
//TODO, detect the maximum number of thread per block.
//TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(
%(x)
s)[1], 512);
int n_threads = std::min(
(int)
PyGpuArray_DIMS(
%(x)
s)[1], 512);
int n_shared_bytes = PyGpuArray_DIMS(
%(x)
s)[1] *
int n_shared_bytes = PyGpuArray_DIMS(
%(x)
s)[1] *
2 * sizeof(npy_
%(dtype)
s);
2 * sizeof(npy_
%(dtype
_x
)
s);
if (PyGpuArray_DIMS(
%(x)
s)[0] > 0)
if (PyGpuArray_DIMS(
%(x)
s)[0] > 0)
{
{
if(n_shared_bytes < (32 * 1024 - 500)){
if(n_shared_bytes < (32 * 1024 - 500)){
...
@@ -712,46 +733,50 @@ class GpuSoftmaxWithBias (GpuOp):
...
@@ -712,46 +733,50 @@ class GpuSoftmaxWithBias (GpuOp):
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[1],
PyGpuArray_DIMS(
%(x)
s)[1],
(dtype_
%(x)
s*)(((char *)cuda_get_ptr(
%(x)
s->ga.data)) +
(npy_
%(dtype_x)
s*)(
((char *)cuda_get_ptr(
%(x)
s->ga.data)) +
%(x)
s->ga.offset),
%(x)
s->ga.offset),
PyGpuArray_STRIDES(
%(x)
s)[0],
PyGpuArray_STRIDES(
%(x)
s)[0]
/ 4
,
PyGpuArray_STRIDES(
%(x)
s)[1],
PyGpuArray_STRIDES(
%(x)
s)[1]
/ 4
,
(
dtype_
%(
b)
s*)(((char *)cuda_get_ptr(
%(b)
s->ga.data)) +
(
npy_
%(dtype_
b)
s*)(((char *)cuda_get_ptr(
%(b)
s->ga.data)) +
%(b)
s->ga.offset),
%(b)
s->ga.offset),
PyGpuArray_STRIDES(
%(b)
s)[0],
PyGpuArray_STRIDES(
%(b)
s)[0]
/ 4
,
(
dtype_
%(
z)
s*)(((char *)cuda_get_ptr(
%(z)
s->ga.data)) +
(
npy_
%(dtype_
z)
s*)(((char *)cuda_get_ptr(
%(z)
s->ga.data)) +
%(z)
s->ga.offset),
%(z)
s->ga.offset),
PyGpuArray_STRIDES(
%(z)
s)[0],
PyGpuArray_STRIDES(
%(z)
s)[0]
/ 4
,
PyGpuArray_STRIDES(
%(z)
s)[1]
PyGpuArray_STRIDES(
%(z)
s)[1]
/ 4
);
);
}else{
}else{
kSoftmaxWithBias_fixed_shared
%(nodename)
s
kSoftmaxWithBias_fixed_shared
%(nodename)
s
<<<
<<<
n_blocks,
n_blocks,
n_threads,
n_threads,
n_threads * sizeof(npy_
%(dtype)
s)
n_threads * sizeof(npy_
%(dtype
_x
)
s)
>>>(
>>>(
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[1],
PyGpuArray_DIMS(
%(x)
s)[1],
(dtype_
%(x)
s*)(((char *)cuda_get_ptr(
%(x)
s->ga.data)) +
(npy_
%(dtype_x)
s*)(
((char *)cuda_get_ptr(
%(x)
s->ga.data)) +
%(x)
s->ga.offset),
%(x)
s->ga.offset),
PyGpuArray_STRIDES(
%(x)
s)[0],
PyGpuArray_STRIDES(
%(x)
s)[0]
/ 4
,
PyGpuArray_STRIDES(
%(x)
s)[1],
PyGpuArray_STRIDES(
%(x)
s)[1]
/ 4
,
(dtype_
%(b)
s*)(((char *)cuda_get_ptr(
%(b)
s->ga.data)) +
(npy_
%(dtype_b)
s*)(
((char *)cuda_get_ptr(
%(b)
s->ga.data)) +
%(b)
s->ga.offset),
%(b)
s->ga.offset),
PyGpuArray_STRIDES(
%(b)
s)[0],
PyGpuArray_STRIDES(
%(b)
s)[0]
/ 4
,
(dtype_
%(z)
s*)(((char *)cuda_get_ptr(
%(z)
s->ga.data)) +
(npy_
%(dtype_z)
s*)(
((char *)cuda_get_ptr(
%(z)
s->ga.data)) +
%(z)
s->ga.offset),
%(z)
s->ga.offset),
PyGpuArray_STRIDES(
%(z)
s)[0],
PyGpuArray_STRIDES(
%(z)
s)[0]
/ 4
,
PyGpuArray_STRIDES(
%(z)
s)[1]
PyGpuArray_STRIDES(
%(z)
s)[1]
/ 4
);
);
}
}
CNDA_THREAD_SYNC;
%(cnda_thread_sync)
s
cudaError_t err = cudaGetLastError();
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
if( cudaSuccess != err)
{
{
...
@@ -767,7 +792,7 @@ class GpuSoftmaxWithBias (GpuOp):
...
@@ -767,7 +792,7 @@ class GpuSoftmaxWithBias (GpuOp):
"""
%
locals
()
"""
%
locals
()
def
c_support_code_apply
(
self
,
node
,
nodename
):
def
c_support_code_apply
(
self
,
node
,
nodename
):
dtype
=
self
.
dtype
dtype
=
node
.
inputs
[
0
]
.
dtype
ret1
=
nvcc_kernel
(
"kSoftmaxWithBias_
%
s"
%
nodename
,
ret1
=
nvcc_kernel
(
"kSoftmaxWithBias_
%
s"
%
nodename
,
params
=
[
'int M'
,
'int N'
,
params
=
[
'int M'
,
'int N'
,
'const npy_
%(dtype)
s * x'
,
'const int sx0'
,
'const int sx1'
,
'const npy_
%(dtype)
s * x'
,
'const int sx0'
,
'const int sx1'
,
...
@@ -815,6 +840,6 @@ class GpuSoftmaxWithBias (GpuOp):
...
@@ -815,6 +840,6 @@ class GpuSoftmaxWithBias (GpuOp):
"}"
,
"}"
,
])
])
ret3
=
"CUdeviceptr (*cuda_get_ptr)(gpudata *g);"
ret3
=
"CUdeviceptr (*cuda_get_ptr)(gpudata *g);"
return
ret1
+
"
\n
"
+
ret2
+
"
\n
"
=
ret3
return
(
ret1
+
"
\n
"
+
ret2
+
"
\n
"
+
ret3
)
%
locals
()
gpu_softmax_with_bias
=
GpuSoftmaxWithBias
()
gpu_softmax_with_bias
=
GpuSoftmaxWithBias
()
\ No newline at end of file
theano/sandbox/gpuarray/opt.py
浏览文件 @
ae8ce3c6
...
@@ -344,12 +344,12 @@ def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
...
@@ -344,12 +344,12 @@ def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
return
GpuCrossentropySoftmax1HotWithBiasDx
()
return
GpuCrossentropySoftmax1HotWithBiasDx
()
@register_opt
()
@register_opt
()
@op_lifter
([
tensor
.
nnet
.
Gpu
Softmax
])
@op_lifter
([
tensor
.
nnet
.
Softmax
])
def
local_gpua_softmax
(
node
):
def
local_gpua_softmax
(
node
):
return
GpuSoftmax
()
return
GpuSoftmax
()
@register_opt
()
@register_opt
()
@op_lifter
([
tensor
.
nnet
.
Gpu
SoftmaxWithBias
])
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
])
def
local_gpua_softmaxwithbias
(
node
):
def
local_gpua_softmaxwithbias
(
node
):
return
GpuSoftmaxWithBias
()
return
GpuSoftmaxWithBias
()
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论