Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
c02a385a
提交
c02a385a
authored
3月 01, 2017
作者:
Florian Bordes
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'master' of
https://github.com/Theano/Theano
上级
1db19553
19540d4e
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
45 行增加
和
207 行删除
+45
-207
elemwise.py
theano/gpuarray/elemwise.py
+12
-149
extra_ops.py
theano/gpuarray/extra_ops.py
+2
-2
kernel_codegen.py
theano/gpuarray/kernel_codegen.py
+30
-55
subtensor.py
theano/gpuarray/subtensor.py
+1
-1
没有找到文件。
theano/gpuarray/elemwise.py
浏览文件 @
c02a385a
...
...
@@ -1005,15 +1005,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(acc_type)
s myresult = 0;
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
if (warpSize != 32)
{
Z[0] = -666;
return;
}
"""
%
locals
()
def
_assign_init
(
self
,
first_item
):
...
...
@@ -1117,67 +1108,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
acc_dtype
=
"npy_"
+
self
.
_acc_dtype
(
node
.
inputs
[
0
]
.
dtype
)
write_out
=
write_w
(
node
.
outputs
[
0
]
.
dtype
)
# This code (the code in new_version) is currently ignored.
# Code produced later in this function is returned instead.
# The code here works with all nvidia driver
# But only for powers or multiples of 2!
new_version
=
"""
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
if (threadNum >= ((threadCount >> 1) * 2))
{
int idx = threadNum - (threadCount >> 1) * 2;"""
new_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[idx]'
,
'buf[threadNum]'
,
sub
,
False
)
new_version
+=
"""
}
__syncthreads();
// Works for power of 2 only.
int nTotalThreads = threadCount; // Total number of active threads
while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1); // divide by two
// only the first half of the threads will be active.
if (threadNum < halfPoint)
{
// Get the shared value stored by another thread
%(acc_dtype)
s temp = buf[threadNum + halfPoint];
"""
new_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'temp'
,
sub
,
False
)
new_version
+=
"""
}
__syncthreads();
nTotalThreads = (nTotalThreads >> 1); // divide by two.
}
__syncthreads();
if (threadNum == 0)
{
%(z_pos)
s =
%(write_out)
s(buf[0]);
}
__syncthreads();"""
new_version
=
new_version
%
locals
()
current_version
=
"""
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
if (threadNum < warpSize) {
//round up all the partial sums into the first `warpSize` elements
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
...
...
@@ -1187,44 +1124,19 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
sub
,
False
)
+
"""
}
buf[threadNum] = myresult;
/*Comment this optimization as it don't work on Fermi GPU.
TODO: find why it don't work or put the GPU compute capability into the version
// no sync because only one warp is running
if(threadCount >32)
{"""
for
num
in
[
16
,
8
,
4
,
2
,
1
]:
current_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'buf[threadNum+
%
d]'
%
num
,
sub
,
False
)
current_version
+=
"""
}
__syncthreads();
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (threadNum < _n && threadNum + _n < threadCount)
"""
current_version
+=
"""
if (threadNum == 0)
{
%(z_pos)
s =
%(write_out)
s(buf[0]);
}
current_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'buf[threadNum+_n]'
,
sub
,
False
)
}
else */
if (threadNum < 16)
{
//reduce so that threadNum 0 has the reduction of everything
"""
for
num
in
[
16
,
8
,
4
,
2
,
1
]:
this_if
=
"if (threadNum +
%
d < threadCount) "
%
num
+
\
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'buf[threadNum+
%
d]'
%
num
,
sub
,
False
)
current_version
+=
this_if
current_version
+=
"""
"""
current_version
+=
"""
if (threadNum == 0)
{
%(z_pos)
s =
%(write_out)
s(buf[0]);
}
}
__syncthreads();
}
if (threadNum == 0) {
%(z_pos)
s =
%(write_out)
s(buf[0]);
}
"""
...
...
@@ -1900,7 +1812,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"""
%
locals
(),
file
=
sio
)
def
c_code_cache_version_apply
(
self
,
node
):
version
=
[
1
8
]
# the version corresponding to the c code in this Op
version
=
[
1
9
]
# the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node
=
Apply
(
...
...
@@ -1953,11 +1865,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
%(reduce_fct)
s
...
...
@@ -1997,11 +1904,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
%(reduce_fct)
s
...
...
@@ -2042,11 +1944,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
...
...
@@ -2169,12 +2066,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
...
...
@@ -2221,11 +2112,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
X = (const
%(in_type)
s *)(((char *)X)+offset_X);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int a = blockIdx.x; a < A; a += gridDim.x)
{
for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y)
...
...
@@ -2279,12 +2165,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
print
(
"""
%(decl)
s
{
if(warpSize<blockDim.x){
//TODO: set error code
Z[0] = -666;
return;
}
%(init)
s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
...
...
@@ -2332,13 +2212,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
//TODO: set error code
Z[blockIdx.x * sZ0] =
%(write_out)
s(-666);
return;
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
...
...
@@ -2445,11 +2318,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
...
...
@@ -2601,11 +2469,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
...
...
theano/gpuarray/extra_ops.py
浏览文件 @
c02a385a
...
...
@@ -111,7 +111,7 @@ class GpuCumOp(GpuKernelBase, Op):
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index < blockDim.x*2) {
if
(index < blockDim.x*2) {
partialCumOp[index]
%(op)
s= partialCumOp[index - stride];
}
}
...
...
@@ -136,7 +136,7 @@ class GpuCumOp(GpuKernelBase, Op):
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index + stride < blockDim.x*2) {
if
(index + stride < blockDim.x*2) {
partialCumOp[index + stride]
%(op)
s= partialCumOp[index];
}
}
...
...
theano/gpuarray/kernel_codegen.py
浏览文件 @
c02a385a
...
...
@@ -57,7 +57,7 @@ def code_version(version):
UNVERSIONED
=
()
@code_version
((
1
,))
@code_version
((
2
,))
def
inline_reduce
(
N
,
buf
,
pos
,
count
,
manner_fn
):
"""
Return C++ code for a function that reduces a contiguous buffer.
...
...
@@ -89,37 +89,25 @@ def inline_reduce(N, buf, pos, count, manner_fn):
"""
loop_line
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
(
buf
))
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
r_2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+2]"
%
(
buf
,
pos
))
r_1
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+1]"
%
(
buf
,
pos
))
r_n
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+_n]"
%
(
buf
,
pos
))
return
"""
{
// This function trashes buf[1..warpSize],
// leaving the reduction result in buf[0].
if (
%(pos)
s < warpSize)
{
if (
%(pos)
s < warpSize) {
for (int i =
%(pos)
s + warpSize; i <
%(N)
s; i += warpSize)
{
%(buf)
s[
%(pos)
s] =
%(loop_line)
s;
}
if (
%(pos)
s < 16)
{
//reduce so that
%(pos)
s 0 has the sum of everything
if(
%(pos)
s + 16 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_16)
s;
if(
%(pos)
s + 8 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_8)
s;
if(
%(pos)
s + 4 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_4)
s;
if(
%(pos)
s + 2 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_2)
s;
if(
%(pos)
s + 1 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_1)
s;
}
}
__syncthreads();
//reduce so that
%(pos)
s 0 has the reduction of everything
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (
%(pos)
s < _n &&
%(pos)
s + _n <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_n)
s;
__syncthreads();
}
}
"""
%
locals
()
...
...
@@ -205,7 +193,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
]
@code_version
((
2
,))
@code_version
((
3
,))
def
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
load_x
,
pos
,
count
,
manner_fn
,
manner_init
,
b
=
''
,
stride_b
=
''
,
load_b
=
''
,
dtype
=
'float32'
):
...
...
@@ -231,14 +219,6 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
Index of executing thread.
count
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
manner_fn
A function that accepts strings of arguments a and b, and
returns c code for their reduction.
...
...
@@ -249,6 +229,14 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_init
A function that accepts strings of arguments a and return c
code for its initialization.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
Notes
-----
...
...
@@ -268,11 +256,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
locals
()))
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
buf
)
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
r_2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+2]"
%
(
buf
,
pos
))
r_1
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+1]"
%
(
buf
,
pos
))
r_n
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+_n]"
%
(
buf
,
pos
))
ctype
=
gpuarray
.
dtype_to_ctype
(
dtype
)
return
"""
...
...
@@ -281,31 +265,22 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
// leaving the reduction result in buf[0].
%(ctype)
s red =
%(init)
s;
#pragma unroll 16
for (int i =
%(pos)
s +
%(count)
s; i<
%(N)
s; i +=
%(count)
s){
for (int i =
%(pos)
s +
%(count)
s; i<
%(N)
s; i +=
%(count)
s)
{
red =
%(loop_line)
s;
}
buf[
%(pos)
s] = red;
__syncthreads();
if (
%(pos)
s < warpSize)
{
for (int i =
%(pos)
s + warpSize; i <
%(count)
s; i += warpSize)
{
if (
%(pos)
s < warpSize) {
for (int i =
%(pos)
s + warpSize; i <
%(count)
s; i += warpSize) {
%(buf)
s[
%(pos)
s] =
%(loop_line2)
s;
}
if (
%(pos)
s < 16)
{
//reduce so that
%(pos)
s 0 has the reduction of everything
if(
%(pos)
s + 16 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_16)
s;
if(
%(pos)
s + 8 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_8)
s;
if(
%(pos)
s + 4 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_4)
s;
if(
%(pos)
s + 2 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_2)
s;
if(
%(pos)
s + 1 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_1)
s;
}
}
__syncthreads();
//reduce so that
%(pos)
s 0 has the reduction of everything
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (
%(pos)
s < _n &&
%(pos)
s + _n <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_n)
s;
__syncthreads();
}
}
"""
%
locals
()
...
...
theano/gpuarray/subtensor.py
浏览文件 @
c02a385a
...
...
@@ -1144,7 +1144,7 @@ class GpuDiagonal(Subtensor):
# This is also in consistence with the interface of numpy.diagonal.
if
slice_axis
<
stride_axis
:
stride_axis
-=
1
new_dim_order
=
range
(
x
[
slicer
]
.
ndim
)
new_dim_order
=
list
(
range
(
x
[
slicer
]
.
ndim
)
)
new_dim_order
=
tuple
(
new_dim_order
[:
stride_axis
]
+
new_dim_order
[
stride_axis
+
1
:]
+
[
stride_axis
,
])
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论