Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
19540d4e
提交
19540d4e
authored
2月 28, 2017
作者:
Frédéric Bastien
提交者:
GitHub
2月 28, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #5632 from abergeron/no_warpsync
Remove warp-synchronous programming.
上级
76cfd7e8
bb0a326c
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
45 行增加
和
207 行删除
+45
-207
elemwise.py
theano/gpuarray/elemwise.py
+12
-149
extra_ops.py
theano/gpuarray/extra_ops.py
+2
-2
kernel_codegen.py
theano/gpuarray/kernel_codegen.py
+30
-55
subtensor.py
theano/gpuarray/subtensor.py
+1
-1
没有找到文件。
theano/gpuarray/elemwise.py
浏览文件 @
19540d4e
...
@@ -1005,15 +1005,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1005,15 +1005,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(acc_type)
s myresult = 0;
%(acc_type)
s myresult = 0;
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
if (warpSize != 32)
{
Z[0] = -666;
return;
}
"""
%
locals
()
"""
%
locals
()
def
_assign_init
(
self
,
first_item
):
def
_assign_init
(
self
,
first_item
):
...
@@ -1117,67 +1108,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1117,67 +1108,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
acc_dtype
=
"npy_"
+
self
.
_acc_dtype
(
node
.
inputs
[
0
]
.
dtype
)
acc_dtype
=
"npy_"
+
self
.
_acc_dtype
(
node
.
inputs
[
0
]
.
dtype
)
write_out
=
write_w
(
node
.
outputs
[
0
]
.
dtype
)
write_out
=
write_w
(
node
.
outputs
[
0
]
.
dtype
)
# This code (the code in new_version) is currently ignored.
# Code produced later in this function is returned instead.
# The code here works with all nvidia driver
# But only for powers or multiples of 2!
new_version
=
"""
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
if (threadNum >= ((threadCount >> 1) * 2))
{
int idx = threadNum - (threadCount >> 1) * 2;"""
new_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[idx]'
,
'buf[threadNum]'
,
sub
,
False
)
new_version
+=
"""
}
__syncthreads();
// Works for power of 2 only.
int nTotalThreads = threadCount; // Total number of active threads
while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1); // divide by two
// only the first half of the threads will be active.
if (threadNum < halfPoint)
{
// Get the shared value stored by another thread
%(acc_dtype)
s temp = buf[threadNum + halfPoint];
"""
new_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'temp'
,
sub
,
False
)
new_version
+=
"""
}
__syncthreads();
nTotalThreads = (nTotalThreads >> 1); // divide by two.
}
__syncthreads();
if (threadNum == 0)
{
%(z_pos)
s =
%(write_out)
s(buf[0]);
}
__syncthreads();"""
new_version
=
new_version
%
locals
()
current_version
=
"""
current_version
=
"""
__syncthreads(); // some kernel do multiple reduction.
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
buf[threadNum] = myresult;
__syncthreads();
__syncthreads();
// rest of function is handled by one warp
// rest of function is handled by one warp
if (threadNum < warpSize)
if (threadNum < warpSize) {
{
//round up all the partial sums into the first `warpSize` elements
//round up all the partial sums into the first `warpSize` elements
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
{
...
@@ -1187,44 +1124,19 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1187,44 +1124,19 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
sub
,
False
)
+
"""
sub
,
False
)
+
"""
}
}
buf[threadNum] = myresult;
buf[threadNum] = myresult;
/*Comment this optimization as it don't work on Fermi GPU.
}
TODO: find why it don't work or put the GPU compute capability into the version
__syncthreads();
// no sync because only one warp is running
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if(threadCount >32)
if (threadNum < _n && threadNum + _n < threadCount)
{"""
for
num
in
[
16
,
8
,
4
,
2
,
1
]:
current_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'buf[threadNum+
%
d]'
%
num
,
sub
,
False
)
current_version
+=
"""
"""
"""
current_version
+=
"""
current_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
if (threadNum == 0)
'buf[threadNum+_n]'
,
sub
,
False
)
{
%(z_pos)
s =
%(write_out)
s(buf[0]);
}
}
else */
if (threadNum < 16)
{
//reduce so that threadNum 0 has the reduction of everything
"""
for
num
in
[
16
,
8
,
4
,
2
,
1
]:
this_if
=
"if (threadNum +
%
d < threadCount) "
%
num
+
\
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'buf[threadNum+
%
d]'
%
num
,
sub
,
False
)
current_version
+=
this_if
current_version
+=
"""
"""
current_version
+=
"""
current_version
+=
"""
if (threadNum == 0)
__syncthreads();
{
}
%(z_pos)
s =
%(write_out)
s(buf[0]);
if (threadNum == 0) {
}
%(z_pos)
s =
%(write_out)
s(buf[0]);
}
}
}
"""
"""
...
@@ -1900,7 +1812,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1900,7 +1812,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"""
%
locals
(),
file
=
sio
)
"""
%
locals
(),
file
=
sio
)
def
c_code_cache_version_apply
(
self
,
node
):
def
c_code_cache_version_apply
(
self
,
node
):
version
=
[
1
8
]
# the version corresponding to the c code in this Op
version
=
[
1
9
]
# the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
# now we insert versions for the ops on which we depend...
scalar_node
=
Apply
(
scalar_node
=
Apply
(
...
@@ -1953,11 +1865,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1953,11 +1865,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
{
%(reduce_fct)
s
%(reduce_fct)
s
...
@@ -1997,11 +1904,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1997,11 +1904,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
{
%(reduce_fct)
s
%(reduce_fct)
s
...
@@ -2042,11 +1944,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2042,11 +1944,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
...
@@ -2169,12 +2066,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2169,12 +2066,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
...
@@ -2221,11 +2112,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2221,11 +2112,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
X = (const
%(in_type)
s *)(((char *)X)+offset_X);
X = (const
%(in_type)
s *)(((char *)X)+offset_X);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int a = blockIdx.x; a < A; a += gridDim.x)
for (int a = blockIdx.x; a < A; a += gridDim.x)
{
{
for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y)
for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y)
...
@@ -2279,12 +2165,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2279,12 +2165,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
print
(
"""
print
(
"""
%(decl)
s
%(decl)
s
{
{
if(warpSize<blockDim.x){
//TODO: set error code
Z[0] = -666;
return;
}
%(init)
s
%(init)
s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
{
...
@@ -2332,13 +2212,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2332,13 +2212,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
//TODO: set error code
Z[blockIdx.x * sZ0] =
%(write_out)
s(-666);
return;
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
...
@@ -2445,11 +2318,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2445,11 +2318,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
...
@@ -2601,11 +2469,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2601,11 +2469,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
A = (const
%(in_type)
s *)(((char *)A)+offset_A);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
Z = (
%(out_type)
s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{
{
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
...
...
theano/gpuarray/extra_ops.py
浏览文件 @
19540d4e
...
@@ -111,7 +111,7 @@ class GpuCumOp(GpuKernelBase, Op):
...
@@ -111,7 +111,7 @@ class GpuCumOp(GpuKernelBase, Op):
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
local_barrier();
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index < blockDim.x*2) {
if
(index < blockDim.x*2) {
partialCumOp[index]
%(op)
s= partialCumOp[index - stride];
partialCumOp[index]
%(op)
s= partialCumOp[index - stride];
}
}
}
}
...
@@ -136,7 +136,7 @@ class GpuCumOp(GpuKernelBase, Op):
...
@@ -136,7 +136,7 @@ class GpuCumOp(GpuKernelBase, Op):
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
local_barrier();
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index + stride < blockDim.x*2) {
if
(index + stride < blockDim.x*2) {
partialCumOp[index + stride]
%(op)
s= partialCumOp[index];
partialCumOp[index + stride]
%(op)
s= partialCumOp[index];
}
}
}
}
...
...
theano/gpuarray/kernel_codegen.py
浏览文件 @
19540d4e
...
@@ -57,7 +57,7 @@ def code_version(version):
...
@@ -57,7 +57,7 @@ def code_version(version):
UNVERSIONED
=
()
UNVERSIONED
=
()
@code_version
((
1
,))
@code_version
((
2
,))
def
inline_reduce
(
N
,
buf
,
pos
,
count
,
manner_fn
):
def
inline_reduce
(
N
,
buf
,
pos
,
count
,
manner_fn
):
"""
"""
Return C++ code for a function that reduces a contiguous buffer.
Return C++ code for a function that reduces a contiguous buffer.
...
@@ -89,37 +89,25 @@ def inline_reduce(N, buf, pos, count, manner_fn):
...
@@ -89,37 +89,25 @@ def inline_reduce(N, buf, pos, count, manner_fn):
"""
"""
loop_line
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
(
buf
))
loop_line
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
(
buf
))
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_n
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+_n]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
r_2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+2]"
%
(
buf
,
pos
))
r_1
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+1]"
%
(
buf
,
pos
))
return
"""
return
"""
{
{
// This function trashes buf[1..warpSize],
// This function trashes buf[1..warpSize],
// leaving the reduction result in buf[0].
// leaving the reduction result in buf[0].
if (
%(pos)
s < warpSize)
if (
%(pos)
s < warpSize) {
{
for (int i =
%(pos)
s + warpSize; i <
%(N)
s; i += warpSize)
for (int i =
%(pos)
s + warpSize; i <
%(N)
s; i += warpSize)
{
{
%(buf)
s[
%(pos)
s] =
%(loop_line)
s;
%(buf)
s[
%(pos)
s] =
%(loop_line)
s;
}
}
if (
%(pos)
s < 16)
}
{
__syncthreads();
//reduce so that
%(pos)
s 0 has the sum of everything
//reduce so that
%(pos)
s 0 has the reduction of everything
if(
%(pos)
s + 16 <
%(N)
s)
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
%(buf)
s[
%(pos)
s] =
%(r_16)
s;
if (
%(pos)
s < _n &&
%(pos)
s + _n <
%(N)
s)
if(
%(pos)
s + 8 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_n)
s;
%(buf)
s[
%(pos)
s] =
%(r_8)
s;
__syncthreads();
if(
%(pos)
s + 4 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_4)
s;
if(
%(pos)
s + 2 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_2)
s;
if(
%(pos)
s + 1 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_1)
s;
}
}
}
}
}
"""
%
locals
()
"""
%
locals
()
...
@@ -205,7 +193,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
...
@@ -205,7 +193,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
]
]
@code_version
((
2
,))
@code_version
((
3
,))
def
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
load_x
,
pos
,
count
,
def
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
load_x
,
pos
,
count
,
manner_fn
,
manner_init
,
manner_fn
,
manner_init
,
b
=
''
,
stride_b
=
''
,
load_b
=
''
,
dtype
=
'float32'
):
b
=
''
,
stride_b
=
''
,
load_b
=
''
,
dtype
=
'float32'
):
...
@@ -231,14 +219,6 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
...
@@ -231,14 +219,6 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
Index of executing thread.
Index of executing thread.
count
count
Number of executing threads.
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
manner_fn
manner_fn
A function that accepts strings of arguments a and b, and
A function that accepts strings of arguments a and b, and
returns c code for their reduction.
returns c code for their reduction.
...
@@ -249,6 +229,14 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
...
@@ -249,6 +229,14 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_init
manner_init
A function that accepts strings of arguments a and return c
A function that accepts strings of arguments a and return c
code for its initialization.
code for its initialization.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
Notes
Notes
-----
-----
...
@@ -268,11 +256,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
...
@@ -268,11 +256,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
locals
()))
locals
()))
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
buf
)
"
%
s[i]"
%
buf
)
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_n
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+_n]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
r_2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+2]"
%
(
buf
,
pos
))
r_1
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+1]"
%
(
buf
,
pos
))
ctype
=
gpuarray
.
dtype_to_ctype
(
dtype
)
ctype
=
gpuarray
.
dtype_to_ctype
(
dtype
)
return
"""
return
"""
...
@@ -281,31 +265,22 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
...
@@ -281,31 +265,22 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
// leaving the reduction result in buf[0].
// leaving the reduction result in buf[0].
%(ctype)
s red =
%(init)
s;
%(ctype)
s red =
%(init)
s;
#pragma unroll 16
#pragma unroll 16
for (int i =
%(pos)
s +
%(count)
s; i<
%(N)
s; i +=
%(count)
s){
for (int i =
%(pos)
s +
%(count)
s; i<
%(N)
s; i +=
%(count)
s)
{
red =
%(loop_line)
s;
red =
%(loop_line)
s;
}
}
buf[
%(pos)
s] = red;
buf[
%(pos)
s] = red;
__syncthreads();
__syncthreads();
if (
%(pos)
s < warpSize)
if (
%(pos)
s < warpSize) {
{
for (int i =
%(pos)
s + warpSize; i <
%(count)
s; i += warpSize) {
for (int i =
%(pos)
s + warpSize; i <
%(count)
s; i += warpSize)
{
%(buf)
s[
%(pos)
s] =
%(loop_line2)
s;
%(buf)
s[
%(pos)
s] =
%(loop_line2)
s;
}
}
if (
%(pos)
s < 16)
}
{
__syncthreads();
//reduce so that
%(pos)
s 0 has the reduction of everything
//reduce so that
%(pos)
s 0 has the reduction of everything
if(
%(pos)
s + 16 <
%(N)
s)
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
%(buf)
s[
%(pos)
s] =
%(r_16)
s;
if (
%(pos)
s < _n &&
%(pos)
s + _n <
%(N)
s)
if(
%(pos)
s + 8 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_n)
s;
%(buf)
s[
%(pos)
s] =
%(r_8)
s;
__syncthreads();
if(
%(pos)
s + 4 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_4)
s;
if(
%(pos)
s + 2 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_2)
s;
if(
%(pos)
s + 1 <
%(N)
s)
%(buf)
s[
%(pos)
s] =
%(r_1)
s;
}
}
}
}
}
"""
%
locals
()
"""
%
locals
()
...
...
theano/gpuarray/subtensor.py
浏览文件 @
19540d4e
...
@@ -1144,7 +1144,7 @@ class GpuDiagonal(Subtensor):
...
@@ -1144,7 +1144,7 @@ class GpuDiagonal(Subtensor):
# This is also in consistence with the interface of numpy.diagonal.
# This is also in consistence with the interface of numpy.diagonal.
if
slice_axis
<
stride_axis
:
if
slice_axis
<
stride_axis
:
stride_axis
-=
1
stride_axis
-=
1
new_dim_order
=
range
(
x
[
slicer
]
.
ndim
)
new_dim_order
=
list
(
range
(
x
[
slicer
]
.
ndim
)
)
new_dim_order
=
tuple
(
new_dim_order
[:
stride_axis
]
+
new_dim_order
=
tuple
(
new_dim_order
[:
stride_axis
]
+
new_dim_order
[
stride_axis
+
1
:]
+
new_dim_order
[
stride_axis
+
1
:]
+
[
stride_axis
,
])
[
stride_axis
,
])
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论