Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
41daf4a8
提交
41daf4a8
authored
3月 07, 2015
作者:
Sean Lee
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Use the CUDA Driver API for conv operations
上级
0d5cffbe
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
1043 行增加
和
640 行删除
+1043
-640
conv.cu
theano/sandbox/gpuarray/conv.cu
+526
-541
conv.py
theano/sandbox/gpuarray/conv.py
+294
-23
conv_full_kernel.cu
theano/sandbox/gpuarray/conv_full_kernel.cu
+70
-23
conv_kernel.cu
theano/sandbox/gpuarray/conv_kernel.cu
+153
-53
没有找到文件。
theano/sandbox/gpuarray/conv.cu
浏览文件 @
41daf4a8
...
...
@@ -10,12 +10,6 @@ PyObject * PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
const
size_t
subsample_cols
,
const
int
version
,
const
int
verbose
);
// Integer division of a by b, bumped up by one whenever the division leaves
// a non-zero remainder.
//
// Because built-in integer division truncates toward zero, the result is the
// mathematical ceiling of a / b only when the true quotient is non-negative
// (e.g. the element/thread counts this helper is used with in this file).
// b must be non-zero; no check is performed here.
template <typename T>
static T ceil_intdiv(T a, T b)
{
    const T quotient = a / b;
    const T remainder = a % b;
    if (remainder)
        return quotient + 1;  // partial chunk left over: count it as a full one
    return quotient;
}
/*
* version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation
...
...
@@ -108,6 +102,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
const
int
stack_len
=
PyGpuArray_DIMS
(
img
)[
1
];
const
int
nstack
=
PyGpuArray_DIMS
(
kern
)[
1
];
const
int
nbatch
=
PyGpuArray_DIMS
(
img
)[
0
];
const
int
nkern
=
PyGpuArray_DIMS
(
kern
)[
0
];
...
...
@@ -126,6 +121,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
const
int
kern_stride_row
=
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
;
const
int
kern_stride_stack
=
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
;
const
int
kern_stride_nkern
=
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
;
const
int
out_stride_col
=
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
;
const
int
out_stride_row
=
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
;
const
int
out_stride_nkern
=
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
;
const
int
out_stride_batch
=
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
;
const
int
img_size
=
img_len
*
img_wid
;
const
int
kern_size
=
kern_len
*
kern_wid
;
...
...
@@ -156,16 +155,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//we don't need to unflip it, but have the new value when we unflip it.
bool
kern_flipped
=
true
;
bool
kern_contiguous_2d_unflipped
=
kern_contiguous_2d
;
const
float
*
kern_data_unflipped
=
cuda_get_ptr
(
kern
);
int
kern_stride_col_unflipped
=
kern_stride_col
;
int
kern_stride_row_unflipped
=
kern_stride_row
;
if
(
kern_stride_col_unflipped
==-
1
&&
kern_stride_row_unflipped
==-
kern_wid
){
if
(
kern_stride_col
==-
1
&&
kern_stride_row
==-
kern_wid
){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped
=
1
;
kern_stride_row_unflipped
=
kern_wid
;
kern_flipped
=
false
;
kern_contiguous_2d_unflipped
=
true
;
kern_data_unflipped
=&
(
cuda_get_ptr
(
kern
)[(
kern_wid
-
1
)
*
kern_stride_col
+
(
kern_len
-
1
)
*
kern_stride_row
]);
}
//if we remove the restriction
...
...
@@ -195,43 +188,47 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//we go through ceil_intdiv in case out_len is not a multiple of nb_split; we want nb_split to be the number of iterations.
while
(
ceil_intdiv
(
out_len
,
nb_split
)
*
out_wid
>
max_threads_dim0
)
nb_split
++
;
dim3
threads
(
out_wid
,
ceil_intdiv
(
out_len
,
nb_split
));
dim3
grid
(
nbatch
,
nkern
);
int
shared_size
=
(
img_size
+
kern_size
)
*
sizeof
(
float
);
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
);
if
(
threads
.
y
==
out_len
)
f
=
conv_patch_2
;
else
f
=
conv_patch_3
;
f
<<<
grid
,
threads
,
shared_size
>>>
(
cuda_get_ptr
(
img
),
cuda_get_ptr
(
kern
),
cuda_get_ptr
(
out
),
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
);
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
size_t
threads_per_block
[
3
]
=
{(
size_t
)
out_wid
,
ceil_intdiv
((
size_t
)
out_len
,(
size_t
)
nb_split
),
(
size_t
)
1
};
size_t
n_blocks
[
3
]
=
{(
size_t
)
nbatch
,
(
size_t
)
nkern
,
(
size_t
)
1
};
size_t
shmem_sz
=
(
img_size
+
kern_size
)
*
sizeof
(
float
);
GpuKernel
*
k
=
NULL
;
if
(
threads_per_block
[
1
]
==
out_len
)
k
=&
conv_patch_2_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
k
=&
conv_patch_3_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
void
*
kernel_params
[]
=
{(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
nkern
,
(
void
*
)
&
nstack
};
int
err
=
GpuKernel_call
(
k
,
3
,
threads_per_block
,
n_blocks
,
shmem_sz
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
if
(
verbose
)
fprintf
(
stderr
,
"INFO: used 'conv_patch' version %s nb_split=%d
\n
"
,
threads
.
y
==
out_len
?
"no split"
:
"split"
,
nb_split
);
threads
_per_block
[
1
]
==
out_len
?
"no split"
:
"split"
,
nb_split
);
work_complete
=
true
;
}
else
{
if
(
verbose
)
fprintf
(
stderr
,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i, nb_split=%i
\n
"
,
threads
.
x
,
threads
.
y
,
grid
.
x
,
grid
.
y
,
shared_size
,
threads
.
x
*
threads
.
y
,
nb_split
);
"threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i, nb_split=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
],
nb_split
);
if
(
verbose
)
fprintf
(
stderr
,
"INFO: impl 'conv_patch' failed (%s),"
" trying next implementation
\n
"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
k
,
err
));
}
}
...
...
@@ -250,75 +247,77 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if
((
version
==
3
||
version
==
12
)
&&
out_len
>
1
)
nb_split
++
;
//to force the use of split=true when testing.
//we go through ceil_intdiv in case out_len is not a multiple of nb_split; we want nb_split to be the number of iterations.
while
(
ceil_intdiv
(
out_len
,
nb_split
)
*
out_wid
>
max_threads_dim0
)
nb_split
++
;
dim3
threads
(
out_wid
,
ceil_intdiv
(
out_len
,
nb_split
));
size_t
threads_per_block
[
3
]
=
{(
size_t
)
out_wid
,
(
size_t
)
ceil_intdiv
(
out_len
,
nb_split
),
(
size_t
)
1
};
bool
preload_full_kernel
=
(
img_size_byte
+
kern_size_byte
)
<
shared_avail
;
if
(
version
==
11
||
version
==
12
)
preload_full_kernel
=
false
;
dim3
grid
(
nbatch
,
nkern
)
;
int
shared_size
=
(
img_size
+
(
preload_full_kernel
?
kern_size
:
kern_wid
))
*
sizeof
(
float
);
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
);
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_64
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_65
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_66
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_67
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_68
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_69
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_7
0
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_71
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_72
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_73
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_74
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_75
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_76
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_77
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_78
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_79
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_8
0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_81
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_82
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_83
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_84
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_85
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_86
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_87
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_88
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_89
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_90
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_91
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_92
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_93
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
f
=
conv_patch_stack_94
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
f
=
conv_patch_stack_95
;}
f
<<<
grid
,
threads
,
shared_size
>>>
(
cuda_get_ptr
(
img
),
cuda_get_ptr
(
kern
),
cuda_get_ptr
(
out
)
,
img_len
,
img_wid
,
kern_len
,
kern_wid
,
out_len
,
out_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
img_stride_stack
,
img_stride_batch
,
kern_stride_col
,
kern_stride_row
,
kern_stride_stack
,
kern_stride_nkern
,
subsample_rows
,
subsample_cols
);
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
size_t
n_blocks
[
3
]
=
{(
size_t
)
nbatch
,
(
size_t
)
nkern
,
(
size_t
)
1
}
;
size_t
shmem_sz
=
(
img_size
+
(
preload_full_kernel
?
kern_size
:
kern_wid
))
*
sizeof
(
float
);
GpuKernel
*
k
=
NULL
;
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_64_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_65_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_66_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_67_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_68_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_69_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_70_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_71_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_72_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_73_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_74_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_75_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_76_node_
<<<<
HASH_PLACEHOLDER
>>>>
_
0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_77_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_78_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
!
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_79_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_80_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_81_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_82_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_83_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_84_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_85_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_86_node_
<<<<
HASH_PLACEHOLDER
>>>>
_
0
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
!
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_87_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_88_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_89_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_90_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
==
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_91_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_92_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
!
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_93_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
!
subsample
){
k
=&
conv_patch_stack_94_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
else
if
(
preload_full_kernel
&&
nb_split
!=
1
&&
img_contiguous_2d
&&
kern_contiguous_2d
&&
subsample
){
k
=&
conv_patch_stack_95_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;}
void
*
kernel_params
[]
=
{(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
out_len
,
(
void
*
)
&
out_wid
,
(
void
*
)
&
nkern
,
(
void
*
)
&
nstack
,
(
void
*
)
&
img_stride_col
,
(
void
*
)
&
img_stride_row
,
(
void
*
)
&
img_stride_stack
,
(
void
*
)
&
img_stride_batch
,
(
void
*
)
&
kern_stride_col
,
(
void
*
)
&
kern_stride_row
,
(
void
*
)
&
kern_stride_stack
,
(
void
*
)
&
kern_stride_nkern
,
(
void
*
)
&
subsample_rows
,
(
void
*
)
&
subsample_cols
};
int
err
=
GpuKernel_call
(
k
,
3
,
threads_per_block
,
n_blocks
,
shmem_sz
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
if
(
verbose
>
1
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, grid.x=%i, grid.y
=%i,"
" sh
ared_size
=%i, nb_threads=%i,"
"threads
_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]
=%i,"
" sh
mem_sz
=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu
\n
"
,
threads
.
x
,
threads
.
y
,
grid
.
x
,
grid
.
y
,
sh
ared_size
,
threads
.
x
*
threads
.
y
,
threads
_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
]
,
sh
mem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
,
THEANO_KERN_WID
,
img_contiguous_2d
,
kern_contiguous_2d
,
nb_split
,
preload_full_kernel
,
(
unsigned
long
long
)
subsample_rows
,
...
...
@@ -337,15 +336,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if
(
verbose
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, grid.x=%i, grid.y
=%i,"
" sh
ared_size
=%i, nb_threads=%i,"
"threads
_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]
=%i,"
" sh
mem_sz
=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu
\n
"
,
threads
.
x
,
threads
.
y
,
grid
.
x
,
grid
.
y
,
sh
ared_size
,
threads
.
x
*
threads
.
y
,
threads
_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
]
,
sh
mem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
,
THEANO_KERN_WID
,
img_contiguous_2d
,
kern_contiguous_2d
,
nb_split
,
preload_full_kernel
,
(
unsigned
long
long
)
subsample_rows
,
...
...
@@ -354,7 +353,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
fprintf
(
stderr
,
"INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation
\n
"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
k
,
err
));
}
}
...
...
@@ -366,28 +365,28 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
!
work_complete
)
//conv_rows
{
dim3
threads
(
out_wid
)
;
dim3
grid
(
out_len
,
nbatch
*
nkern
)
;
int
shared_size
=
(
kern_len
*
img_wid
+
kern_size
)
*
sizeof
(
float
);
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
);
if
(
!
img_contiguous_2d
||
!
kern_contiguous_2d
)
f
=
conv_rows_0
;
else
f
=
conv_rows_1
;
f
<<<
grid
,
threads
,
shared_size
>>>
(
cuda_get_ptr
(
img
),
cuda_get_ptr
(
kern
),
cuda_get_ptr
(
out
)
,
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
img_stride_stack
,
img_stride_batch
,
kern_stride_col
,
kern_stride_row
,
kern_stride_stack
,
kern_stride_nkern
)
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
size_t
threads_per_block
[
3
]
=
{(
size_t
)
out_wid
,
(
size_t
)
1
,
(
size_t
)
1
}
;
size_t
n_blocks
[
3
]
=
{(
size_t
)
out_len
,
(
size_t
)
nbatch
*
nkern
,
(
size_t
)
1
}
;
size_t
shmem_sz
=
(
kern_len
*
img_wid
+
kern_size
)
*
sizeof
(
float
);
GpuKernel
*
k
=
NULL
;
if
(
!
img_contiguous_2d
||
!
kern_contiguous_2d
)
k
=&
conv_rows_0_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
k
=&
conv_rows_1_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
void
*
kernel_params
[]
=
{
(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
nkern
,
(
void
*
)
&
nstack
,
(
void
*
)
&
img_stride_col
,
(
void
*
)
&
img_stride_row
,
(
void
*
)
&
img_stride_stack
,
(
void
*
)
&
img_stride_batch
,
(
void
*
)
&
kern_stride_col
,
(
void
*
)
&
kern_stride_row
,
(
void
*
)
&
kern_stride_stack
,
(
void
*
)
&
kern_stride_nkern
}
;
int
err
=
GpuKernel_call
(
k
,
3
,
threads_per_block
,
n_blocks
,
shmem_sz
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
work_complete
=
true
;
if
(
verbose
)
...
...
@@ -397,15 +396,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if
(
verbose
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, grid.x=%i, grid.y
=%i,"
" sh
ared_size
=%i, nb_threads=%i
\n
"
,
threads
.
x
,
threads
.
y
,
grid
.
x
,
grid
.
y
,
sh
ared_size
,
threads
.
x
*
threads
.
y
);
"threads
_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]
=%i,"
" sh
mem_sz
=%i, nb_threads=%i
\n
"
,
threads
_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
]
,
sh
mem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
);
if
(
verbose
)
fprintf
(
stderr
,
"INFO: impl 'conv_rows' failed (%s),"
" trying next implementation
\n
"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
k
,
err
));
}
}
if
(
!
subsample
&&
out_contiguous
&&
...
...
@@ -423,52 +422,50 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
nb_row
=
i
;
}
dim3
threads
(
out_wid
,
nb_row
);
dim3
grid
(
ceil_intdiv
(
out_len
,
nb_row
),
nbatch
*
nkern
);
size_t
threads_per_block
[
3
]
=
{(
size_t
)
out_wid
,
(
size_t
)
nb_row
,
(
size_t
)
1
};
size_t
n_blocks
[
3
]
=
{(
size_t
)
ceil_intdiv
(
out_len
,
nb_row
),
(
size_t
)
nbatch
*
nkern
,
(
size_t
)
1
};
int
shared_size
=
((
kern_len
+
nb_row
-
1
)
*
img_wid
+
kern_size
)
*
sizeof
(
float
);
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
);
size_t
shmem_sz
=
((
kern_len
+
nb_row
-
1
)
*
img_wid
+
kern_size
)
*
sizeof
(
float
);
if
(
0
)
fprintf
(
stderr
,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)
\n
"
,
img_contiguous_2d
,
kern_contiguous_2d
,
threads
.
x
,
threads
.
y
,
threads
.
z
,
grid
.
x
,
grid
.
y
,
grid
.
z
);
threads
_per_block
[
0
],
threads_per_block
[
1
],
threads_per_block
[
2
]
,
n_blocks
[
0
],
n_blocks
[
1
],
n_blocks
[
2
]
);
GpuKernel
*
k
=
NULL
;
if
(
!
img_contiguous_2d
||
!
kern_contiguous_2d
)
{
//fprintf(stderr, "using false version\n");
f
=
conv_rows_stack
_0
;
k
=&
conv_rows_stack_0_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
}
else
{
//fprintf(stderr, "using true version\n");
f
=
conv_rows_stack_1
;
k
=&
conv_rows_stack_1_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
}
f
<<<
grid
,
threads
,
shared_size
>>>
(
cuda_get_ptr
(
img
),
cuda_get_ptr
(
kern
),
cuda_get_ptr
(
out
),
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
img_stride_stack
,
img_stride_batch
,
kern_stride_col
,
kern_stride_row
,
kern_stride_stack
,
kern_stride_nkern
);
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
void
*
kernel_params
[]
=
{
(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
nkern
,
(
void
*
)
&
nstack
,
(
void
*
)
&
img_stride_col
,
(
void
*
)
&
img_stride_row
,
(
void
*
)
&
img_stride_stack
,
(
void
*
)
&
img_stride_batch
,
(
void
*
)
&
kern_stride_col
,
(
void
*
)
&
kern_stride_row
,
(
void
*
)
&
kern_stride_stack
,
(
void
*
)
&
kern_stride_nkern
};
int
err
=
GpuKernel_call
(
k
,
3
,
threads_per_block
,
n_blocks
,
shmem_sz
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
work_complete
=
true
;
if
(
verbose
>
1
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, grid.x=%i, grid.y
=%i,"
" sh
ared_size
=%i, nb_threads=%i
\n
"
,
threads
.
x
,
threads
.
y
,
grid
.
x
,
grid
.
y
,
sh
ared_size
,
threads
.
x
*
threads
.
y
);
"threads
_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]
=%i,"
" sh
mem_sz
=%i, nb_threads=%i
\n
"
,
threads
_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
]
,
sh
mem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
);
if
(
verbose
)
fprintf
(
stderr
,
"INFO: used 'conv_rows_stack' version
\n
"
);
}
...
...
@@ -476,15 +473,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if
(
verbose
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, grid.x=%i, grid.y
=%i,"
" sh
ared_size
=%i, nb_threads=%i
\n
"
,
threads
.
x
,
threads
.
y
,
grid
.
x
,
grid
.
y
,
sh
ared_size
,
threads
.
x
*
threads
.
y
);
"threads
_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]
=%i,"
" sh
mem_sz
=%i, nb_threads=%i
\n
"
,
threads
_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
]
,
sh
mem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
);
if
(
verbose
)
fprintf
(
stderr
,
"INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation
\n
"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
k
,
err
));
}
}
...
...
@@ -517,42 +514,41 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//to test the case when we don't have a thread by output pixel.
if
((
version_back
!=-
1
)
&&
nb_row
>
1
)
nb_row
--
;
dim3
threads
(
out_wid
,
nb_row
);
dim3
grid
(
ceil_intdiv
(
out_len
,
nb_row
),
nbatch
*
nkern
);
size_t
threads_per_block
[
3
]
=
{(
size_t
)
out_wid
,
(
size_t
)
nb_row
,
(
size_t
)
1
};
size_t
n_blocks
[
3
]
=
{(
size_t
)
ceil_intdiv
(
out_len
,
nb_row
),
(
size_t
)
nbatch
*
nkern
,
(
size_t
)
1
};
int
shared_size
=
(
threads
.
y
*
img_wid
+
k_size
)
*
sizeof
(
float
);
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
);
if
((
!
img_contiguous_2d
||
!
kern_contiguous_2d
)
&&
version
==
9
)
f
=
conv_rows_stack2_1
;
else
if
(
version
==
9
)
f
=
conv_rows_stack2_3
;
else
if
(
!
img_contiguous_2d
||
!
kern_contiguous_2d
)
f
=
conv_rows_stack2_0
;
else
f
=
conv_rows_stack2_2
;
f
<<<
grid
,
threads
,
shared_size
>>>
(
cuda_get_ptr
(
img
),
cuda_get_ptr
(
kern
),
cuda_get_ptr
(
out
),
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
img_stride_stack
,
img_stride_batch
,
kern_stride_col
,
kern_stride_row
,
kern_stride_stack
,
kern_stride_nkern
);
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
size_t
shmem_sz
=
((
kern_len
+
nb_row
-
1
)
*
img_wid
+
kern_size
)
*
sizeof
(
float
);
GpuKernel
*
k
=
NULL
;
if
((
!
img_contiguous_2d
||
!
kern_contiguous_2d
)
&&
version
==
9
)
k
=&
conv_rows_stack2_1_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
version
==
9
)
k
=&
conv_rows_stack2_3_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
!
img_contiguous_2d
||
!
kern_contiguous_2d
)
k
=&
conv_rows_stack2_0_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
k
=&
conv_rows_stack2_2_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
void
*
kernel_params
[]
=
{
(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
nkern
,
(
void
*
)
&
nstack
,
(
void
*
)
&
img_stride_col
,
(
void
*
)
&
img_stride_row
,
(
void
*
)
&
img_stride_stack
,
(
void
*
)
&
img_stride_batch
,
(
void
*
)
&
kern_stride_col
,
(
void
*
)
&
kern_stride_row
,
(
void
*
)
&
kern_stride_stack
,
(
void
*
)
&
kern_stride_nkern
};
int
err
=
GpuKernel_call
(
k
,
3
,
threads_per_block
,
n_blocks
,
shmem_sz
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
work_complete
=
true
;
if
(
verbose
>
1
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, grid.x=%i, grid.y
=%i,"
" sh
ared_size
=%i, nb_threads=%i
\n
"
,
threads
.
x
,
threads
.
y
,
grid
.
x
,
grid
.
y
,
sh
ared_size
,
threads
.
x
*
threads
.
y
);
"threads
_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]
=%i,"
" sh
mem_sz
=%i, nb_threads=%i
\n
"
,
threads
_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
]
,
sh
mem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
);
if
(
verbose
)
fprintf
(
stderr
,
"INFO: used 'conv_rows_stack2' version %s with"
...
...
@@ -564,15 +560,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if
(
verbose
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, grid.x=%i, grid.y
=%i,"
" sh
ared_size
=%i, nb_threads=%i version=%d
\n
"
,
threads
.
x
,
threads
.
y
,
grid
.
x
,
grid
.
y
,
sh
ared_size
,
threads
.
x
*
threads
.
y
,(
version
==
9
?
2
:
3
));
"threads
_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]
=%i,"
" sh
mem_sz
=%i, nb_threads=%i version=%d
\n
"
,
threads
_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
]
,
sh
mem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
,(
version
==
9
?
2
:
3
));
if
(
verbose
)
fprintf
(
stderr
,
"INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation
\n
"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
k
,
err
));
}
}
...
...
@@ -619,18 +615,18 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
nb_split
++
;
// tentative estimates (prior to constraint c)
in
t
thread_z
=
ceil_intdiv
(
kern_len
,
nb_split
);
int
shared_size
=
sizeof
(
float
)
*
(
full_kern
?
std
::
max
(
img_size
+
kern_size
,
out_size
*
thread_z
)
:
std
::
max
(
img_size
+
thread_z
*
kern_wid
,
out_size
*
thread_z
));
size_
t
thread_z
=
ceil_intdiv
(
kern_len
,
nb_split
);
size_t
shmem_sz
=
sizeof
(
float
)
*
(
full_kern
?
std
::
max
(
(
size_t
)
img_size
+
kern_size
,
out_size
*
thread_z
)
:
std
::
max
(
(
size_t
)
img_size
+
thread_z
*
kern_wid
,
out_size
*
thread_z
));
// constraint (c)
while
((
sh
ared_size
>=
shared_avail
)
&&
(
nb_split
<=
kern_len
)){
while
((
sh
mem_sz
>=
shared_avail
)
&&
(
nb_split
<=
kern_len
)){
//if we can't fit the kernel in shared memory, we must split it more.
nb_split
++
;
thread_z
=
ceil_intdiv
(
kern_len
,
nb_split
);
sh
ared_size
=
sizeof
(
float
)
*
(
full_kern
?
std
::
max
(
img_size
+
kern_size
,
out_size
*
thread_z
)
sh
mem_sz
=
sizeof
(
float
)
*
(
full_kern
?
std
::
max
(
(
size_t
)
img_size
+
kern_size
,
out_size
*
thread_z
)
:
std
::
max
(
img_size
+
thread_z
*
kern_wid
,
out_size
*
thread_z
));
}
if
(
nb_split
<=
kern_len
)
...
...
@@ -638,15 +634,12 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
assert
(
thread_z
>
0
);
//should not happen, but in case...
if
(
!
full_kern
)
assert
(
thread_z
!=
kern_len
);
dim3
threads
(
out_wid
,
out_len
,
thread_z
);
dim3
grid
(
nbatch
,
nkern
);
size_t
threads_per_block
[
3
]
=
{(
size_t
)
out_wid
,
(
size_t
)
out_len
,
(
size_t
)
thread_z
};
size_t
n_blocks
[
3
]
=
{(
size_t
)
nbatch
,
(
size_t
)
nkern
,
(
size_t
)
1
};
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
);
GpuKernel
*
k
=
NULL
;
const
bool
split
=
thread_z
!=
kern_len
;
const
bool
ccontig
=
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
;
...
...
@@ -654,40 +647,46 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
/* if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce_0;*/
/*else*/
if
(
!
kern_flipped
&&
!
ccontig
&&
!
split
&&
full_kern
)
f
=
conv_patch_stack_reduce_1
;
else
if
(
!
kern_flipped
&&
!
ccontig
&&
split
&&
!
full_kern
)
f
=
conv_patch_stack_reduce_2
;
else
if
(
!
kern_flipped
&&
!
ccontig
&&
split
&&
full_kern
)
f
=
conv_patch_stack_reduce_3
;
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce_4;*/
else
if
(
!
kern_flipped
&&
ccontig
&&
!
split
&&
full_kern
)
f
=
conv_patch_stack_reduce_5
;
else
if
(
!
kern_flipped
&&
ccontig
&&
split
&&
!
full_kern
)
f
=
conv_patch_stack_reduce_6
;
else
if
(
!
kern_flipped
&&
ccontig
&&
split
&&
full_kern
)
f
=
conv_patch_stack_reduce_7
;
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce_8;*/
else
if
(
kern_flipped
&&
!
ccontig
&&
!
split
&&
full_kern
)
f
=
conv_patch_stack_reduce_9
;
else
if
(
kern_flipped
&&
!
ccontig
&&
split
&&
!
full_kern
)
f
=
conv_patch_stack_reduce_10
;
else
if
(
kern_flipped
&&
!
ccontig
&&
split
&&
full_kern
)
f
=
conv_patch_stack_reduce_11
;
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce_12;*/
else
if
(
kern_flipped
&&
ccontig
&&
!
split
&&
full_kern
)
f
=
conv_patch_stack_reduce_13
;
else
if
(
kern_flipped
&&
ccontig
&&
split
&&
!
full_kern
)
f
=
conv_patch_stack_reduce_14
;
else
if
(
kern_flipped
&&
ccontig
&&
split
&&
full_kern
)
f
=
conv_patch_stack_reduce_15
;
f
<<<
grid
,
threads
,
shared_size
>>>
(
cuda_get_ptr
(
img
),
kern_data_unflipped
,
cuda_get_ptr
(
out
),
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
img_stride_stack
,
img_stride_batch
,
kern_stride_col_unflipped
,
kern_stride_row_unflipped
,
kern_stride_stack
,
kern_stride_nkern
);
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
/* if(!kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
/*else*/
if
(
!
kern_flipped
&&
!
ccontig
&&
!
split
&&
full_kern
)
k
=&
conv_patch_stack_reduce_1_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
!
kern_flipped
&&
!
ccontig
&&
split
&&
!
full_kern
)
k
=&
conv_patch_stack_reduce_2_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
!
kern_flipped
&&
!
ccontig
&&
split
&&
full_kern
)
k
=&
conv_patch_stack_reduce_3_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
/*else if(!kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_4_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else
if
(
!
kern_flipped
&&
ccontig
&&
!
split
&&
full_kern
)
k
=&
conv_patch_stack_reduce_5_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
!
kern_flipped
&&
ccontig
&&
split
&&
!
full_kern
)
k
=&
conv_patch_stack_reduce_6_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
!
kern_flipped
&&
ccontig
&&
split
&&
full_kern
)
k
=&
conv_patch_stack_reduce_7_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
/*else if(kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_8_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else
if
(
kern_flipped
&&
!
ccontig
&&
!
split
&&
full_kern
)
k
=&
conv_patch_stack_reduce_9_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
kern_flipped
&&
!
ccontig
&&
split
&&
!
full_kern
)
k
=&
conv_patch_stack_reduce_10_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
kern_flipped
&&
!
ccontig
&&
split
&&
full_kern
)
k
=&
conv_patch_stack_reduce_11_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
/*else if(kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_12_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else
if
(
kern_flipped
&&
ccontig
&&
!
split
&&
full_kern
)
k
=&
conv_patch_stack_reduce_13_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
kern_flipped
&&
ccontig
&&
split
&&
!
full_kern
)
k
=&
conv_patch_stack_reduce_14_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
kern_flipped
&&
ccontig
&&
split
&&
full_kern
)
k
=&
conv_patch_stack_reduce_15_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
void
*
kernel_params
[]
=
{
(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
nkern
,
(
void
*
)
&
nstack
,
(
void
*
)
&
img_stride_col
,
(
void
*
)
&
img_stride_row
,
(
void
*
)
&
img_stride_stack
,
(
void
*
)
&
img_stride_batch
,
(
void
*
)
&
kern_stride_col
,
(
void
*
)
&
kern_stride_row
,
(
void
*
)
&
kern_stride_stack
,
(
void
*
)
&
kern_stride_nkern
};
int
err
=
GpuKernel_call
(
k
,
3
,
threads_per_block
,
n_blocks
,
shmem_sz
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
if
(
verbose
>
1
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, threads.z
=%i, "
"
grid.x=%i, grid.y=%i, shared_size
=%i,"
"threads
_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]
=%i, "
"
n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz
=%i,"
" nb_threads=%i
\n
"
,
threads
.
x
,
threads
.
y
,
threads
.
z
,
grid
.
x
,
grid
.
y
,
sh
ared_size
,
threads
.
x
*
threads
.
y
*
threads
.
z
);
threads
_per_block
[
0
],
threads_per_block
[
1
],
threads_per_block
[
2
],
n_blocks
[
0
],
n_blocks
[
1
]
,
sh
mem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
*
threads_per_block
[
2
]
);
if
(
verbose
)
fprintf
(
stderr
,
"INFO: used 'conv_patch_stack_reduce' version"
...
...
@@ -700,17 +699,17 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if
(
verbose
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, threads.z
=%i,"
"
grid.x=%i, grid.y=%i,shared_size
=%i,"
"threads
_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]
=%i,"
"
n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz
=%i,"
" nb_threads=%i
\n
"
,
threads
.
x
,
threads
.
y
,
threads
.
z
,
grid
.
x
,
grid
.
y
,
shared_size
,
threads
.
x
*
threads
.
y
*
threads
.
z
);
threads
_per_block
[
0
],
threads_per_block
[
1
],
threads_per_block
[
2
]
,
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads
_per_block
[
0
]
*
threads_per_block
[
1
]
*
threads_per_block
[
2
]
);
if
(
verbose
)
fprintf
(
stderr
,
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation
\n
"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
k
,
err
));
}
}
// else no good nb_splits was found
}
...
...
@@ -719,8 +718,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
kern_len
<=
320
&&
!
work_complete
)
//conv_valid_row_reduce
{
int
outsize
=
PyGpuArray_SIZE
(
out
);
int
n_blocks
=
std
::
min
(
outsize
,
4096
);
size_t
outsize
=
PyGpuArray_SIZE
(
out
);
size_t
n_blocks
[
3
]
=
{
std
::
min
(
outsize
,
(
size_t
)
4096
),
(
size_t
)
1
,
(
size_t
)
1
};
int
block_nstack
=
nstack
;
//Max of 512 threads per blocks.
...
...
@@ -728,9 +728,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register...
while
(
block_nstack
*
kern_len
>
320
)
block_nstack
--
;
dim3
n_threads
(
block_nstack
,
kern_len
,
1
)
;
size_t
threads_per_block
[
3
]
=
{(
size_t
)
block_nstack
,
(
size_t
)
kern_len
,
(
size_t
)
1
}
;
in
t
n_reduce_buf
=
block_nstack
*
kern_len
*
sizeof
(
float
);
size_
t
n_reduce_buf
=
block_nstack
*
kern_len
*
sizeof
(
float
);
/* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float)
*
* if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0.
...
...
@@ -747,39 +747,34 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
assert
(
initial_reduce_boundary
<
n_reduce_buf
/
sizeof
(
float
));
}
void
(
*
f
)(
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
const
float
*
,
int
,
int
,
int
,
int
,
const
float
*
,
int
,
int
,
int
,
int
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
);
GpuKernel
*
k
=
NULL
;
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if
(
block_nstack
==
nstack
)
f
=
conv_valid_row_reduce
_0
;
k
=&
conv_valid_row_reduce_0_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
f
=
conv_valid_row_reduce_1
;
f
<<<
n_blocks
,
n_threads
,
n_reduce_buf
>>>
(
nbatch
,
nkern
,
PyGpuArray_DIMS
(
img
)[
1
],
img_len
,
img_wid
,
kern_len
,
kern_wid
,
out_len
,
out_wid
,
cuda_get_ptr
(
img
),
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
img_stride_row
,
img_stride_col
,
cuda_get_ptr
(
kern
),
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
,
cuda_get_ptr
(
out
),
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
,
subsample_rows
,
subsample_cols
,
initial_reduce_boundary
);
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
k
=&
conv_valid_row_reduce_1_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
void
*
kernel_params
[]
=
{
(
void
*
)
&
nbatch
,
(
void
*
)
&
nkern
,
(
void
*
)
&
stack_len
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
out_len
,
(
void
*
)
&
out_wid
,
(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
&
img_stride_batch
,
(
void
*
)
&
img_stride_stack
,
(
void
*
)
&
img_stride_row
,
(
void
*
)
&
img_stride_col
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
&
kern_stride_nkern
,
(
void
*
)
&
kern_stride_stack
,
(
void
*
)
&
kern_stride_row
,
(
void
*
)
&
kern_stride_col
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
out_stride_batch
,
(
void
*
)
&
out_stride_nkern
,
(
void
*
)
&
out_stride_row
,
(
void
*
)
&
out_stride_col
,
(
void
*
)
&
subsample_rows
,
(
void
*
)
&
subsample_cols
,
(
void
*
)
&
initial_reduce_boundary
};
int
err
=
GpuKernel_call
(
k
,
3
,
threads_per_block
,
n_blocks
,
n_reduce_buf
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
work_complete
=
true
;
if
(
verbose
)
...
...
@@ -789,24 +784,27 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if
(
verbose
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, grid.x
=%i,"
" sh
ared_size
=%i, nb_threads=%i
\n
"
,
n_threads
.
x
,
n_threads
.
y
,
n_blocks
,
n_reduce_buf
,
n_threads
.
x
*
n_threads
.
y
);
"threads
_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]
=%i,"
" sh
mem_sz
=%i, nb_threads=%i
\n
"
,
threads_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
]
,
n_reduce_buf
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
);
if
(
verbose
)
fprintf
(
stderr
,
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation
\n
"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
k
,
err
));
}
}
if
(
1
&&
!
work_complete
)
//conv_reference_valid
{
int
outsize
=
PyGpuArray_SIZE
(
out
);
int
n_blocks
=
std
::
min
(
outsize
,
4096
);
int
n_threads
=
std
::
min
(
ceil_intdiv
(
outsize
,
n_blocks
),
256
);
size_t
outsize
=
PyGpuArray_SIZE
(
out
);
size_t
n_blocks
[
3
]
=
{
std
::
min
(
outsize
,
(
size_t
)
4096
),
(
size_t
)
1
,
(
size_t
)
1
};
size_t
threads_per_block
[
3
]
=
{
std
::
min
(
ceil_intdiv
(
outsize
,
n_blocks
[
0
]),
(
size_t
)
256
),
(
size_t
)
1
,
(
size_t
)
1
};
if
(
1
)
{
if
(
verbose
)
...
...
@@ -814,61 +812,56 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if
(
verbose
>
1
)
fprintf
(
stderr
,
" img : %i %llu %i %i %p "
"%lld %lld %lld %lld
\n
"
,
nbatch
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
img
)[
1
],
img_len
,
img_wid
,
cuda_get_ptr
(
img
),
(
long
long
)
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
);
nbatch
,
(
unsigned
long
long
)
stack_len
,
img_len
,
img_wid
,
(
void
*
)(
cuda_get_ptr
(
img
->
ga
.
data
)
+
img
->
ga
.
offset
),
(
long
long
)
img_stride_batch
,
(
long
long
)
img_stride_stack
,
(
long
long
)
img_stride_row
,
(
long
long
)
img_stride_col
);
if
(
verbose
>
1
)
fprintf
(
stderr
,
" kern: %i %i %i %i %p "
"%lld %lld %lld %lld
\n
"
,
nkern
,
nstack
,
kern_len
,
kern_wid
,
cuda_get_ptr
(
kern
),
(
long
long
)
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
);
(
void
*
)(
cuda_get_ptr
(
kern
->
ga
.
data
)
+
kern
->
ga
.
offset
),
(
long
long
)
kern_stride_nkern
,
(
long
long
)
kern_stride_stack
,
(
long
long
)
kern_stride_row
,
(
long
long
)
kern_stride_col
);
if
(
verbose
>
1
)
fprintf
(
stderr
,
" out : %llu %llu %i %i %p "
"%lld %lld %lld %lld
\n
"
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
0
],
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
1
],
out_len
,
out_wid
,
cuda_get_ptr
(
ou
t
),
(
long
long
)
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
);
(
void
*
)(
cuda_get_ptr
(
out
->
ga
.
data
)
+
out
->
ga
.
offse
t
),
(
long
long
)
out_stride_batch
,
(
long
long
)
out_stride_nkern
,
(
long
long
)
out_stride_row
,
(
long
long
)
out_stride_col
);
if
(
verbose
>
1
)
fprintf
(
stderr
,
" launch params: %i %i %i
\n
"
,
outsize
,
n_blocks
,
n_threads
);
outsize
,
n_blocks
[
0
],
threads_per_block
[
0
]
);
}
conv_reference_valid
<<<
n_blocks
,
n_threads
>>>
(
nbatch
,
nkern
,
PyGpuArray_DIMS
(
img
)[
1
],
img_len
,
img_wid
,
kern_len
,
kern_wid
,
out_len
,
out_wid
,
cuda_get_ptr
(
img
),
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
,
cuda_get_ptr
(
kern
),
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
,
cuda_get_ptr
(
out
),
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
,
subsample_rows
,
subsample_cols
);
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
void
*
kernel_params
[]
=
{
(
void
*
)
&
nbatch
,
(
void
*
)
&
nkern
,
(
void
*
)
&
stack_len
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
out_len
,
(
void
*
)
&
out_wid
,
(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
&
img_stride_batch
,
(
void
*
)
&
img_stride_stack
,
(
void
*
)
&
img_stride_row
,
(
void
*
)
&
img_stride_col
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
&
kern_stride_nkern
,
(
void
*
)
&
kern_stride_stack
,
(
void
*
)
&
kern_stride_row
,
(
void
*
)
&
kern_stride_col
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
out_stride_batch
,
(
void
*
)
&
out_stride_nkern
,
(
void
*
)
&
out_stride_row
,
(
void
*
)
&
out_stride_col
,
(
void
*
)
&
subsample_rows
,
(
void
*
)
&
subsample_cols
};
int
err
=
GpuKernel_call
(
&
conv_reference_valid_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
,
3
,
threads_per_block
,
n_blocks
,
0
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
work_complete
=
true
;
if
(
verbose
)
...
...
@@ -881,7 +874,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
PyErr_Format
(
PyExc_RuntimeError
,
"ERROR: all implementations failed for"
" PyGpuArray_conv_valid! (%s)"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
&
conv_reference_valid_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
,
err
));
return
-
1
;
}
}
...
...
@@ -930,6 +923,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
assert
(
PyGpuArray_DIMS
(
out
)[
1
]
==
PyGpuArray_DIMS
(
kern
)[
0
]);
assert
(
PyGpuArray_DIMS
(
img
)[
1
]
==
PyGpuArray_DIMS
(
kern
)[
1
]);
const
int
stack_len
=
PyGpuArray_DIMS
(
img
)[
1
];
const
int
nstack
=
PyGpuArray_DIMS
(
kern
)[
1
];
const
int
nbatch
=
PyGpuArray_DIMS
(
img
)[
0
];
const
int
nkern
=
PyGpuArray_DIMS
(
kern
)[
0
];
...
...
@@ -948,6 +942,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
const
int
kern_stride_row
=
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
;
const
int
kern_stride_stack
=
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
;
const
int
kern_stride_nkern
=
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
;
const
int
out_stride_col
=
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
;
const
int
out_stride_row
=
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
;
const
int
out_stride_nkern
=
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
;
const
int
out_stride_batch
=
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
;
const
int
img_size
=
img_len
*
img_wid
;
const
int
kern_size
=
kern_len
*
kern_wid
;
...
...
@@ -990,16 +988,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
//we don't need to unflip it, but have the new value when we unflip it.
bool
kern_flipped
=
true
;
bool
kern_contiguous_2d_unflipped
=
kern_contiguous_2d
;
const
float
*
kern_data_unflipped
=
cuda_get_ptr
(
kern
);
int
kern_stride_col_unflipped
=
kern_stride_col
;
int
kern_stride_row_unflipped
=
kern_stride_row
;
if
(
kern_stride_col_unflipped
==-
1
&&
kern_stride_row_unflipped
==-
kern_wid
){
if
(
kern_stride_col
==-
1
&&
kern_stride_row
==-
kern_wid
){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped
=
1
;
kern_stride_row_unflipped
=
kern_wid
;
kern_flipped
=
false
;
kern_contiguous_2d_unflipped
=
true
;
kern_data_unflipped
=&
(
cuda_get_ptr
(
kern
)[(
kern_wid
-
1
)
*
kern_stride_col
+
(
kern_len
-
1
)
*
kern_stride_row
]);
}
if
(
verbose
>
1
)
...
...
@@ -1008,34 +1000,34 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
" MACRO kern_width=%d with inputs:
\n
"
,
version
,
THEANO_KERN_WID
);
printf
(
"INFO: img dim: %llu %llu %llu %llu "
"img stride: %lld %lld %lld %lld
\n
"
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
img
)[
0
]
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
img
)[
1
]
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
img
)[
2
]
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
img
)[
3
]
,
(
long
long
)
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
);
(
unsigned
long
long
)
nbatch
,
(
unsigned
long
long
)
stack_len
,
(
unsigned
long
long
)
img_len
,
(
unsigned
long
long
)
img_wid
,
(
long
long
)
img_stride_batch
,
(
long
long
)
img_stride_stack
,
(
long
long
)
img_stride_row
,
(
long
long
)
img_stride_col
);
printf
(
"INFO: kern dim: %llu %llu %llu %llu "
"kern stride: %lld %lld %lld %lld
\n
"
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
kern
)[
0
]
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
kern
)[
1
]
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
kern
)[
2
]
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
kern
)[
3
]
,
(
long
long
)
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
);
(
unsigned
long
long
)
nkern
,
(
unsigned
long
long
)
nstack
,
(
unsigned
long
long
)
kern_len
,
(
unsigned
long
long
)
kern_wid
,
(
long
long
)
kern_stride_nkern
,
(
long
long
)
kern_stride_stack
,
(
long
long
)
kern_stride_row
,
(
long
long
)
kern_stride_col
);
printf
(
"INFO: out dim: %llu %llu %llu %llu "
"out stride: %lld %lld %lld %lld
\n
"
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
0
],
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
1
],
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
2
]
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
3
]
,
(
long
long
)
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
);
(
unsigned
long
long
)
out_len
,
(
unsigned
long
long
)
out_wid
,
(
long
long
)
out_stride_batch
,
(
long
long
)
out_stride_nkern
,
(
long
long
)
out_stride_row
,
(
long
long
)
out_stride_col
);
}
if
(
!
subsample
&&
...
...
@@ -1082,50 +1074,53 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
assert
(
version
!=
5
||
kern_len
>
1
);
assert
(
version
!=-
1
);
dim3
threads
(
out_wid
,
ceil_intdiv
(
out_len
,
nb_split
));
dim3
grid
(
nbatch
,
nkern
);
size_t
threads_per_block
[
3
]
=
{(
size_t
)
out_wid
,
ceil_intdiv
((
size_t
)
out_len
,(
size_t
)
nb_split
),
(
size_t
)
1
};
size_t
n_blocks
[
3
]
=
{(
size_t
)
nbatch
,
(
size_t
)
nkern
,
(
size_t
)
1
};
int
shared_size
=
img_size_padded_byte
+
kern_size_byte
;
size_t
shmem_sz
=
img_size_padded_byte
+
kern_size_byte
;
if
(
version
==
5
)
shared_size
=
((
kern_len
+
threads
.
y
-
1
)
+
2
*
kern_len
-
2
)
*
img_wid_padded
*
sizeof
(
float
)
+
kern_size_byte
;
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
);
if
(
version
==
3
)
f
=
conv_full_patch_stack_padded_0
;
else
if
(
version
==
5
)
f
=
conv_full_patch_stack_padded_1
;
else
if
(
version
==
4
)
f
=
conv_full_patch_stack_padded_2
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
&&
version
==
3
)
f
=
conv_full_patch_stack_padded_4
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
&&
version
==
5
)
f
=
conv_full_patch_stack_padded_5
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
&&
version
==
4
)
f
=
conv_full_patch_stack_padded_6
;
else
if
(
version
==
3
&&
kern_flipped
)
f
=
conv_full_patch_stack_padded_8
;
else
if
(
version
==
5
&&
kern_flipped
)
f
=
conv_full_patch_stack_padded_9
;
else
if
(
version
==
4
&&
kern_flipped
)
f
=
conv_full_patch_stack_padded_10
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
&&
version
==
3
&&
kern_flipped
)
f
=
conv_full_patch_stack_padded_12
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
&&
version
==
5
&&
kern_flipped
)
f
=
conv_full_patch_stack_padded_13
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
&&
version
==
4
&&
kern_flipped
)
f
=
conv_full_patch_stack_padded_14
;
shmem_sz
=
((
kern_len
+
threads_per_block
[
1
]
-
1
)
+
2
*
kern_len
-
2
)
*
img_wid_padded
*
sizeof
(
float
)
+
kern_size_byte
;
GpuKernel
*
k
=
NULL
;
if
(
version
==
3
)
k
=&
conv_full_patch_stack_padded_0_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
version
==
5
)
k
=&
conv_full_patch_stack_padded_1_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
version
==
4
)
k
=&
conv_full_patch_stack_padded_2_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
&&
version
==
3
)
k
=&
conv_full_patch_stack_padded_4_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
&&
version
==
5
)
k
=&
conv_full_patch_stack_padded_5_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
&&
version
==
4
)
k
=&
conv_full_patch_stack_padded_6_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
version
==
3
&&
kern_flipped
)
k
=&
conv_full_patch_stack_padded_8_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
version
==
5
&&
kern_flipped
)
k
=&
conv_full_patch_stack_padded_9_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
version
==
4
&&
kern_flipped
)
k
=&
conv_full_patch_stack_padded_10_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
&&
version
==
3
&&
kern_flipped
)
k
=&
conv_full_patch_stack_padded_12_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
&&
version
==
5
&&
kern_flipped
)
k
=&
conv_full_patch_stack_padded_13_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d_unflipped
&&
version
==
4
&&
kern_flipped
)
k
=&
conv_full_patch_stack_padded_14_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
assert
(
false
);
f
<<<
grid
,
threads
,
shared_size
>>>
(
cuda_get_ptr
(
img
),
kern_data_unflipped
,
cuda_get_ptr
(
out
),
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
img_stride_stack
,
img_stride_batch
,
kern_stride_col_unflipped
,
kern_stride_row_unflipped
,
kern_stride_stack
,
kern_stride_nkern
);
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
void
*
kernel_params
[]
=
{
(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
nkern
,
(
void
*
)
&
nstack
,
(
void
*
)
&
img_stride_col
,
(
void
*
)
&
img_stride_row
,
(
void
*
)
&
img_stride_stack
,
(
void
*
)
&
img_stride_batch
,
(
void
*
)
&
kern_stride_col
,
(
void
*
)
&
kern_stride_row
,
(
void
*
)
&
kern_stride_stack
,
(
void
*
)
&
kern_stride_nkern
};
int
err
=
GpuKernel_call
(
k
,
3
,
threads_per_block
,
n_blocks
,
shmem_sz
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
if
(
verbose
>
1
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, threads.z
=%i,"
"
grid.x=%i, grid.y=%i, shared_size
=%i, nb_threads=%i,"
"threads
_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]
=%i,"
"
n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz
=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i
\n
"
,
threads
.
x
,
threads
.
y
,
threads
.
z
,
grid
.
x
,
grid
.
y
,
shared_size
,
threads
.
x
*
threads
.
y
*
threads
.
z
,
threads
_per_block
[
0
],
threads_per_block
[
1
],
threads_per_block
[
2
]
,
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads
_per_block
[
0
]
*
threads_per_block
[
1
]
*
threads_per_block
[
2
]
,
out_len
,
nb_split
,
version
);
if
(
verbose
)
fprintf
(
stderr
,
...
...
@@ -1138,12 +1133,12 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if
(
verbose
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, threads.z
=%i,"
"
grid.x=%i, grid.y=%i,shared_size
=%i, nb_threads=%i,"
"threads
_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]
=%i,"
"
n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz
=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i
\n
"
,
threads
.
x
,
threads
.
y
,
threads
.
z
,
grid
.
x
,
grid
.
y
,
shared_size
,
threads
.
x
*
threads
.
y
*
threads
.
z
,
threads
_per_block
[
0
],
threads_per_block
[
1
],
threads_per_block
[
2
]
,
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads
_per_block
[
0
]
*
threads_per_block
[
1
]
*
threads_per_block
[
2
]
,
out_len
,
nb_split
,
version
);
if
(
verbose
)
fprintf
(
stderr
,
...
...
@@ -1151,7 +1146,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
" failed (%s), trying next implementation
\n
"
,
version
==
3
?
"no split"
:
"split"
,
(
version
==
5
?
"low_mem"
:
"not_low_mem"
),
cudaGetErrorString
(
sts
));
GpuKernel_error
(
k
,
err
));
}
}
...
...
@@ -1162,21 +1157,22 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
img_size_byte
+
kern_size_byte
<
shared_avail
&&
//their is only 16k of shared memory
!
work_complete
)
//conv_full_patch
{
dim3
threads
(
out_wid
,
out_len
)
;
dim3
grid
(
nbatch
,
nkern
)
;
int
shared_size
=
(
img_size
+
kern_size
)
*
sizeof
(
float
);
size_t
threads_per_block
[
3
]
=
{(
size_t
)
out_wid
,
(
size_t
)
out_len
,
(
size_t
)
1
}
;
size_t
n_blocks
[
3
]
=
{(
size_t
)
nbatch
,
(
size_t
)
nkern
,
(
size_t
)
1
}
;
size_t
shmem_sz
=
(
img_size
+
kern_size
)
*
sizeof
(
float
);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
conv_full_patch
<<<
grid
,
threads
,
shared_size
>>>
(
cuda_get_ptr
(
img
),
cuda_get_ptr
(
kern
),
cuda_get_ptr
(
out
),
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
);
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
void
*
kernel_params
[]
=
{
(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
nkern
,
(
void
*
)
&
nstack
};
int
err
=
GpuKernel_call
(
&
conv_full_patch_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
,
3
,
threads_per_block
,
n_blocks
,
shmem_sz
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
if
(
verbose
)
fprintf
(
stderr
,
"INFO: used 'conv_full_patch' version
\n
"
);
work_complete
=
true
;
...
...
@@ -1185,15 +1181,15 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if
(
verbose
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, grid.x=%i, grid.y
=%i,"
" sh
ared_size
=%i, nb_threads=%i
\n
"
,
threads
.
x
,
threads
.
y
,
grid
.
x
,
grid
.
y
,
shared_size
,
threads
.
x
*
threads
.
y
);
"threads
_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]
=%i,"
" sh
mem_sz
=%i, nb_threads=%i
\n
"
,
threads
_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads
_per_block
[
0
]
*
threads_per_block
[
1
]
);
if
(
verbose
)
fprintf
(
stderr
,
"INFO: impl 'conv_full_patch' failed (%s),"
" trying next implementation
\n
"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
&
conv_full_patch_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
,
err
));
}
}
if
(
false
&&
!
subsample
&&
//disabled as test fail for this kernel
...
...
@@ -1203,35 +1199,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
nstack
*
img_size_byte
+
nstack
*
kern_size_byte
<
shared_avail
&&
//there is only 16k of shared memory
!
work_complete
)
//conv_full_load_everything
{
dim3
threads
(
out_wid
,
out_len
)
;
dim3
grid
(
nbatch
)
;
int
shared_size
=
(
img_size
+
kern_size
)
*
nstack
*
sizeof
(
float
);
size_t
threads_per_block
[
3
]
=
{(
size_t
)
out_wid
,
(
size_t
)
out_len
,
(
size_t
)
1
}
;
size_t
n_blocks
[
3
]
=
{(
size_t
)
nbatch
,
(
size_t
)
1
,
(
size_t
)
1
}
;
size_t
shmem_sz
=
(
img_size
+
kern_size
)
*
nstack
*
sizeof
(
float
);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//typeof(conv_full_load_everything<0>) f = ;
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
)
=
conv_full_load_everything
;
f
<<<
grid
,
threads
,
shared_size
>>>
(
cuda_get_ptr
(
img
),
cuda_get_ptr
(
kern
),
cuda_get_ptr
(
out
),
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
);
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
void
*
kernel_params
[]
=
{
(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
nkern
,
(
void
*
)
&
nstack
,
(
void
*
)
&
img_stride_col
,
(
void
*
)
&
img_stride_row
,
(
void
*
)
&
img_stride_stack
,
(
void
*
)
&
img_stride_batch
,
(
void
*
)
&
kern_stride_col
,
(
void
*
)
&
kern_stride_row
,
(
void
*
)
&
kern_stride_stack
,
(
void
*
)
&
kern_stride_nkern
};
int
err
=
GpuKernel_call
(
&
conv_full_load_everything_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
,
3
,
threads_per_block
,
n_blocks
,
shmem_sz
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
if
(
verbose
)
fprintf
(
stderr
,
"INFO: used 'conv_full_load_everything' version
\n
"
);
work_complete
=
true
;
...
...
@@ -1240,14 +1227,14 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if
(
verbose
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, grid.x=%i, grid.y
=%i,"
" sh
ared_size
=%i, nb_threads=%i
\n
"
,
threads
.
x
,
threads
.
y
,
grid
.
x
,
grid
.
y
,
shared_size
,
threads
.
x
*
threads
.
y
);
"threads
_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]
=%i,"
" sh
mem_sz
=%i, nb_threads=%i
\n
"
,
threads
_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
],
shmem_sz
,
threads
_per_block
[
0
]
*
threads_per_block
[
1
]
);
if
(
verbose
)
fprintf
(
stderr
,
"INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation
\n
"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
&
conv_full_load_everything_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
,
err
));
}
}
...
...
@@ -1259,32 +1246,29 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
img_size_byte
+
kern_size_byte
<
shared_avail
&&
//their is only 16k of shared memory
!
work_complete
)
//conv_full_patch_stack
{
dim3
threads
(
out_wid
,
out_len
);
dim3
grid
(
nbatch
,
nkern
);
int
shared_size
=
(
img_size
+
kern_size
)
*
sizeof
(
float
);
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
);
if
(
!
img_contiguous_2d
&&
!
kern_contiguous_2d
)
f
=
conv_full_patch_stack_0
;
else
if
(
!
img_contiguous_2d
&&
kern_contiguous_2d
)
f
=
conv_full_patch_stack_1
;
else
if
(
img_contiguous_2d
&&
!
kern_contiguous_2d
)
f
=
conv_full_patch_stack_2
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d
)
f
=
conv_full_patch_stack_3
;
f
<<<
grid
,
threads
,
shared_size
>>>
(
cuda_get_ptr
(
img
),
cuda_get_ptr
(
kern
),
cuda_get_ptr
(
out
),
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
kern_stride_col
,
kern_stride_row
,
kern_stride_stack
,
kern_stride_nkern
);
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
size_t
threads_per_block
[
3
]
=
{(
size_t
)
out_wid
,
(
size_t
)
out_len
,
(
size_t
)
1
};
size_t
n_blocks
[
3
]
=
{(
size_t
)
nbatch
,
(
size_t
)
nkern
,
(
size_t
)
1
};
size_t
shmem_sz
=
(
img_size
+
kern_size
)
*
sizeof
(
float
);
GpuKernel
*
k
=
NULL
;
if
(
!
img_contiguous_2d
&&
!
kern_contiguous_2d
)
k
=&
conv_full_patch_stack_0_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
!
img_contiguous_2d
&&
kern_contiguous_2d
)
k
=&
conv_full_patch_stack_1_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
img_contiguous_2d
&&
!
kern_contiguous_2d
)
k
=&
conv_full_patch_stack_2_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
else
if
(
img_contiguous_2d
&&
kern_contiguous_2d
)
k
=&
conv_full_patch_stack_3_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
;
void
*
kernel_params
[]
=
{
(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
nkern
,
(
void
*
)
&
nstack
,
(
void
*
)
&
img_stride_col
,
(
void
*
)
&
img_stride_row
,
(
void
*
)
&
kern_stride_col
,
(
void
*
)
&
kern_stride_row
,
(
void
*
)
&
kern_stride_stack
,
(
void
*
)
&
kern_stride_nkern
};
int
err
=
GpuKernel_call
(
k
,
3
,
threads_per_block
,
n_blocks
,
shmem_sz
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
if
(
verbose
)
fprintf
(
stderr
,
"INFO: used 'conv_full_patch_stack' version
\n
"
);
...
...
@@ -1294,23 +1278,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if
(
verbose
)
fprintf
(
stderr
,
"threads
.x=%i, threads.y=%i, grid.x=%i, grid.y
=%i,"
" sh
ared_size
=%i, nb_threads=%i
\n
"
,
threads
.
x
,
threads
.
y
,
grid
.
x
,
grid
.
y
,
sh
ared_size
,
threads
.
x
*
threads
.
y
);
"threads
_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]
=%i,"
" sh
mem_sz
=%i, nb_threads=%i
\n
"
,
threads
_per_block
[
0
],
threads_per_block
[
1
],
n_blocks
[
0
],
n_blocks
[
1
]
,
sh
mem_sz
,
threads_per_block
[
0
]
*
threads_per_block
[
1
]
);
if
(
verbose
)
fprintf
(
stderr
,
"INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation
\n
"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
k
,
err
));
}
}
if
(
1
&&
!
work_complete
)
//conv_reference_full
{
if
(
verbose
>
1
)
fprintf
(
stderr
,
"INFO: will start conv_reference_full
\n
"
);
int
outsize
=
PyGpuArray_SIZE
(
out
);
int
n_blocks
=
std
::
min
(
outsize
,
4096
);
int
n_threads
=
std
::
min
(
ceil_intdiv
(
outsize
,
n_blocks
),
256
);
size_t
outsize
=
PyGpuArray_SIZE
(
out
);
size_t
n_blocks
[
3
]
=
{
std
::
min
(
outsize
,
(
size_t
)
4096
),
(
size_t
)
1
,
(
size_t
)
1
};
size_t
threads_per_block
[
3
]
=
{
std
::
min
(
ceil_intdiv
(
outsize
,
n_blocks
[
0
]),
(
size_t
)
256
),
(
size_t
)
1
,
(
size_t
)
1
};
if
(
0
)
{
if
(
verbose
)
...
...
@@ -1318,70 +1305,67 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
if
(
verbose
)
fprintf
(
stderr
,
" img : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld
\n
"
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
img
)[
0
]
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
img
)[
1
]
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
img
)[
2
]
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
img
)[
3
]
,
cuda_get_ptr
(
img
),
(
long
long
)
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
);
(
unsigned
long
long
)
nbatch
,
(
unsigned
long
long
)
stack_len
,
(
unsigned
long
long
)
img_len
,
(
unsigned
long
long
)
img_wid
,
(
void
*
)(
cuda_get_ptr
(
img
->
ga
.
data
)
+
img
->
ga
.
offset
),
(
long
long
)
img_stride_batch
,
(
long
long
)
img_stride_stack
,
(
long
long
)
img_stride_row
,
(
long
long
)
img_stride_col
);
if
(
verbose
)
fprintf
(
stderr
,
" kern: %llu %llu %llu %llu %p "
"%lld %lld %lld %lld
\n
"
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
kern
)[
0
],
(
unsigned
long
long
)
PyGpuArray_DIMS
(
kern
)[
1
],
(
unsigned
long
long
)
PyGpuArray_DIMS
(
kern
)[
2
],
(
unsigned
long
long
)
PyGpuArray_DIMS
(
kern
)[
3
],
cuda_get_ptr
(
kern
),
(
long
long
)
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
);
(
unsigned
long
long
)
nkern
,
(
unsigned
long
long
)
nstack
,
(
unsigned
long
long
)
kern_len
,
(
unsigned
long
long
)
kern_wid
,
(
void
*
)(
cuda_get_ptr
(
kern
->
ga
.
data
)
+
kern
->
ga
.
offset
),
(
long
long
)
kern_stride_nkern
,
(
long
long
)
kern_stride_stack
,
(
long
long
)
kern_stride_row
,
(
long
long
)
kern_stride_col
);
if
(
verbose
)
fprintf
(
stderr
,
" out : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld
\n
"
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
0
],
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
1
],
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
2
]
,
(
unsigned
long
long
)
PyGpuArray_DIMS
(
out
)[
3
]
,
cuda_get_ptr
(
ou
t
),
(
long
long
)
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
(
long
long
)
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
);
(
unsigned
long
long
)
out_len
,
(
unsigned
long
long
)
out_wid
,
(
void
*
)(
cuda_get_ptr
(
out
->
ga
.
data
)
+
out
->
ga
.
offse
t
),
(
long
long
)
out_stride_batch
,
(
long
long
)
out_stride_nkern
,
(
long
long
)
out_stride_row
,
(
long
long
)
out_stride_col
);
if
(
verbose
)
fprintf
(
stderr
,
" launch params: %i %i %i
\n
"
,
outsize
,
n_blocks
,
n_threads
);
outsize
,
n_blocks
[
0
],
threads_per_block
[
0
]
);
if
(
verbose
)
fprintf
(
stderr
,
" subsample params: %llu %llu
\n
"
,
(
unsigned
long
long
)
subsample_rows
,
(
unsigned
long
long
)
subsample_cols
);
}
conv_reference_full
<<<
n_blocks
,
n_threads
>>>
(
PyGpuArray_DIMS
(
img
)[
0
],
PyGpuArray_DIMS
(
kern
)[
0
],
PyGpuArray_DIMS
(
img
)[
1
],
PyGpuArray_DIMS
(
img
)[
2
],
PyGpuArray_DIMS
(
img
)[
3
],
PyGpuArray_DIMS
(
kern
)[
2
],
PyGpuArray_DIMS
(
kern
)[
3
],
PyGpuArray_DIMS
(
out
)[
2
],
PyGpuArray_DIMS
(
out
)[
3
],
cuda_get_ptr
(
img
),
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
,
cuda_get_ptr
(
kern
),
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
,
cuda_get_ptr
(
out
),
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
,
subsample_rows
,
subsample_cols
);
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
void
*
kernel_params
[]
=
{
(
void
*
)
&
nbatch
,
(
void
*
)
&
nkern
,
(
void
*
)
&
stack_len
,
(
void
*
)
&
img_len
,
(
void
*
)
&
img_wid
,
(
void
*
)
&
kern_len
,
(
void
*
)
&
kern_wid
,
(
void
*
)
&
out_len
,
(
void
*
)
&
out_wid
,
(
void
*
)
img
->
ga
.
data
,
(
void
*
)
&
img
->
ga
.
offset
,
(
void
*
)
&
img_stride_batch
,
(
void
*
)
&
img_stride_stack
,
(
void
*
)
&
img_stride_row
,
(
void
*
)
&
img_stride_col
,
(
void
*
)
kern
->
ga
.
data
,
(
void
*
)
&
kern
->
ga
.
offset
,
(
void
*
)
&
kern_stride_nkern
,
(
void
*
)
&
kern_stride_stack
,
(
void
*
)
&
kern_stride_row
,
(
void
*
)
&
kern_stride_col
,
(
void
*
)
out
->
ga
.
data
,
(
void
*
)
&
out
->
ga
.
offset
,
(
void
*
)
&
out_stride_batch
,
(
void
*
)
&
out_stride_nkern
,
(
void
*
)
&
out_stride_row
,
(
void
*
)
&
out_stride_col
,
(
void
*
)
&
subsample_rows
,
(
void
*
)
&
subsample_cols
};
int
err
=
GpuKernel_call
(
&
conv_reference_full_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
,
3
,
threads_per_block
,
n_blocks
,
0
,
kernel_params
);
if
(
err
==
GA_NO_ERROR
)
{
if
(
verbose
)
fprintf
(
stderr
,
"INFO: used 'conv_reference_full' version"
...
...
@@ -1394,17 +1378,18 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
else
{
if
(
verbose
)
fprintf
(
stderr
,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i
\n
"
,
n_threads
,
1
,
n_blocks
,
1
,
0
,
n_threads
);
fprintf
(
stderr
,
"threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i
\n
"
,
threads_per_block
[
0
],
1
,
n_blocks
[
0
],
1
,
0
,
threads_per_block
[
0
]);
if
(
verbose
)
fprintf
(
stderr
,
"INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation
\n
"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
&
conv_reference_full_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
,
err
));
PyErr_Format
(
PyExc_RuntimeError
,
"ERROR: all implementations failed for"
" CudaNdarray_conv_full! (%s)"
,
cudaGetErrorString
(
sts
));
GpuKernel_error
(
&
conv_reference_full_node_
<<<<
HASH_PLACEHOLDER
>>>>
_0
,
err
));
return
-
1
;
}
}
...
...
theano/sandbox/gpuarray/conv.py
浏览文件 @
41daf4a8
...
...
@@ -3,13 +3,20 @@ import os
import
theano
from
theano
import
config
,
gof
try
:
import
pygpu
from
pygpu
import
gpuarray
except
ImportError
:
pass
from
six.moves
import
reduce
from
.comp
import
NVCC_compiler
from
.type
import
GpuArrayType
from
.basic_ops
import
as_gpuarray_variable
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
HideC
,
Kernel
)
from
theano.gof
import
utils
class
GpuConv
(
gof
.
Op
):
class
GpuConv
(
GpuKernelBase
,
HideC
,
gof
.
Op
):
"""
Implement the batched and stacked 2d convolution on the gpu.
...
...
@@ -223,29 +230,30 @@ class GpuConv(gof.Op):
return
[
'-DTHEANO_KERN_WID='
+
str
(
nb
)]
# ,'-g','-G']
def
c_headers
(
self
):
return
[
'<stdio.h>'
,
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
]
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
return
[
'<stdint.h>'
,
'<stdio.h>'
,
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
import
os
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
else
:
return
[]
def
c_code_cache_version
(
self
):
# raise this whenever modifying any of the support_code_files
return
(
0
,
21
)
def
c_init_code
(
self
):
return
[
'cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");'
]
def
c_support_code_apply
(
self
,
node
,
nodename
):
# REMEMBER TO RAISE c_code_cache_version when changing any of
# these files
files
=
[
'conv_kernel.cu'
,
'conv_full_kernel.cu'
,
'conv.cu'
]
codes
=
[
"CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);"
,
"float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) (cuda_get_ptr_raw(o->ga.data) + o->ga.offset);}"
,
"const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) (cuda_get_ptr_raw(o->ga.data) + o->ga.offset);}"
]
codes
+=
[
open
(
os
.
path
.
join
(
os
.
path
.
split
(
__file__
)[
0
],
f
))
.
read
()
for
f
in
files
]
return
reduce
(
str
.
__add__
,
codes
)
def
c_compiler
(
self
):
return
NVCC_compiler
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
return
[
'setup_ext_cuda();'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
img
,
kern
=
inp
...
...
@@ -270,8 +278,8 @@ class GpuConv(gof.Op):
//Optional args
int version =
%(version)
s;
int verbose =
%(verbose)
s;
in
t dx =
%(dx)
s;
in
t dy =
%(dy)
s;
size_
t dx =
%(dx)
s;
size_
t dy =
%(dy)
s;
int mode;
if (strcmp(mode_str, "full") == 0)
...
...
@@ -286,7 +294,7 @@ class GpuConv(gof.Op):
{
PyErr_SetString(PyExc_ValueError,
"mode must be one of 'full' or 'valid'");
return
NULL
;
return
0
;
}
// TODO, make out be decref before we alloc out2!
...
...
@@ -303,3 +311,266 @@ class GpuConv(gof.Op):
%(fail)
s
}
"""
%
sub
def
c_support_code_apply
(
self
,
node
,
name
):
nb
=
0
if
self
.
kshp
is
not
None
:
nb
=
self
.
kshp
[
1
]
kernels
=
self
.
gpu_kernels
(
node
,
name
)
k
=
kernels
[
0
]
code
=
"""
#define THEANO_KERN_WID
%(nb)
d
"""
%
locals
()
code
+=
"
\n
"
.
join
([
open
(
os
.
path
.
join
(
os
.
path
.
split
(
__file__
)[
0
],
f
))
.
read
()
for
f
in
[
"conv_kernel.cu"
,
"conv_full_kernel.cu"
]])
kname
=
"conv_full_load_everything"
gk
=
gpuarray
.
GpuKernel
(
code
,
k
.
name
,
k
.
params
,
**
k
.
flags
)
bin
=
gk
.
_binary
bcode
=
','
.
join
(
hex
(
ord
(
c
))
for
c
in
bin
)
code
=
code
.
replace
(
'
\\
'
,
'
\\\\
'
)
code
=
code
.
replace
(
'"'
,
'
\\
"'
)
code
=
code
.
replace
(
'
\n
'
,
'
\\
n'
)
mod
=
"""
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a
%%
b) ? 1: 0);
}
static const char conv_bcode[] = {
%(bcode)
s};
static const char *conv_code = "
%(code)
s";
"""
%
locals
()
for
k
in
kernels
:
mod
+=
"static GpuKernel "
+
k
.
name
+
'_'
+
name
+
";
\n
"
mod
+=
open
(
os
.
path
.
join
(
os
.
path
.
split
(
__file__
)[
0
],
"conv.cu"
))
.
read
()
return
mod
@utils.memoize
def
gpu_kernels
(
self
,
node
,
name
):
dtypes
=
[
i
.
dtype
for
i
in
node
.
inputs
]
dtypes
.
extend
([
o
.
dtype
for
o
in
node
.
outputs
])
flags
=
Kernel
.
get_flags
(
*
dtypes
)
kernels
=
self
.
conv_patch_kernels
(
name
,
flags
)
kernels
.
extend
(
self
.
conv_patch_stack_kernels
(
name
,
flags
))
kernels
.
extend
(
self
.
conv_patch_stack_reduce_kernels
(
name
,
flags
))
kernels
.
extend
(
self
.
conv_rows_kernels
(
name
,
flags
))
kernels
.
extend
(
self
.
conv_rows_stack_kernels
(
name
,
flags
))
kernels
.
extend
(
self
.
conv_rows_stack2_kernels
(
name
,
flags
))
kernels
.
extend
(
self
.
conv_valid_row_reduce_kernels
(
name
,
flags
))
kernels
.
extend
(
self
.
conv_reference_valid_kernels
(
name
,
flags
))
kernels
.
extend
(
self
.
conv_reference_full_kernels
(
name
,
flags
))
kernels
.
extend
(
self
.
conv_full_patch_kernels
(
name
,
flags
))
kernels
.
extend
(
self
.
conv_full_patch_stack_kernels
(
name
,
flags
))
kernels
.
extend
(
self
.
conv_full_patch_stack_padded_kernels
(
name
,
flags
))
kernels
.
extend
(
self
.
conv_full_load_everything_kernels
(
name
,
flags
))
return
kernels
def
conv_patch_kernels
(
self
,
name
,
flags
):
kname
=
"conv_patch_
%
d"
k_var
=
"conv_patch_
%
d_"
+
name
params
=
[
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
%
i
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
%
i
)
for
i
in
[
2
,
3
]
]
def
conv_patch_stack_kernels
(
self
,
name
,
flags
):
kname
=
"conv_patch_stack_
%
d"
k_var
=
"conv_patch_stack_
%
d_"
+
name
params
=
[
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
%
i
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
%
i
)
for
i
in
range
(
64
,
96
)
]
def
conv_patch_stack_reduce_kernels
(
self
,
name
,
flags
):
kname
=
"conv_patch_stack_reduce_
%
d"
k_var
=
"conv_patch_stack_reduce_
%
d_"
+
name
params
=
[
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
%
i
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
%
i
)
for
i
in
[
1
,
2
,
3
,
5
,
6
,
7
,
9
,
10
,
11
,
13
,
14
,
15
]
]
def
conv_rows_kernels
(
self
,
name
,
flags
):
kname
=
"conv_rows_
%
d"
k_var
=
"conv_rows_
%
d_"
+
name
params
=
[
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
%
i
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
%
i
)
for
i
in
[
0
,
1
]
]
def
conv_rows_stack_kernels
(
self
,
name
,
flags
):
kname
=
"conv_rows_stack_
%
d"
k_var
=
"conv_rows_stack_
%
d_"
+
name
params
=
[
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
%
i
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
%
i
)
for
i
in
[
0
,
1
]
]
def
conv_rows_stack2_kernels
(
self
,
name
,
flags
):
kname
=
"conv_rows_stack2_
%
d"
k_var
=
"conv_rows_stack2_
%
d_"
+
name
params
=
[
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
%
i
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
%
i
)
for
i
in
[
0
,
1
,
2
,
3
]
]
def
conv_valid_row_reduce_kernels
(
self
,
name
,
flags
):
kname
=
"conv_valid_row_reduce_
%
d"
k_var
=
"conv_valid_row_reduce_
%
d_"
+
name
params
=
[
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
%
i
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
%
i
)
for
i
in
[
0
,
1
]
]
def
conv_reference_valid_kernels
(
self
,
name
,
flags
):
kname
=
"conv_reference_valid"
k_var
=
"conv_reference_valid_"
+
name
params
=
[
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
)
]
def
conv_reference_full_kernels
(
self
,
name
,
flags
):
kname
=
"conv_reference_full"
k_var
=
"conv_reference_full_"
+
name
params
=
[
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
)
]
def
conv_full_patch_kernels
(
self
,
name
,
flags
):
kname
=
"conv_full_patch"
k_var
=
"conv_full_patch_"
+
name
params
=
[
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
)
]
def
conv_full_patch_stack_kernels
(
self
,
name
,
flags
):
kname
=
"conv_full_patch_stack_
%
d"
k_var
=
"conv_full_patch_stack_
%
d_"
+
name
params
=
[
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
%
i
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
%
i
)
for
i
in
[
0
,
1
,
2
,
3
]
]
def
conv_full_patch_stack_padded_kernels
(
self
,
name
,
flags
):
kname
=
"conv_full_patch_stack_padded_
%
d"
k_var
=
"conv_full_patch_stack_padded_
%
d_"
+
name
params
=
[
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
%
i
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
%
i
)
for
i
in
[
0
,
1
,
2
,
4
,
5
,
6
,
8
,
9
,
10
,
12
,
13
,
14
]
]
def
conv_full_load_everything_kernels
(
self
,
name
,
flags
):
kname
=
"conv_full_load_everything"
k_var
=
"conv_full_load_everything_"
+
name
params
=
[
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
gpuarray
.
GpuArray
,
'uintp'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
]
return
[
Kernel
(
None
,
params
,
kname
,
flags
,
'conv_code'
,
'conv_bcode'
,
k_var
)
]
theano/sandbox/gpuarray/conv_full_kernel.cu
浏览文件 @
41daf4a8
extern
__shared__
float
s_data
[];
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len/nb_split
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__
void
conv_full_patch_split
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
extern
"C"
__global__
void
conv_full_patch_split
(
const
float
*
img
,
const
size_t
img_offset
,
const
float
*
kern
,
const
size_t
kern_offset
,
float
*
out
,
const
size_t
out_offset
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nb_split
)
{
int
__shared__
out_len
,
out_wid
,
nb_thread_id
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
out_len
=
img_len
+
kern_len
-
1
;
out_wid
=
img_wid
+
kern_wid
-
1
;
nb_thread_id
=
blockDim
.
z
*
blockDim
.
y
*
blockDim
.
x
;
extern
__shared__
float
s_data
[];
int
batch_id
=
blockIdx
.
x
;
// Thread index
...
...
@@ -60,18 +67,23 @@ conv_full_patch_split(const float* img, const float* kern, float* out,
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__
void
conv_full_patch
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
extern
"C"
__global__
void
conv_full_patch
(
const
float
*
img
,
const
size_t
img_offset
,
const
float
*
kern
,
const
size_t
kern_offset
,
float
*
out
,
const
size_t
out_offset
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
)
{
int
__shared__
out_len
,
out_wid
,
nb_thread_id
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
out_len
=
img_len
+
kern_len
-
1
;
out_wid
=
img_wid
+
kern_wid
-
1
;
nb_thread_id
=
blockDim
.
z
*
blockDim
.
y
*
blockDim
.
x
;
extern
__shared__
float
s_data
[];
int
batch_id
=
blockIdx
.
x
;
// Thread index
...
...
@@ -114,6 +126,8 @@ conv_full_patch( const float* img, const float* kern, float* out,
out_row
*
out_wid
+
out_col
]
=
sum
;
}
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len
...
...
@@ -123,7 +137,9 @@ conv_full_patch( const float* img, const float* kern, float* out,
template
<
bool
img_c_contiguous_2d
,
bool
kern_c_contiguous_2d
>
__device__
inline
void
conv_full_patch_stack
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
conv_full_patch_stack
(
const
float
*
img
,
const
size_t
img_offset
,
const
float
*
kern
,
const
size_t
kern_offset
,
float
*
out
,
const
size_t
out_offset
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
...
...
@@ -131,12 +147,15 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
int
kern_stride_stack
,
int
kern_stride_nkern
)
{
int
__shared__
out_len
,
out_wid
,
nb_thread_id
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
out_len
=
img_len
+
kern_len
-
1
;
out_wid
=
img_wid
+
kern_wid
-
1
;
nb_thread_id
=
blockDim
.
y
*
blockDim
.
x
;
//blockDim.z*
const
float
__shared__
*
kern_
,
*
img_
;
extern
__shared__
float
s_data
[];
const
int
batch_id
=
blockIdx
.
x
;
const
int
nkern_id
=
blockIdx
.
y
;
...
...
@@ -186,7 +205,9 @@ extern "C" {
#define __INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
__global__ void \
conv_full_patch_stack_##suffix( \
const float *img, const float *kern, float *out, \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, \
int kern_len, int kern_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
...
...
@@ -194,7 +215,8 @@ conv_full_patch_stack_##suffix( \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_full_patch_stack<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
...
...
@@ -207,6 +229,8 @@ __INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency.
...
...
@@ -227,22 +251,34 @@ __INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)
*/
template
<
bool
flipped_kern
,
bool
c_contiguous
,
bool
split
,
bool
low_mem
>
__device__
inline
void
conv_full_patch_stack_padded
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
conv_full_patch_stack_padded
(
const
float
*
img
,
const
size_t
img_offset
,
const
float
*
kern
,
const
size_t
kern_offset
,
float
*
out
,
const
size_t
out_offset
,
const
int
img_len
,
const
int
img_wid
,
const
int
kern_len
,
const
int
kern_wid
,
const
int
nkern
,
const
int
nstack
,
const
int
img_stride_col
,
const
int
img_stride_row
,
const
int
img_stride_stack
,
const
int
img_stride_batch
,
const
int
kern_stride_col
,
const
int
kern_stride_row
,
int
kern_stride_col
,
int
kern_stride_row
,
const
int
kern_stride_stack
,
const
int
kern_stride_nkern
)
{
int
__shared__
out_len
,
out_wid
,
nb_thread_id
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
if
(
kern_stride_col
==-
1
&&
kern_stride_row
==-
kern_wid
){
//the last two dimensions are c_contiguous but flipped!
kern
=
&
(
kern
[(
kern_wid
-
1
)
*
kern_stride_col
+
(
kern_len
-
1
)
*
kern_stride_row
]);
kern_stride_col
=
1
;
kern_stride_row
=
kern_wid
;
}
out_len
=
img_len
+
kern_len
-
1
;
out_wid
=
img_wid
+
kern_wid
-
1
;
nb_thread_id
=
blockDim
.
z
*
blockDim
.
y
*
blockDim
.
x
;
extern
__shared__
float
s_data
[];
__shared__
int
batch_id
,
kern_id
,
img_wid_valid
,
nb_rows
;
batch_id
=
blockIdx
.
x
;
kern_id
=
blockIdx
.
y
;
...
...
@@ -380,7 +416,9 @@ extern "C" {
#define __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
__global__ void \
conv_full_patch_stack_padded_##suffix( \
const float *img, const float *kern, float *out, \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
...
...
@@ -390,7 +428,8 @@ conv_full_patch_stack_padded_##suffix( \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_full_patch_stack_padded<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
...
...
@@ -412,6 +451,7 @@ __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
}
template
<
int
i
>
__device__
float
everything_dot
(
const
float
*
x
,
const
int
sx
,
const
float
*
y
,
const
int
sy
)
{
return
everything_dot
<
i
/
2
>
(
x
,
sx
,
y
,
sy
)
+
everything_dot
<
(
i
+
1
)
/
2
>
(
x
+
sy
*
(
i
/
2
),
sx
,
y
+
sy
*
(
i
/
2
),
sy
)
;
...
...
@@ -425,8 +465,10 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
{
return
x
[
0
]
*
y
[
0
];
}
__global__
void
conv_full_load_everything
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
extern
"C"
__global__
void
conv_full_load_everything
(
const
float
*
img
,
const
size_t
img_offset
,
const
float
*
kern
,
const
size_t
kern_offset
,
float
*
out
,
const
size_t
out_offset
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
...
...
@@ -435,12 +477,15 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
int
kern_stride_stack
,
int
kern_stride_nkern
)
{
int
__shared__
out_len
,
out_wid
,
nb_thread_id
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
out_len
=
img_len
+
kern_len
-
1
;
out_wid
=
img_wid
+
kern_wid
-
1
;
nb_thread_id
=
blockDim
.
y
*
blockDim
.
x
;
extern
__shared__
float
s_data
[];
int
batch_id
=
blockIdx
.
x
;
const
int
out_col
=
threadIdx
.
x
;
//output col
...
...
@@ -503,6 +548,8 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
__syncthreads
();
//don't start loading another kernel until we're done here
}
}
/*
Local Variables:
mode:c++
...
...
theano/sandbox/gpuarray/conv_kernel.cu
浏览文件 @
41daf4a8
...
...
@@ -29,7 +29,6 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
*/
#ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU
#include <stdint.h>
/*
#define CHECK_BANK_CONFLICTS 0
...
...
@@ -220,11 +219,18 @@ __device__ void store_or_accumulate(float& dst,const float value ){
*/
template
<
bool
flipped_kern
,
bool
split
>
__device__
inline
void
conv_patch
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
conv_patch
(
const
float
*
img
,
const
size_t
img_offset
,
const
float
*
kern
,
const
size_t
kern_offset
,
float
*
out
,
const
size_t
out_offset
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
)
{
int
__shared__
out_len
,
out_wid
,
nb_thread_id
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
out_len
=
img_len
-
kern_len
+
1
;
out_wid
=
img_wid
-
kern_wid
+
1
;
nb_thread_id
=
blockDim
.
z
*
blockDim
.
y
*
blockDim
.
x
;
...
...
@@ -282,11 +288,14 @@ conv_patch( const float* img, const float* kern, float* out,
extern
"C"
{
#define __INSTANTIATE_CONV_PATCH(suffix, ...) \
__global__ void \
conv_patch_##suffix(const float *img, const float *kern, float *out, \
conv_patch_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack) \
{ \
conv_patch<__VA_ARGS__>(img, kern, out, img_len, img_wid, kern_len, \
conv_patch<__VA_ARGS__>(img, img_offset, kern, kern_offset, \
out, out_offset, img_len, img_wid, kern_len, \
kern_wid, nkern, nstack); \
}
...
...
@@ -297,6 +306,7 @@ __INSTANTIATE_CONV_PATCH(3, true, true)
}
/**
* As conv_patch, but implements the stack in the kernel.
* It is kept separate from conv_patch because it uses more registers, which could lower the occupancy.
...
...
@@ -320,7 +330,9 @@ __INSTANTIATE_CONV_PATCH(3, true, true)
*/
template
<
bool
flipped_kern
,
bool
accumulate
,
bool
img_c_contiguous_2d
,
bool
kern_c_contiguous_2d
,
bool
split
,
bool
preload_full_kern
,
bool
subsample
>
__device__
inline
void
conv_patch_stack
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
conv_patch_stack
(
const
float
*
img
,
const
size_t
img_offset
,
const
float
*
kern
,
const
size_t
kern_offset
,
float
*
out
,
const
size_t
out_offset
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
out_len
,
int
out_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
...
...
@@ -329,6 +341,11 @@ conv_patch_stack( const float* img, const float* kern, float* out,
int
kern_stride_stack
,
int
kern_stride_nkern
,
int
dx
,
int
dy
)
{
int
__shared__
nb_thread_id
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
nb_thread_id
=
blockDim
.
z
*
blockDim
.
y
*
blockDim
.
x
;
extern
__shared__
float
s_data
[];
...
...
@@ -459,7 +476,9 @@ conv_patch_stack( const float* img, const float* kern, float* out,
extern
"C"
{
#define __INSTANTIATE_CONV_PATCH_STACK(suffix, ...) \
__global__ void \
conv_patch_stack_##suffix(const float *img, const float *kern, float *out, \
conv_patch_stack_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int out_len, int out_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
...
...
@@ -469,7 +488,8 @@ conv_patch_stack_##suffix(const float *img, const float *kern, float *out, \
int dx, int dy) \
{ \
conv_patch_stack<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, out_len, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, out_len, \
out_wid, nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
...
...
@@ -513,6 +533,7 @@ __INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)
}
/**
* As conv_patch_stack, but uses kern_len threads for each output pixel.
* It is kept separate because it uses more registers.
...
...
@@ -529,7 +550,9 @@ __INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)
*/
template
<
bool
flipped_kern
,
bool
c_contiguous
,
bool
split
,
bool
preload_full_kern
>
__device__
inline
void
conv_patch_stack_reduce
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
conv_patch_stack_reduce
(
const
float
*
img
,
const
size_t
img_offset
,
const
float
*
kern
,
const
size_t
kern_offset
,
float
*
out
,
const
size_t
out_offset
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
int
img_stride_stack
,
int
img_stride_batch
,
...
...
@@ -543,6 +566,17 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
const
int
out_len
=
blockDim
.
y
;
const
int
nb_thread_id
=
blockDim
.
z
*
blockDim
.
y
*
blockDim
.
x
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
if
(
kern_stride_col
==-
1
&&
kern_stride_row
==-
kern_wid
){
//the last two dimensions are c_contiguous but flipped!
kern
=
&
(
kern
[(
kern_wid
-
1
)
*
kern_stride_col
+
(
kern_len
-
1
)
*
kern_stride_row
]);
kern_stride_col
=
1
;
kern_stride_row
=
kern_wid
;
}
extern
__shared__
float
s_data
[];
int
batch_id
=
blockIdx
.
x
;
...
...
@@ -636,7 +670,9 @@ extern "C" {
#define __INSTANTIATE_CONV_PATCH_STACK_REDUCE(suffix, ...) \
__global__ void \
conv_patch_stack_reduce_##suffix( \
const float *img, const float *kern, float *out, \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack, int img_stride_col, int img_stride_row, \
int img_stride_stack, int img_stride_batch, \
...
...
@@ -644,33 +680,35 @@ conv_patch_stack_reduce_##suffix( \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_patch_stack_reduce<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(0, false, false, false, false);
*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
1
,
false
,
false
,
false
,
true
)
;
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
2
,
false
,
false
,
true
,
false
)
;
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
3
,
false
,
false
,
true
,
true
)
;
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(4, false, true, false, false);
*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
5
,
false
,
true
,
false
,
true
)
;
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
6
,
false
,
true
,
true
,
false
)
;
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
7
,
false
,
true
,
true
,
true
)
;
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false)
;
*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
9
,
true
,
false
,
false
,
true
)
;
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
10
,
true
,
false
,
true
,
false
)
;
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
11
,
true
,
false
,
true
,
true
)
;
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false)
;
*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
13
,
true
,
true
,
false
,
true
)
;
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
14
,
true
,
true
,
true
,
false
)
;
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
15
,
true
,
true
,
true
,
true
)
;
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE
#(0, false, false, false, false)
*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
1
,
false
,
false
,
false
,
true
)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
2
,
false
,
false
,
true
,
false
)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
3
,
false
,
false
,
true
,
true
)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE
#(4, false, true, false, false)
*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
5
,
false
,
true
,
false
,
true
)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
6
,
false
,
true
,
true
,
false
)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
7
,
false
,
true
,
true
,
true
)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
9
,
true
,
false
,
false
,
true
)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
10
,
true
,
false
,
true
,
false
)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
11
,
true
,
false
,
true
,
true
)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
13
,
true
,
true
,
false
,
true
)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
14
,
true
,
true
,
true
,
false
)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE
(
15
,
true
,
true
,
true
,
true
)
#undef __INSTANTIATE_CONV_PATCH_STACK_REDUCE
}
/**
* WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY
* We store kern_len rows of the image and the full kernel in shared memory.
...
...
@@ -684,7 +722,9 @@ __INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true);
*/
template
<
bool
c_contiguous
>
__device__
inline
void
conv_rows
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
conv_rows
(
const
float
*
img
,
const
size_t
img_offset
,
const
float
*
kern
,
const
size_t
kern_offset
,
float
*
out
,
const
size_t
out_offset
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
...
...
@@ -694,6 +734,11 @@ conv_rows( const float* img, const float* kern, float* out,
{
int
__shared__
out_len
,
out_wid
,
nb_thread_id
,
batch_id
,
kern_id
;
float
__shared__
*
d_img
,
*
d_kern
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
out_len
=
img_len
-
kern_len
+
1
;
out_wid
=
img_wid
-
kern_wid
+
1
;
nb_thread_id
=
blockDim
.
z
*
blockDim
.
y
*
blockDim
.
x
;
...
...
@@ -735,7 +780,9 @@ conv_rows( const float* img, const float* kern, float* out,
extern
"C"
{
#define __INSTANTIATE_CONV_ROWS(suffix, ...) \
__global__ void \
conv_rows_##suffix(const float *img, const float *kern, float *out, \
conv_rows_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
...
...
@@ -744,7 +791,8 @@ conv_rows_##suffix(const float *img, const float *kern, float *out, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_rows<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, \
nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
...
...
@@ -757,6 +805,8 @@ __INSTANTIATE_CONV_ROWS(1, true)
#undef __INSTANTIATE_CONV_ROWS
}
/**
* WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY
* As conv_rows, but implements the stack. Kept separate because this uses more registers.
...
...
@@ -770,7 +820,9 @@ __INSTANTIATE_CONV_ROWS(1, true)
*/
template
<
bool
c_contiguous
>
__device__
inline
void
conv_rows_stack
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
conv_rows_stack
(
const
float
*
img
,
const
size_t
img_offset
,
const
float
*
kern
,
const
size_t
kern_offset
,
float
*
out
,
const
size_t
out_offset
,
const
int
img_len
,
const
int
img_wid
,
const
int
kern_len
,
const
int
kern_wid
,
const
int
nkern
,
const
int
nstack
,
const
int
img_stride_col
,
const
int
img_stride_row
,
...
...
@@ -780,6 +832,11 @@ conv_rows_stack( const float* img, const float* kern, float* out,
{
int
__shared__
out_len
,
out_wid
,
nb_thread_id
,
batch_id
,
kern_id
,
nb_rows
;
float
__shared__
*
d_img
,
*
d_kern
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
out_len
=
img_len
-
kern_len
+
1
;
out_wid
=
img_wid
-
kern_wid
+
1
;
nb_thread_id
=
blockDim
.
z
*
blockDim
.
y
*
blockDim
.
x
;
...
...
@@ -859,7 +916,9 @@ extern "C" {
#define __INSTANTIATE_CONV_ROWS_STACK(suffix, ...) \
__global__ void \
conv_rows_stack_##suffix( \
const float *img, const float *kern, float *out, \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
...
...
@@ -869,7 +928,8 @@ conv_rows_stack_##suffix( \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_rows_stack<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, \
nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
...
...
@@ -882,6 +942,8 @@ __INSTANTIATE_CONV_ROWS_STACK(1, true)
#undef __INSTANTIATE_CONV_ROWS_STACK
}
/**
* WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY
* As conv_rows_stack, but loads only block_len of the image at a time and either one or all kernel rows.
...
...
@@ -895,7 +957,9 @@ __INSTANTIATE_CONV_ROWS_STACK(1, true)
*/
template
<
bool
c_contiguous
,
bool
preload_full_kern
>
__device__
inline
void
conv_rows_stack2
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
conv_rows_stack2
(
const
float
*
img
,
const
size_t
img_offset
,
const
float
*
kern
,
const
size_t
kern_offset
,
float
*
out
,
const
size_t
out_offset
,
const
int
img_len
,
const
int
img_wid
,
const
int
kern_len
,
const
int
kern_wid
,
const
int
nkern
,
const
int
nstack
,
const
int
img_stride_col
,
const
int
img_stride_row
,
...
...
@@ -905,6 +969,11 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
{
int
__shared__
out_len
,
out_wid
,
nb_thread_id
,
batch_id
,
kern_id
,
nb_rows
;
float
__shared__
*
d_img
,
*
d_kern
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
out_len
=
img_len
-
kern_len
+
1
;
out_wid
=
img_wid
-
kern_wid
+
1
;
nb_thread_id
=
blockDim
.
z
*
blockDim
.
y
*
blockDim
.
x
;
...
...
@@ -984,7 +1053,9 @@ extern "C" {
#define __INSTANTIATE_CONV_ROWS_STACK2(suffix, ...) \
__global__ void \
conv_rows_stack2_##suffix( \
const float *img, const float *kern, float *out, \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
...
...
@@ -994,8 +1065,8 @@ conv_rows_stack2_##suffix( \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_rows_stack2<__VA_ARGS__>( \
img,
kern, out, img_len, img_wid
, \
kern_len, kern_wid, nkern, nstack, \
img,
img_offset, kern, kern_offset, out, out_offset
, \
img_len, img_wid,
kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
...
...
@@ -1009,6 +1080,8 @@ __INSTANTIATE_CONV_ROWS_STACK2(3, true, true)
#undef __INSTANTIATE_CONV_ROWS_STACK2
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
...
...
@@ -1024,13 +1097,18 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
out_len
,
int
out_wid
,
//physical
const
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
const
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
float
*
out
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
const
float
*
img
,
const
size_t
img_offset
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
const
float
*
kern
,
const
size_t
kern_offset
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
float
*
out
,
const
size_t
out_offset
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
int
subsample_rows
,
int
subsample_cols
,
const
int
initial_reduce_boundary
)
{
const
int
outsize
=
nB
*
nK
*
out_len
*
out_wid
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
extern
__shared__
float
reducebuf
[];
for
(
int
i
=
blockIdx
.
x
;
i
<
/*physical*/
outsize
;
i
+=
gridDim
.
x
)
{
...
...
@@ -1110,18 +1188,21 @@ __global__ void \
conv_valid_row_reduce_##suffix( \
int nB, int nK, int stacklen, int img_len, int img_wid, \
int kern_len, int kern_wid, int out_len, int out_wid, \
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, \
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, \
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C, \
const float *img, const size_t img_offset, \
int img_str_B, int img_str_S, int img_str_R, int img_str_C, \
const float *kern, const size_t kern_offset, \
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, \
float *out, const size_t out_offset, \
int out_str_B, int out_str_K, int out_str_R, int out_str_C, \
int subsample_rows, int subsample_cols, \
const int initial_reduce_boundary) \
{ \
conv_valid_row_reduce<__VA_ARGS__>( \
nB, nK, stacklen, img_len, img_wid, \
kern_len, kern_wid, out_len, out_wid, \
img, img_str_B, img_str_S, img_str_R, img_str_C, \
kern, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \
out, out_str_B, out_str_K, out_str_R, out_str_C, \
img, img_
offset, img_
str_B, img_str_S, img_str_R, img_str_C, \
kern, kern_
offset, kern_
str_K, kern_str_S, kern_str_R, kern_str_C, \
out, out_
offset, out_
str_B, out_str_K, out_str_R, out_str_C, \
subsample_rows, subsample_cols, initial_reduce_boundary); \
}
...
...
@@ -1132,6 +1213,7 @@ __INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)
}
/**
* Reference implementation of 'valid' mode convolution (with stack)
*
...
...
@@ -1139,18 +1221,26 @@ __INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)
*
* TODO: explain parameters, preconditions
*/
__global__
void
extern
"C"
__global__
void
conv_reference_valid
(
int
nB
,
int
nK
,
int
stacklen
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
out_len
,
int
out_wid
,
//physical
const
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
const
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
float
*
out
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
const
float
*
img
,
const
size_t
img_offset
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
const
float
*
kern
,
const
size_t
kern_offset
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
float
*
out
,
const
size_t
out_offset
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
int
subsample_rows
,
int
subsample_cols
)
{
const
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
__shared__
int
numThreads
,
outsize
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
numThreads
=
blockDim
.
x
*
gridDim
.
x
;
outsize
=
nB
*
nK
*
out_len
*
out_wid
;
...
...
@@ -1191,6 +1281,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
}
}
/**
* Reference implementation of 'full' mode convolution (with stack)
*
...
...
@@ -1198,18 +1290,26 @@ conv_reference_valid(int nB, int nK, int stacklen,
*
* TODO: explain parameters, preconditions
*/
__global__
void
extern
"C"
__global__
void
conv_reference_full
(
int
nB
,
int
nK
,
int
stacklen
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
out_len
,
int
out_wid
,
//physical dimensions
const
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
const
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
float
*
out
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
const
float
*
img
,
const
size_t
img_offset
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
const
float
*
kern
,
const
size_t
kern_offset
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
float
*
out
,
const
size_t
out_offset
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
int
subsample_rows
,
int
subsample_cols
)
{
const
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
__shared__
int
numThreads
,
physical_outsize
;
kern
=
(
const
float
*
)(((
const
char
*
)
kern
)
+
kern_offset
);
img
=
(
const
float
*
)(((
const
char
*
)
img
)
+
img_offset
);
out
=
(
float
*
)(((
char
*
)
out
)
+
out_offset
);
numThreads
=
blockDim
.
x
*
gridDim
.
x
;
physical_outsize
=
nB
*
nK
*
out_len
*
out_wid
;
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论