Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
5fc89c03
提交
5fc89c03
authored
12月 14, 2013
作者:
Frederic
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
new GpuConv compile, but give wrong version in some cases!
上级
baf12f54
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
5 个修改的文件
包含
59 行增加
和
35 行删除
+59
-35
conv.cu
theano/sandbox/gpuarray/conv.cu
+0
-0
conv.py
theano/sandbox/gpuarray/conv.py
+28
-6
conv_full_kernel.cu
theano/sandbox/gpuarray/conv_full_kernel.cu
+7
-6
conv_kernel.cu
theano/sandbox/gpuarray/conv_kernel.cu
+17
-17
test_conv_cuda_ndarray.py
theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
+7
-6
没有找到文件。
theano/sandbox/gpuarray/conv.cu
浏览文件 @
5fc89c03
差异被折叠。
点击展开。
theano/sandbox/gpuarray/conv.py
浏览文件 @
5fc89c03
import
copy
import
os
import
theano
import
theano
from
theano
import
gof
from
theano
import
config
,
gof
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
from
theano.sandbox.gpuarray.type
import
GpuArrayType
class
GpuConv
(
gof
.
Op
):
class
GpuConv
(
gof
.
Op
):
...
@@ -114,6 +119,9 @@ class GpuConv(gof.Op):
...
@@ -114,6 +119,9 @@ class GpuConv(gof.Op):
str
(
self
.
kshp
))
str
(
self
.
kshp
))
def
make_node
(
self
,
img
,
kern
):
def
make_node
(
self
,
img
,
kern
):
if
img
.
dtype
!=
"float32"
or
kern
.
dtype
!=
"float32"
:
raise
NotImplementedError
(
"GpuConv currently only work"
" with float32 dtype"
)
if
img
.
type
.
ndim
!=
4
:
if
img
.
type
.
ndim
!=
4
:
raise
TypeError
(
'img must be 4D tensor'
)
raise
TypeError
(
'img must be 4D tensor'
)
if
kern
.
type
.
ndim
!=
4
:
if
kern
.
type
.
ndim
!=
4
:
...
@@ -121,7 +129,8 @@ class GpuConv(gof.Op):
...
@@ -121,7 +129,8 @@ class GpuConv(gof.Op):
broadcastable
=
[
img
.
type
.
broadcastable
[
0
],
kern
.
type
.
broadcastable
[
0
],
broadcastable
=
[
img
.
type
.
broadcastable
[
0
],
kern
.
type
.
broadcastable
[
0
],
False
,
False
]
False
,
False
]
return
Apply
(
self
,
[
img
,
kern
],
[
CudaNdarrayType
(
broadcastable
)()])
out
=
GpuArrayType
(
img
.
dtype
,
broadcastable
)()
return
gof
.
Apply
(
self
,
[
img
,
kern
],
[
out
])
def
flops
(
self
,
inputs
,
outputs
):
def
flops
(
self
,
inputs
,
outputs
):
""" Useful with the hack in profilemode to print the MFlops"""
""" Useful with the hack in profilemode to print the MFlops"""
...
@@ -145,6 +154,8 @@ class GpuConv(gof.Op):
...
@@ -145,6 +154,8 @@ class GpuConv(gof.Op):
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
node_
=
copy
.
copy
(
node
)
node_
=
copy
.
copy
(
node
)
assert
node
.
op
is
node_
.
op
assert
node
.
op
is
node_
.
op
if
config
.
gpuarray
.
sync
:
raise
NotImplementedError
(
"GpuConv do not implement gpuarray.sync Theano flag"
)
if
node_
.
op
.
max_threads_dim0
is
None
:
if
node_
.
op
.
max_threads_dim0
is
None
:
cuda
=
theano
.
sandbox
.
cuda
cuda
=
theano
.
sandbox
.
cuda
device_id
=
cuda
.
use
.
device_number
device_id
=
cuda
.
use
.
device_number
...
@@ -169,20 +180,30 @@ class GpuConv(gof.Op):
...
@@ -169,20 +180,30 @@ class GpuConv(gof.Op):
return
[
'-DTHEANO_KERN_WID='
+
str
(
nb
)]
# ,'-g','-G']
return
[
'-DTHEANO_KERN_WID='
+
str
(
nb
)]
# ,'-g','-G']
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'cuda_ndarray.cuh'
,
'<stdio.h>'
]
return
[
'<stdio.h>'
,
'cuda.h'
,
'<compyte/extension.h>'
,
'<compyte/numpy_compat.h>'
]
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
# raise this whenever modifying any of the support_code_files
# raise this whenever modifying any of the support_code_files
return
(
0
,
20
)
return
(
0
,
20
)
def
c_init_code
(
self
):
return
[
'cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'
]
def
c_support_code_apply
(
self
,
node
,
nodename
):
def
c_support_code_apply
(
self
,
node
,
nodename
):
# REMEMBER TO RAISE c_code_cache_version when changing any of
# REMEMBER TO RAISE c_code_cache_version when changing any of
# these files
# these files
files
=
[
'conv_kernel.cu'
,
'conv_full_kernel.cu'
,
'conv.cu'
]
files
=
[
'conv_kernel.cu'
,
'conv_full_kernel.cu'
,
'conv.cu'
]
codes
=
[
open
(
os
.
path
.
join
(
os
.
path
.
split
(
__file__
)[
0
],
f
))
.
read
()
codes
=
[
"CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);"
,
for
f
in
files
]
"float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}"
,
"const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}"
]
codes
+=
[
open
(
os
.
path
.
join
(
os
.
path
.
split
(
__file__
)[
0
],
f
))
.
read
()
for
f
in
files
]
return
reduce
(
str
.
__add__
,
codes
)
return
reduce
(
str
.
__add__
,
codes
)
def
c_compiler
(
self
):
return
NVCC_compiler
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
img
,
kern
=
inp
img
,
kern
=
inp
out
,
=
out_
out
,
=
out_
...
@@ -226,7 +247,8 @@ class GpuConv(gof.Op):
...
@@ -226,7 +247,8 @@ class GpuConv(gof.Op):
}
}
// TODO, make out be decref before we alloc out2!
// TODO, make out be decref before we alloc out2!
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(
%(img)
s,
%(kern)
s,
PyGpuArrayObject * out2 = (PyGpuArrayObject *)PyGpuArray_Conv(
%(img)
s,
%(kern)
s,
%(out)
s, mode,
%(out)
s, mode,
dx, dy,
dx, dy,
version, verbose,
version, verbose,
...
...
theano/sandbox/gpuarray/conv_full_kernel.cu
浏览文件 @
5fc89c03
...
@@ -4,7 +4,8 @@
...
@@ -4,7 +4,8 @@
//grid block size=batch_id
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__
void
__global__
void
conv_full_patch_split
(
float
*
img
,
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nb_split
)
conv_full_patch_split
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nb_split
)
{
{
int
__shared__
out_len
,
out_wid
,
nb_thread_id
;
int
__shared__
out_len
,
out_wid
,
nb_thread_id
;
out_len
=
img_len
+
kern_len
-
1
;
out_len
=
img_len
+
kern_len
-
1
;
...
@@ -60,7 +61,7 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
...
@@ -60,7 +61,7 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
//grid block size=batch_id, nkern
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__
void
__global__
void
conv_full_patch
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_full_patch
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
)
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
)
{
{
...
@@ -122,7 +123,7 @@ conv_full_patch( float* img, float* kern, float* out,
...
@@ -122,7 +123,7 @@ conv_full_patch( float* img, float* kern, float* out,
template
<
bool
img_c_contiguous_2d
,
bool
kern_c_contiguous_2d
>
template
<
bool
img_c_contiguous_2d
,
bool
kern_c_contiguous_2d
>
__global__
void
__global__
void
conv_full_patch_stack
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_full_patch_stack
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
int
img_stride_col
,
int
img_stride_row
,
...
@@ -133,7 +134,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
...
@@ -133,7 +134,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
out_len
=
img_len
+
kern_len
-
1
;
out_len
=
img_len
+
kern_len
-
1
;
out_wid
=
img_wid
+
kern_wid
-
1
;
out_wid
=
img_wid
+
kern_wid
-
1
;
nb_thread_id
=
blockDim
.
y
*
blockDim
.
x
;
//blockDim.z*
nb_thread_id
=
blockDim
.
y
*
blockDim
.
x
;
//blockDim.z*
float
__shared__
*
kern_
,
*
img_
;
const
float
__shared__
*
kern_
,
*
img_
;
extern
__shared__
float
s_data
[];
extern
__shared__
float
s_data
[];
const
int
batch_id
=
blockIdx
.
x
;
const
int
batch_id
=
blockIdx
.
x
;
...
@@ -201,7 +202,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
...
@@ -201,7 +202,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
*/
*/
template
<
bool
flipped_kern
,
int
KERN_WIDTH
,
bool
c_contiguous
,
bool
split
,
bool
low_mem
>
template
<
bool
flipped_kern
,
int
KERN_WIDTH
,
bool
c_contiguous
,
bool
split
,
bool
low_mem
>
__global__
void
__global__
void
conv_full_patch_stack_padded
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_full_patch_stack_padded
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
const
int
img_len
,
const
int
img_wid
,
const
int
img_len
,
const
int
img_wid
,
const
int
kern_len
,
const
int
kern_wid
,
const
int
kern_len
,
const
int
kern_wid
,
const
int
nkern
,
const
int
nstack
,
const
int
nkern
,
const
int
nstack
,
...
@@ -365,7 +366,7 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
...
@@ -365,7 +366,7 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
}
}
template
<
int
NSTACK
>
template
<
int
NSTACK
>
__global__
void
__global__
void
conv_full_load_everything
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_full_load_everything
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
int
img_stride_col
,
int
img_stride_row
,
...
...
theano/sandbox/gpuarray/conv_kernel.cu
浏览文件 @
5fc89c03
...
@@ -221,7 +221,7 @@ __device__ void store_or_accumulate(float& dst,const float value ){
...
@@ -221,7 +221,7 @@ __device__ void store_or_accumulate(float& dst,const float value ){
*/
*/
template
<
bool
flipped_kern
,
int
KERN_WIDTH
,
bool
split
>
template
<
bool
flipped_kern
,
int
KERN_WIDTH
,
bool
split
>
__global__
void
__global__
void
conv_patch
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_patch
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
)
int
nkern
,
int
nstack
)
{
{
...
@@ -304,7 +304,7 @@ conv_patch( float* img, float* kern, float* out,
...
@@ -304,7 +304,7 @@ conv_patch( float* img, float* kern, float* out,
*/
*/
template
<
bool
flipped_kern
,
bool
accumulate
,
int
KERN_WIDTH
,
bool
img_c_contiguous_2d
,
bool
kern_c_contiguous_2d
,
bool
split
,
bool
preload_full_kern
,
bool
subsample
>
template
<
bool
flipped_kern
,
bool
accumulate
,
int
KERN_WIDTH
,
bool
img_c_contiguous_2d
,
bool
kern_c_contiguous_2d
,
bool
split
,
bool
preload_full_kern
,
bool
subsample
>
__global__
void
__global__
void
conv_patch_stack
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_patch_stack
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
out_len
,
int
out_wid
,
int
out_len
,
int
out_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
...
@@ -375,7 +375,7 @@ conv_patch_stack( float* img, float* kern, float* out,
...
@@ -375,7 +375,7 @@ conv_patch_stack( float* img, float* kern, float* out,
out_row
*
out_wid
+
out_col
],
sum
);
out_row
*
out_wid
+
out_col
],
sum
);
}
else
{
}
else
{
float
__shared__
*
kern_
,
*
img_
;
const
float
__shared__
*
kern_
,
*
img_
;
int
__shared__
out_len_max
;
int
__shared__
out_len_max
;
kern_
=
kern
+
kern_stride_nkern
*
kern_id
;
//the good nkern
kern_
=
kern
+
kern_stride_nkern
*
kern_id
;
//the good nkern
...
@@ -456,7 +456,7 @@ conv_patch_stack( float* img, float* kern, float* out,
...
@@ -456,7 +456,7 @@ conv_patch_stack( float* img, float* kern, float* out,
*/
*/
template
<
bool
flipped_kern
,
int
KERN_WIDTH
,
bool
c_contiguous
,
bool
split
,
bool
preload_full_kern
>
template
<
bool
flipped_kern
,
int
KERN_WIDTH
,
bool
c_contiguous
,
bool
split
,
bool
preload_full_kern
>
__global__
void
__global__
void
conv_patch_stack_reduce
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_patch_stack_reduce
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
int
img_stride_stack
,
int
img_stride_batch
,
int
img_stride_stack
,
int
img_stride_batch
,
...
@@ -572,7 +572,7 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
...
@@ -572,7 +572,7 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
*/
*/
template
<
int
KERN_WIDTH
,
bool
c_contiguous
>
template
<
int
KERN_WIDTH
,
bool
c_contiguous
>
__global__
void
__global__
void
conv_rows
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_rows
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
int
img_stride_col
,
int
img_stride_row
,
...
@@ -633,7 +633,7 @@ conv_rows( float* img, float* kern, float* out,
...
@@ -633,7 +633,7 @@ conv_rows( float* img, float* kern, float* out,
*/
*/
template
<
int
KERN_WIDTH
,
bool
c_contiguous
>
template
<
int
KERN_WIDTH
,
bool
c_contiguous
>
__global__
void
__global__
void
conv_rows_stack
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_rows_stack
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
const
int
img_len
,
const
int
img_wid
,
const
int
kern_len
,
const
int
kern_wid
,
const
int
img_len
,
const
int
img_wid
,
const
int
kern_len
,
const
int
kern_wid
,
const
int
nkern
,
const
int
nstack
,
const
int
nkern
,
const
int
nstack
,
const
int
img_stride_col
,
const
int
img_stride_row
,
const
int
img_stride_col
,
const
int
img_stride_row
,
...
@@ -731,7 +731,7 @@ conv_rows_stack( float* img, float* kern, float* out,
...
@@ -731,7 +731,7 @@ conv_rows_stack( float* img, float* kern, float* out,
*/
*/
template
<
int
KERN_WIDTH
,
bool
c_contiguous
,
bool
preload_full_kern
>
template
<
int
KERN_WIDTH
,
bool
c_contiguous
,
bool
preload_full_kern
>
__global__
void
__global__
void
conv_rows_stack2
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_rows_stack2
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
const
int
img_len
,
const
int
img_wid
,
const
int
kern_len
,
const
int
kern_wid
,
const
int
img_len
,
const
int
img_wid
,
const
int
kern_len
,
const
int
kern_wid
,
const
int
nkern
,
const
int
nstack
,
const
int
nkern
,
const
int
nstack
,
const
int
img_stride_col
,
const
int
img_stride_row
,
const
int
img_stride_col
,
const
int
img_stride_row
,
...
@@ -831,8 +831,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
...
@@ -831,8 +831,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
int
img_len
,
int
img_wid
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
kern_len
,
int
kern_wid
,
int
out_len
,
int
out_wid
,
//physical
int
out_len
,
int
out_wid
,
//physical
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
const
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
const
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
float
*
out
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
float
*
out
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
int
subsample_rows
,
int
subsample_cols
,
int
subsample_rows
,
int
subsample_cols
,
const
int
initial_reduce_boundary
)
const
int
initial_reduce_boundary
)
...
@@ -859,8 +859,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
...
@@ -859,8 +859,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
float
sum
=
0.0
f
;
float
sum
=
0.0
f
;
if
(
stack_loop
){
if
(
stack_loop
){
for
(;
ss
<
stacklen
;
ss
+=
blockDim
.
x
){
for
(;
ss
<
stacklen
;
ss
+=
blockDim
.
x
){
float
*
kk_0
=
kern
+
iK
*
kern_str_K
+
ss
*
kern_str_S
+
rr
*
kern_str_R
;
const
float
*
kk_0
=
kern
+
iK
*
kern_str_K
+
ss
*
kern_str_S
+
rr
*
kern_str_R
;
float
*
ii_0
=
img
+
iB
*
img_str_B
+
ss
*
img_str_S
+
img_rr
*
img_str_R
+
(
iC_logical
+
kern_wid
-
1
)
*
img_str_C
;
const
float
*
ii_0
=
img
+
iB
*
img_str_B
+
ss
*
img_str_S
+
img_rr
*
img_str_R
+
(
iC_logical
+
kern_wid
-
1
)
*
img_str_C
;
for
(
int
cc
=
0
;
cc
<
kern_wid
;
++
cc
)
for
(
int
cc
=
0
;
cc
<
kern_wid
;
++
cc
)
{
{
sum
+=
kk_0
[
0
]
*
ii_0
[
0
];
sum
+=
kk_0
[
0
]
*
ii_0
[
0
];
...
@@ -869,8 +869,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
...
@@ -869,8 +869,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
}
}
}
}
}
else
{
}
else
{
float
*
kk_0
=
kern
+
iK
*
kern_str_K
+
ss
*
kern_str_S
+
rr
*
kern_str_R
;
const
float
*
kk_0
=
kern
+
iK
*
kern_str_K
+
ss
*
kern_str_S
+
rr
*
kern_str_R
;
float
*
ii_0
=
img
+
iB
*
img_str_B
+
ss
*
img_str_S
+
img_rr
*
img_str_R
+
(
iC_logical
+
kern_wid
-
1
)
*
img_str_C
;
const
float
*
ii_0
=
img
+
iB
*
img_str_B
+
ss
*
img_str_S
+
img_rr
*
img_str_R
+
(
iC_logical
+
kern_wid
-
1
)
*
img_str_C
;
for
(
int
cc
=
0
;
cc
<
kern_wid
;
++
cc
)
for
(
int
cc
=
0
;
cc
<
kern_wid
;
++
cc
)
{
{
sum
+=
kk_0
[
0
]
*
ii_0
[
0
];
sum
+=
kk_0
[
0
]
*
ii_0
[
0
];
...
@@ -925,8 +925,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
...
@@ -925,8 +925,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
int
img_len
,
int
img_wid
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
kern_len
,
int
kern_wid
,
int
out_len
,
int
out_wid
,
//physical
int
out_len
,
int
out_wid
,
//physical
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
const
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
const
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
float
*
out
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
float
*
out
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
int
subsample_rows
,
int
subsample_cols
)
int
subsample_rows
,
int
subsample_cols
)
{
{
...
@@ -984,8 +984,8 @@ conv_reference_full(int nB, int nK, int stacklen,
...
@@ -984,8 +984,8 @@ conv_reference_full(int nB, int nK, int stacklen,
int
img_len
,
int
img_wid
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
kern_len
,
int
kern_wid
,
int
out_len
,
int
out_wid
,
//physical dimensions
int
out_len
,
int
out_wid
,
//physical dimensions
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
const
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
const
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
float
*
out
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
float
*
out
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
int
subsample_rows
,
int
subsample_cols
)
int
subsample_rows
,
int
subsample_cols
)
{
{
...
...
theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
浏览文件 @
5fc89c03
...
@@ -25,6 +25,7 @@ from theano.tests.unittest_tools import seed_rng
...
@@ -25,6 +25,7 @@ from theano.tests.unittest_tools import seed_rng
from
theano.sandbox.gpuarray.tests.test_basic_ops
import
(
mode_with_gpu
,
from
theano.sandbox.gpuarray.tests.test_basic_ops
import
(
mode_with_gpu
,
mode_without_gpu
)
mode_without_gpu
)
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.conv
import
GpuConv
import
pygpu
import
pygpu
gftensor4
=
GpuArrayType
(
'float32'
,
[
False
]
*
4
)
gftensor4
=
GpuArrayType
(
'float32'
,
[
False
]
*
4
)
...
@@ -159,11 +160,11 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
...
@@ -159,11 +160,11 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
t1
=
time
.
time
()
t1
=
time
.
time
()
i
=
gftensor4
()
i
=
gftensor4
()
k
=
gftensor4
()
k
=
gftensor4
()
op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuConv
(
border_mode
=
mode
,
op
=
GpuConv
(
border_mode
=
mode
,
subsample
=
subsample
,
subsample
=
subsample
,
version
=
version
,
version
=
version
,
verbose
=
verbose
,
verbose
=
verbose
,
kshp
=
compile_kshp
)(
i
,
k
)
kshp
=
compile_kshp
)(
i
,
k
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
mode_with_gpu
)
gpuval
=
f
(
img
,
kern
)
gpuval
=
f
(
img
,
kern
)
t2
=
time
.
time
()
t2
=
time
.
time
()
...
@@ -731,7 +732,7 @@ class TestConv2DGPU(unittest.TestCase):
...
@@ -731,7 +732,7 @@ class TestConv2DGPU(unittest.TestCase):
func
=
theano
.
function
([
a
,
A
],
image_estimate
,
mode
=
mode_with_gpu
)
func
=
theano
.
function
([
a
,
A
],
image_estimate
,
mode
=
mode_with_gpu
)
#theano.printing.debugprint(func,)
#theano.printing.debugprint(func,)
assert
any
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
blas
.
GpuConv
)
assert
any
([
isinstance
(
node
.
op
,
GpuConv
)
for
node
in
func
.
maker
.
fgraph
.
toposort
()])
for
node
in
func
.
maker
.
fgraph
.
toposort
()])
a_in
=
numpy
.
random
.
randn
(
*
featshp
)
.
astype
(
"float32"
)
a_in
=
numpy
.
random
.
randn
(
*
featshp
)
.
astype
(
"float32"
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论