Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
cea45e8b
提交
cea45e8b
authored
11月 08, 2016
作者:
Frédéric Bastien
提交者:
GitHub
11月 08, 2016
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #5174 from abergeron/cormm_f16
Make corrMM work in float16/64
上级
29af0e5b
d79d38c1
隐藏空白字符变更
内嵌
并排
正在显示
10 个修改的文件
包含
329 行增加
和
185 行删除
+329
-185
basic_ops.py
theano/gpuarray/basic_ops.py
+31
-14
blas.py
theano/gpuarray/blas.py
+21
-8
corr3d_gemm.c
theano/gpuarray/corr3d_gemm.c
+119
-73
corr_gemm.c
theano/gpuarray/corr_gemm.c
+103
-58
elemwise.py
theano/gpuarray/elemwise.py
+9
-0
neighbours.py
theano/gpuarray/neighbours.py
+9
-0
config.py
theano/gpuarray/tests/config.py
+8
-0
test_dnn.py
theano/gpuarray/tests/test_dnn.py
+1
-8
test_gemmcorr.py
theano/gpuarray/tests/test_gemmcorr.py
+15
-12
test_gemmcorr3d.py
theano/gpuarray/tests/test_gemmcorr3d.py
+13
-12
没有找到文件。
theano/gpuarray/basic_ops.py
浏览文件 @
cea45e8b
...
@@ -173,11 +173,15 @@ class Kernel(object):
...
@@ -173,11 +173,15 @@ class Kernel(object):
fname: str
fname: str
the name of the function wrapper.
the name of the function wrapper.
(defaults to name + `_call`)
(defaults to name + `_call`)
sname: str
the name of the scheduled call function
(defaults to name _ `_scall`)
"""
"""
def
__init__
(
self
,
code
,
params
,
name
,
flags
,
def
__init__
(
self
,
code
,
params
,
name
,
flags
,
codevar
=
None
,
binvar
=
None
,
objvar
=
None
,
fname
=
None
):
codevar
=
None
,
binvar
=
None
,
objvar
=
None
,
fname
=
None
,
sname
=
None
):
self
.
code
=
code
self
.
code
=
code
self
.
params
=
params
self
.
params
=
params
self
.
name
=
name
self
.
name
=
name
...
@@ -194,6 +198,9 @@ class Kernel(object):
...
@@ -194,6 +198,9 @@ class Kernel(object):
if
fname
is
None
:
if
fname
is
None
:
fname
=
name
+
'_call'
fname
=
name
+
'_call'
self
.
fname
=
fname
self
.
fname
=
fname
if
sname
is
None
:
sname
=
name
+
'_scall'
self
.
sname
=
sname
@staticmethod
@staticmethod
def
get_flags
(
*
types
):
def
get_flags
(
*
types
):
...
@@ -338,22 +345,30 @@ class GpuKernelBase(object):
...
@@ -338,22 +345,30 @@ class GpuKernelBase(object):
setargs
=
'
\n
'
.
join
(
setargs
)
setargs
=
'
\n
'
.
join
(
setargs
)
return
"""
return
"""
int {fname}(unsigned int
nd, size_t *gdim, size_t *ldim, size_t
shared,
int {fname}(unsigned int
_nd, size_t *_gdim, size_t *_ldim, size_t _
shared,
{args}) {{
{args}) {{
{setargs}
{setargs}
return GpuKernel_call(&{kname},
nd, ldim, gdim,
shared, NULL);
return GpuKernel_call(&{kname},
_nd, _ldim, _gdim, _
shared, NULL);
}}
}}
"""
.
format
(
args
=
args
,
fname
=
k
.
fname
,
setargs
=
setargs
,
kname
=
k
.
objvar
)
def
c_support_code
(
self
):
int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
return
"""
size_t _ls = 0;
template <typename T>
size_t _gs = 0;
static T ceil_intdiv(T a, T b)
int _err;
{
return (a/b) + ((a
%
b) ? 1: 0);
if (_nd != 1) return GA_UNSUPPORTED_ERROR;
}
"""
_err = GpuKernel_sched(&{kname}, _n[0], &_ls, &_gs);
if (_err != GA_NO_ERROR)
return _err;
{setargs}
return GpuKernel_call(&{kname}, 1, &_ls, &_gs, _shared, NULL);
}}
"""
.
format
(
args
=
args
,
fname
=
k
.
fname
,
setargs
=
setargs
,
sname
=
k
.
sname
,
kname
=
k
.
objvar
)
def
c_support_code_apply
(
self
,
node
,
name
):
def
c_support_code_apply
(
self
,
node
,
name
):
kernels
=
self
.
gpu_kernels
(
node
,
name
)
kernels
=
self
.
gpu_kernels
(
node
,
name
)
...
@@ -428,7 +443,7 @@ int {fname}(unsigned int nd, size_t *gdim, size_t *ldim, size_t shared,
...
@@ -428,7 +443,7 @@ int {fname}(unsigned int nd, size_t *gdim, size_t *ldim, size_t shared,
The node that we need the cache version for.
The node that we need the cache version for.
"""
"""
return
(
6
,
self
.
get_params
(
node
)
.
bin_id
)
return
(
7
,
self
.
get_params
(
node
)
.
bin_id
)
def
forward_string_meth
(
name
):
def
forward_string_meth
(
name
):
...
@@ -466,12 +481,14 @@ class CGpuKernelBase(COp, GpuKernelBase):
...
@@ -466,12 +481,14 @@ class CGpuKernelBase(COp, GpuKernelBase):
kernel_re
=
re
.
compile
(
r'^#kernel ([a-zA-Z_].*?)$'
,
re
.
MULTILINE
)
kernel_re
=
re
.
compile
(
r'^#kernel ([a-zA-Z_].*?)$'
,
re
.
MULTILINE
)
c_support_code
=
forward_string_meth
(
'c_support_code'
)
c_support_code_apply
=
forward_string_meth
(
'c_support_code_apply'
)
c_support_code_apply
=
forward_string_meth
(
'c_support_code_apply'
)
c_support_code_struct
=
forward_string_meth
(
'c_support_code_struct'
)
c_support_code_struct
=
forward_string_meth
(
'c_support_code_struct'
)
c_init_code_struct
=
forward_string_meth
(
'c_init_code_struct'
)
c_init_code_struct
=
forward_string_meth
(
'c_init_code_struct'
)
c_cleanup_code_struct
=
forward_string_meth
(
'c_cleanup_code_struct'
)
c_cleanup_code_struct
=
forward_string_meth
(
'c_cleanup_code_struct'
)
def
c_code_cache_version_apply
(
self
,
node
):
return
GpuKernelBase
.
c_code_cache_version_apply
(
self
,
node
)
def
_type_macros
(
self
,
node
):
def
_type_macros
(
self
,
node
):
define_template
=
"#define
%
s
%
s
\n
"
define_template
=
"#define
%
s
%
s
\n
"
undef_template
=
"#undef
%
s
\n
"
undef_template
=
"#undef
%
s
\n
"
...
...
theano/gpuarray/blas.py
浏览文件 @
cea45e8b
...
@@ -414,7 +414,7 @@ gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
...
@@ -414,7 +414,7 @@ gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace
=
GpuGemmBatch
(
inplace
=
True
)
gpugemmbatch_inplace
=
GpuGemmBatch
(
inplace
=
True
)
class
BaseGpuCorrMM
(
CGpuKernelBase
,
BlasOp
):
class
BaseGpuCorrMM
(
CGpuKernelBase
):
"""
"""
Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
`GpuCorrMM_gradInputs`. Cannot be used directly.
`GpuCorrMM_gradInputs`. Cannot be used directly.
...
@@ -429,9 +429,9 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
...
@@ -429,9 +429,9 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
filter_dilation
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1)).
Perform subsampling of the input, also known as dilation (default: (1, 1)).
"""
"""
check_broadcast
=
False
check_broadcast
=
False
__props__
=
(
'border_mode'
,
'subsample'
,
'filter_dilation'
)
__props__
=
(
'border_mode'
,
'subsample'
,
'filter_dilation'
)
_f16_ok
=
True
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
),
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
),
filter_dilation
=
(
1
,
1
)):
filter_dilation
=
(
1
,
1
)):
...
@@ -489,9 +489,15 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
...
@@ -489,9 +489,15 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
def
get_params
(
self
,
node
):
def
get_params
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
return
node
.
inputs
[
0
]
.
type
.
context
def
c_headers
(
self
):
return
[
"<gpuarray/array.h>"
,
"<gpuarray/blas.h>"
,
"gpuarray_helper.h"
]
def
c_header_dirs
(
self
):
return
[
os
.
path
.
dirname
(
__file__
)]
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
#
raise this whenever modifying any of the support_code_files
#
Raise this whenever modifying the code below.
return
(
0
,
2
)
return
(
2
,
)
def
c_code_helper
(
self
,
bottom
,
weights
,
top
,
direction
,
sub
,
height
=
None
,
width
=
None
):
def
c_code_helper
(
self
,
bottom
,
weights
,
top
,
direction
,
sub
,
height
=
None
,
width
=
None
):
"""
"""
...
@@ -953,7 +959,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
...
@@ -953,7 +959,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
return
[[
1
],
[
1
],
[
0
],
[
0
]]
# no connection to height, width
return
[[
1
],
[
1
],
[
0
],
[
0
]]
# no connection to height, width
class
BaseGpuCorr3dMM
(
CGpuKernelBase
,
BlasOp
):
class
BaseGpuCorr3dMM
(
CGpuKernelBase
):
"""
"""
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly.
`GpuCorr3dMM_gradInputs`. Cannot be used directly.
...
@@ -967,10 +973,11 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
...
@@ -967,10 +973,11 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
Perform subsampling of the output (default: (1, 1, 1)).
Perform subsampling of the output (default: (1, 1, 1)).
filter_dilation
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
"""
"""
check_broadcast
=
False
check_broadcast
=
False
__props__
=
(
'border_mode'
,
'subsample'
,
'filter_dilation'
)
__props__
=
(
'border_mode'
,
'subsample'
,
'filter_dilation'
)
_f16_ok
=
True
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
,
1
),
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
,
1
),
filter_dilation
=
(
1
,
1
,
1
)):
filter_dilation
=
(
1
,
1
,
1
)):
...
@@ -1028,9 +1035,15 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
...
@@ -1028,9 +1035,15 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
def
get_params
(
self
,
node
):
def
get_params
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
return
node
.
inputs
[
0
]
.
type
.
context
def
c_headers
(
self
):
return
[
"<gpuarray/array.h>"
,
"<gpuarray/blas.h>"
,
"gpuarray_helper.h"
]
def
c_header_dirs
(
self
):
return
[
os
.
path
.
dirname
(
__file__
)]
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
# raise this whenever modifying
any of the support_code_files
# raise this whenever modifying
the code below.
return
(
0
,
2
)
return
(
2
,
)
def
c_code_helper
(
self
,
bottom
,
weights
,
top
,
direction
,
sub
,
def
c_code_helper
(
self
,
bottom
,
weights
,
top
,
direction
,
sub
,
height
=
None
,
width
=
None
,
depth
=
None
):
height
=
None
,
width
=
None
,
depth
=
None
):
...
...
theano/gpuarray/corr3d_gemm.c
浏览文件 @
cea45e8b
...
@@ -236,11 +236,9 @@ KERNEL void col2im3d_kernel(const ga_size n,
...
@@ -236,11 +236,9 @@ KERNEL void col2im3d_kernel(const ga_size n,
}
}
}
}
#section support_code_struct
#section support_code_struct
int
im3d2col
(
const
size_t
max_threads_dim
,
int
im3d2col
(
gpudata
*
data_im
,
const
size_t
data_im_offset
,
const
size_t
channels
,
gpudata
*
data_im
,
const
size_t
data_im_offset
,
const
size_t
channels
,
const
size_t
height
,
const
size_t
width
,
const
size_t
depth
,
const
size_t
height
,
const
size_t
width
,
const
size_t
depth
,
const
size_t
kernel_h
,
const
size_t
kernel_w
,
const
size_t
kernel_d
,
const
size_t
kernel_h
,
const
size_t
kernel_w
,
const
size_t
kernel_d
,
...
@@ -257,13 +255,10 @@ int im3d2col(const size_t max_threads_dim,
...
@@ -257,13 +255,10 @@ int im3d2col(const size_t max_threads_dim,
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_kernel_w
)
/
stride_w
+
1
;
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_kernel_w
)
/
stride_w
+
1
;
size_t
depth_col
=
(
depth
+
2
*
pad_d
-
dil_kernel_d
)
/
stride_d
+
1
;
size_t
depth_col
=
(
depth
+
2
*
pad_d
-
dil_kernel_d
)
/
stride_d
+
1
;
size_t
num_kernels
=
channels
*
height_col
*
width_col
*
depth_col
;
size_t
num_kernels
=
channels
*
height_col
*
width_col
*
depth_col
;
size_t
threads_per_block
=
max_threads_dim
;
size_t
n_blocks
=
(
num_kernels
+
threads_per_block
-
1
)
/
threads_per_block
;
int
err
;
int
err
;
GpuKernel
*
kernel
;
if
(
dilation_h
!=
1
||
dilation_w
!=
1
||
dilation_d
!=
1
)
{
if
(
dilation_h
!=
1
||
dilation_w
!=
1
||
dilation_d
!=
1
){
err
=
dilated_im3d2col_kernel_scall
(
err
=
dilated_im3d2col_kernel_call
(
1
,
&
num_kernels
,
0
,
1
,
&
n_blocks
,
&
threads_per_block
,
0
,
num_kernels
,
data_im
,
data_im_offset
,
height
,
width
,
depth
,
num_kernels
,
data_im
,
data_im_offset
,
height
,
width
,
depth
,
kernel_h
,
kernel_w
,
kernel_d
,
dilation_h
,
dilation_w
,
dilation_d
,
kernel_h
,
kernel_w
,
kernel_d
,
dilation_h
,
dilation_w
,
dilation_d
,
pad_h
,
pad_w
,
pad_d
,
stride_h
,
stride_w
,
stride_d
,
height_col
,
pad_h
,
pad_w
,
pad_d
,
stride_h
,
stride_w
,
stride_d
,
height_col
,
...
@@ -273,10 +268,9 @@ int im3d2col(const size_t max_threads_dim,
...
@@ -273,10 +268,9 @@ int im3d2col(const size_t max_threads_dim,
"gpuarray error: dilated_im3d2col_kernel: %s."
,
"gpuarray error: dilated_im3d2col_kernel: %s."
,
GpuKernel_error
(
&
k_dilated_im3d2col_kernel
,
err
));
GpuKernel_error
(
&
k_dilated_im3d2col_kernel
,
err
));
}
}
}
}
else
{
else
{
err
=
im3d2col_kernel_scall
(
err
=
im3d2col_kernel_call
(
1
,
&
num_kernels
,
0
,
1
,
&
n_blocks
,
&
threads_per_block
,
0
,
num_kernels
,
data_im
,
data_im_offset
,
height
,
width
,
depth
,
num_kernels
,
data_im
,
data_im_offset
,
height
,
width
,
depth
,
kernel_h
,
kernel_w
,
kernel_d
,
pad_h
,
pad_w
,
pad_d
,
kernel_h
,
kernel_w
,
kernel_d
,
pad_h
,
pad_w
,
pad_d
,
stride_h
,
stride_w
,
stride_d
,
height_col
,
width_col
,
depth_col
,
stride_h
,
stride_w
,
stride_d
,
height_col
,
width_col
,
depth_col
,
...
@@ -290,7 +284,7 @@ int im3d2col(const size_t max_threads_dim,
...
@@ -290,7 +284,7 @@ int im3d2col(const size_t max_threads_dim,
return
err
;
return
err
;
}
}
int
col2im3d
(
const
size_t
max_threads_dim
,
gpudata
*
data_col
,
const
size_t
channels
,
int
col2im3d
(
gpudata
*
data_col
,
const
size_t
channels
,
const
size_t
height
,
const
size_t
width
,
const
size_t
depth
,
const
size_t
height
,
const
size_t
width
,
const
size_t
depth
,
const
size_t
patch_h
,
const
size_t
patch_w
,
const
size_t
patch_d
,
const
size_t
patch_h
,
const
size_t
patch_w
,
const
size_t
patch_d
,
const
size_t
dilation_h
,
const
size_t
dilation_w
,
const
size_t
dilation_d
,
const
size_t
dilation_h
,
const
size_t
dilation_w
,
const
size_t
dilation_d
,
...
@@ -304,14 +298,12 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
...
@@ -304,14 +298,12 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_patch_w
)
/
stride_w
+
1
;
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_patch_w
)
/
stride_w
+
1
;
size_t
depth_col
=
(
depth
+
2
*
pad_d
-
dil_patch_d
)
/
stride_d
+
1
;
size_t
depth_col
=
(
depth
+
2
*
pad_d
-
dil_patch_d
)
/
stride_d
+
1
;
size_t
num_kernels
=
channels
*
height
*
width
*
depth
;
size_t
num_kernels
=
channels
*
height
*
width
*
depth
;
size_t
threads_per_block
=
max_threads_dim
;
size_t
n_blocks
=
(
num_kernels
+
threads_per_block
-
1
)
/
threads_per_block
;
// To avoid involving atomic operations, we will launch one kernel per
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
// bottom dimension, and then in the kernel add up the top dimensions.
int
err
;
int
err
;
if
(
dilation_h
!=
1
||
dilation_w
!=
1
||
dilation_d
!=
1
)
{
if
(
dilation_h
!=
1
||
dilation_w
!=
1
||
dilation_d
!=
1
)
{
err
=
dilated_col2im3d_kernel_call
(
err
=
dilated_col2im3d_kernel_
s
call
(
1
,
&
n
_blocks
,
&
threads_per_block
,
0
,
1
,
&
n
um_kernels
,
0
,
num_kernels
,
data_col
,
height
,
width
,
depth
,
channels
,
patch_h
,
patch_w
,
num_kernels
,
data_col
,
height
,
width
,
depth
,
channels
,
patch_h
,
patch_w
,
patch_d
,
dilation_h
,
dilation_w
,
dilation_d
,
pad_h
,
pad_w
,
pad_d
,
patch_d
,
dilation_h
,
dilation_w
,
dilation_d
,
pad_h
,
pad_w
,
pad_d
,
stride_h
,
stride_w
,
stride_d
,
height_col
,
width_col
,
depth_col
,
stride_h
,
stride_w
,
stride_d
,
height_col
,
width_col
,
depth_col
,
...
@@ -323,8 +315,8 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
...
@@ -323,8 +315,8 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
}
}
}
}
else
{
else
{
err
=
col2im3d_kernel_call
(
err
=
col2im3d_kernel_
s
call
(
1
,
&
n
_blocks
,
&
threads_per_block
,
0
,
1
,
&
n
um_kernels
,
0
,
num_kernels
,
data_col
,
height
,
width
,
depth
,
channels
,
patch_h
,
patch_w
,
num_kernels
,
data_col
,
height
,
width
,
depth
,
channels
,
patch_h
,
patch_w
,
patch_d
,
pad_h
,
pad_w
,
pad_d
,
stride_h
,
stride_w
,
stride_d
,
patch_d
,
pad_h
,
pad_w
,
pad_d
,
stride_h
,
stride_w
,
stride_d
,
height_col
,
width_col
,
depth_col
,
data_im
,
data_im_offset
);
height_col
,
width_col
,
depth_col
,
data_im
,
data_im_offset
);
...
@@ -460,15 +452,6 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
...
@@ -460,15 +452,6 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
return
NULL
;
return
NULL
;
}
}
// Get the max threads per blocks
size_t
max_threads_dim
;
err
=
gpucontext_property
(
bottom
->
context
->
ctx
,
GA_CTX_PROP_MAXLSIZE
,
&
max_threads_dim
);
if
(
err
!=
GA_NO_ERROR
){
PyErr_Format
(
PyExc_RuntimeError
,
"Could not fetch max_threads_dim."
);
return
NULL
;
}
// Create temporary columns
// Create temporary columns
size_t
col_dim
[
2
];
size_t
col_dim
[
2
];
col_dim
[
0
]
=
nChannels
*
kW
*
kH
*
kD
;
col_dim
[
0
]
=
nChannels
*
kW
*
kH
*
kD
;
...
@@ -492,8 +475,6 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
...
@@ -492,8 +475,6 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
const
size_t
K_
=
col_dim
[
0
];
const
size_t
K_
=
col_dim
[
0
];
const
size_t
N_
=
col_dim
[
1
];
const
size_t
N_
=
col_dim
[
1
];
const
size_t
M_
=
nFilters
;
const
size_t
M_
=
nFilters
;
const
DTYPE_INPUT_0
one
=
1
.
0
f
;
const
DTYPE_INPUT_0
zero
=
0
.
0
f
;
PyGpuArrayObject
*
output
;
PyGpuArrayObject
*
output
;
if
(
direction
==
0
)
{
// forward pass
if
(
direction
==
0
)
{
// forward pass
...
@@ -502,24 +483,46 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
...
@@ -502,24 +483,46 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Iterate over batch
// Iterate over batch
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
// First, im3d2col
// First, im3d2col
err
=
im3d2col
(
max_threads_dim
,
err
=
im3d2col
(
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
bottomDepth
,
kH
,
kW
,
kD
,
dilH
,
dilW
,
dilD
,
bottomWidth
,
bottomDepth
,
kH
,
kW
,
kD
,
dilH
,
dilW
,
dilD
,
padH
,
padW
,
padD
,
dH
,
dW
,
dD
,
col
->
ga
.
data
);
padH
,
padW
,
padD
,
dH
,
dW
,
dD
,
col
->
ga
.
data
);
if
(
err
!=
GA_NO_ERROR
)
{
if
(
err
!=
GA_NO_ERROR
)
{
Py_DECREF
(
col
);
Py_DECREF
(
col
);
return
NULL
;
return
NULL
;
}
}
// Second, gemm
// Second, gemm
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
switch
(
col
->
ga
.
typecode
)
{
N_
,
M_
,
K_
,
one
,
case
GA_FLOAT
:
col
->
ga
.
data
,
0
,
N_
,
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
weight
->
ga
.
data
,
0
,
K_
,
N_
,
M_
,
K_
,
1
,
zero
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
weight
->
ga
.
data
,
0
,
K_
,
0
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
break
;
case
GA_DOUBLE
:
err
=
gpublas_dgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
N_
,
M_
,
K_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
break
;
case
GA_HALF
:
err
=
gpublas_hgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
N_
,
M_
,
K_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
break
;
default:
err
=
GA_UNSUPPORTED_ERROR
;
}
if
(
err
!=
GA_NO_ERROR
)
{
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorr3dMM encountered an error running sgemm.
\n
"
);
"GpuCorr3dMM forward encountered an error running gemm.
"
);
Py_DECREF
(
col
);
Py_DECREF
(
col
);
return
NULL
;
return
NULL
;
}
}
...
@@ -531,10 +534,10 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
...
@@ -531,10 +534,10 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Iterate over batch
// Iterate over batch
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
// First, im3d2col
// First, im3d2col
err
=
im3d2col
(
max_threads_dim
,
err
=
im3d2col
(
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
bottomDepth
,
kH
,
kW
,
kD
,
dilH
,
dilW
,
dilD
,
bottomWidth
,
bottomDepth
,
kH
,
kW
,
kD
,
dilH
,
dilW
,
dilD
,
padH
,
padW
,
padD
,
dH
,
dW
,
dD
,
col
->
ga
.
data
);
padH
,
padW
,
padD
,
dH
,
dW
,
dD
,
col
->
ga
.
data
);
if
(
err
!=
GA_NO_ERROR
)
{
if
(
err
!=
GA_NO_ERROR
)
{
Py_DECREF
(
col
);
Py_DECREF
(
col
);
return
NULL
;
return
NULL
;
...
@@ -543,15 +546,37 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
...
@@ -543,15 +546,37 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
// is faster than setting weight to all zeros before the loop.)
err
=
gpublas_sgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
switch
(
col
->
ga
.
typecode
)
{
K_
,
M_
,
N_
,
one
,
case
GA_FLOAT
:
col
->
ga
.
data
,
0
,
N_
,
err
=
gpublas_sgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
K_
,
M_
,
N_
,
1
,
(
n
==
0
)
?
zero
:
one
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
);
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
0
:
1
,
weight
->
ga
.
data
,
0
,
K_
);
break
;
case
GA_DOUBLE
:
err
=
gpublas_dgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
K_
,
M_
,
N_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
0
:
1
,
weight
->
ga
.
data
,
0
,
K_
);
break
;
case
GA_HALF
:
err
=
gpublas_hgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
K_
,
M_
,
N_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
0
:
1
,
weight
->
ga
.
data
,
0
,
K_
);
break
;
default:
err
=
GA_UNSUPPORTED_ERROR
;
}
if
(
err
!=
GA_NO_ERROR
)
{
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorr3dMM encountered an error running sgemm.
\n
"
);
"GpuCorr3dMM grad weights encountered an error running gemm.
"
);
Py_DECREF
(
col
);
Py_DECREF
(
col
);
return
NULL
;
return
NULL
;
}
}
...
@@ -562,29 +587,50 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
...
@@ -562,29 +587,50 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// full convolution: gemm, then col2im3d
// full convolution: gemm, then col2im3d
// Iterate over batch
// Iterate over batch
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
// gemm into columns
// gemm into columns
switch
(
top
->
ga
.
typecode
)
{
case
GA_FLOAT
:
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
N_
,
K_
,
M_
,
one
,
N_
,
K_
,
M_
,
1
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
weight
->
ga
.
data
,
0
,
K_
,
zero
,
0
,
col
->
ga
.
data
,
0
,
N_
);
col
->
ga
.
data
,
0
,
N_
);
if
(
err
!=
GA_NO_ERROR
)
{
break
;
PyErr_Format
(
PyExc_RuntimeError
,
case
GA_DOUBLE
:
"GpuCorr3dMM encountered an error running sgemm.
\n
"
);
err
=
gpublas_dgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
Py_DECREF
(
col
);
N_
,
K_
,
M_
,
1
,
return
NULL
;
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
}
weight
->
ga
.
data
,
0
,
K_
,
// col2im3d back to the data
0
,
err
=
col2im3d
(
max_threads_dim
,
col
->
ga
.
data
,
0
,
N_
);
col
->
ga
.
data
,
nChannels
,
break
;
bottomHeight
,
bottomWidth
,
bottomDepth
,
case
GA_HALF
:
kH
,
kW
,
kD
,
dilH
,
dilW
,
dilD
,
padH
,
padW
,
padD
,
err
=
gpublas_hgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
dH
,
dW
,
dD
,
bottom
->
ga
.
data
,
n
*
bottom_stride
);
N_
,
K_
,
M_
,
1
,
if
(
err
!=
GA_NO_ERROR
)
{
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
Py_DECREF
(
col
);
weight
->
ga
.
data
,
0
,
K_
,
return
NULL
;
0
,
}
col
->
ga
.
data
,
0
,
N_
);
break
;
default:
err
=
GA_UNSUPPORTED_ERROR
;
}
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorr3dMM grad inputs encountered an error running gemm."
);
Py_DECREF
(
col
);
return
NULL
;
}
// col2im3d back to the data
err
=
col2im3d
(
col
->
ga
.
data
,
nChannels
,
bottomHeight
,
bottomWidth
,
bottomDepth
,
kH
,
kW
,
kD
,
dilH
,
dilW
,
dilD
,
padH
,
padW
,
padD
,
dH
,
dW
,
dD
,
bottom
->
ga
.
data
,
n
*
bottom_stride
);
if
(
err
!=
GA_NO_ERROR
)
{
Py_DECREF
(
col
);
return
NULL
;
}
}
}
}
}
// Free temporary columns
// Free temporary columns
...
...
theano/gpuarray/corr_gemm.c
浏览文件 @
cea45e8b
...
@@ -195,8 +195,7 @@ KERNEL void col2im_kernel(const ga_size n,
...
@@ -195,8 +195,7 @@ KERNEL void col2im_kernel(const ga_size n,
#section support_code_struct
#section support_code_struct
int
im2col
(
const
size_t
max_threads_dim
,
int
im2col
(
gpudata
*
data_im
,
const
size_t
data_im_offset
,
const
size_t
channels
,
gpudata
*
data_im
,
const
size_t
data_im_offset
,
const
size_t
channels
,
const
size_t
height
,
const
size_t
width
,
const
size_t
kernel_h
,
const
size_t
kernel_w
,
const
size_t
height
,
const
size_t
width
,
const
size_t
kernel_h
,
const
size_t
kernel_w
,
const
size_t
dilation_h
,
const
size_t
dilation_w
,
const
size_t
dilation_h
,
const
size_t
dilation_w
,
const
size_t
pad_h
,
const
size_t
pad_w
,
const
size_t
pad_h
,
const
size_t
pad_w
,
...
@@ -209,13 +208,10 @@ int im2col(const size_t max_threads_dim,
...
@@ -209,13 +208,10 @@ int im2col(const size_t max_threads_dim,
size_t
height_col
=
(
height
+
2
*
pad_h
-
dil_kernel_h
)
/
stride_h
+
1
;
size_t
height_col
=
(
height
+
2
*
pad_h
-
dil_kernel_h
)
/
stride_h
+
1
;
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_kernel_w
)
/
stride_w
+
1
;
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_kernel_w
)
/
stride_w
+
1
;
size_t
num_kernels
=
channels
*
height_col
*
width_col
;
size_t
num_kernels
=
channels
*
height_col
*
width_col
;
size_t
threads_per_block
=
max_threads_dim
;
size_t
n_blocks
=
(
num_kernels
+
threads_per_block
-
1
)
/
threads_per_block
;
int
err
;
int
err
;
GpuKernel
*
kernel
;
if
(
dilation_h
!=
1
||
dilation_w
!=
1
)
{
if
(
dilation_h
!=
1
||
dilation_w
!=
1
){
err
=
dilated_im2col_kernel_scall
(
err
=
dilated_im2col_kernel_call
(
1
,
&
num_kernels
,
0
,
1
,
&
n_blocks
,
&
threads_per_block
,
0
,
num_kernels
,
data_im
,
data_im_offset
,
height
,
width
,
kernel_h
,
kernel_w
,
num_kernels
,
data_im
,
data_im_offset
,
height
,
width
,
kernel_h
,
kernel_w
,
dilation_h
,
dilation_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
height_col
,
dilation_h
,
dilation_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
height_col
,
width_col
,
data_col
);
width_col
,
data_col
);
...
@@ -224,10 +220,9 @@ int im2col(const size_t max_threads_dim,
...
@@ -224,10 +220,9 @@ int im2col(const size_t max_threads_dim,
"gpuarray error: dilated_im2col_kernel: %s."
,
"gpuarray error: dilated_im2col_kernel: %s."
,
GpuKernel_error
(
&
k_dilated_im2col_kernel
,
err
));
GpuKernel_error
(
&
k_dilated_im2col_kernel
,
err
));
}
}
}
}
else
{
else
{
err
=
im2col_kernel_scall
(
err
=
im2col_kernel_call
(
1
,
&
num_kernels
,
0
,
1
,
&
n_blocks
,
&
threads_per_block
,
0
,
num_kernels
,
data_im
,
data_im_offset
,
height
,
width
,
kernel_h
,
kernel_w
,
num_kernels
,
data_im
,
data_im_offset
,
height
,
width
,
kernel_h
,
kernel_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
height_col
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
height_col
,
width_col
,
data_col
);
width_col
,
data_col
);
...
@@ -240,7 +235,7 @@ int im2col(const size_t max_threads_dim,
...
@@ -240,7 +235,7 @@ int im2col(const size_t max_threads_dim,
return
err
;
return
err
;
}
}
int
col2im
(
const
size_t
max_threads_dim
,
gpudata
*
data_col
,
const
size_t
channels
,
int
col2im
(
gpudata
*
data_col
,
const
size_t
channels
,
const
size_t
height
,
const
size_t
width
,
const
size_t
patch_h
,
const
size_t
patch_w
,
const
size_t
height
,
const
size_t
width
,
const
size_t
patch_h
,
const
size_t
patch_w
,
const
size_t
dilation_h
,
const
size_t
dilation_w
,
const
size_t
dilation_h
,
const
size_t
dilation_w
,
const
size_t
pad_h
,
const
size_t
pad_w
,
const
size_t
stride_h
,
const
size_t
pad_h
,
const
size_t
pad_w
,
const
size_t
stride_h
,
...
@@ -250,14 +245,12 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
...
@@ -250,14 +245,12 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
size_t
height_col
=
(
height
+
2
*
pad_h
-
dil_patch_h
)
/
stride_h
+
1
;
size_t
height_col
=
(
height
+
2
*
pad_h
-
dil_patch_h
)
/
stride_h
+
1
;
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_patch_w
)
/
stride_w
+
1
;
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_patch_w
)
/
stride_w
+
1
;
size_t
num_kernels
=
channels
*
height
*
width
;
size_t
num_kernels
=
channels
*
height
*
width
;
size_t
threads_per_block
=
max_threads_dim
;
size_t
n_blocks
=
(
num_kernels
+
threads_per_block
-
1
)
/
threads_per_block
;
// To avoid involving atomic operations, we will launch one kernel per
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
// bottom dimension, and then in the kernel add up the top dimensions.
int
err
;
int
err
;
if
(
dilation_h
!=
1
||
dilation_w
!=
1
)
{
if
(
dilation_h
!=
1
||
dilation_w
!=
1
)
{
err
=
dilated_col2im_kernel_call
(
err
=
dilated_col2im_kernel_
s
call
(
1
,
&
n
_blocks
,
&
threads_per_block
,
0
,
1
,
&
n
um_kernels
,
0
,
num_kernels
,
data_col
,
height
,
width
,
channels
,
patch_h
,
patch_w
,
num_kernels
,
data_col
,
height
,
width
,
channels
,
patch_h
,
patch_w
,
dilation_h
,
dilation_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
height_col
,
width_col
,
data_im
,
data_im_offset
);
height_col
,
width_col
,
data_im
,
data_im_offset
);
...
@@ -266,10 +259,9 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
...
@@ -266,10 +259,9 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
"gpuarray error: dilated_col2im_kernel: %s."
,
"gpuarray error: dilated_col2im_kernel: %s."
,
GpuKernel_error
(
&
k_dilated_col2im_kernel
,
err
));
GpuKernel_error
(
&
k_dilated_col2im_kernel
,
err
));
}
}
}
}
else
{
else
{
err
=
col2im_kernel_scall
(
err
=
col2im_kernel_call
(
1
,
&
num_kernels
,
0
,
1
,
&
n_blocks
,
&
threads_per_block
,
0
,
num_kernels
,
data_col
,
height
,
width
,
channels
,
patch_h
,
patch_w
,
num_kernels
,
data_col
,
height
,
width
,
channels
,
patch_h
,
patch_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
height_col
,
width_col
,
data_im
,
data_im_offset
);
height_col
,
width_col
,
data_im
,
data_im_offset
);
...
@@ -393,15 +385,6 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
...
@@ -393,15 +385,6 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return
NULL
;
return
NULL
;
}
}
// Get the max threads per blocks
size_t
max_threads_dim
;
err
=
gpucontext_property
(
bottom
->
context
->
ctx
,
GA_CTX_PROP_MAXLSIZE
,
&
max_threads_dim
);
if
(
err
!=
GA_NO_ERROR
){
PyErr_Format
(
PyExc_RuntimeError
,
"Could not fetch max_threads_dim."
);
return
NULL
;
}
// Create temporary columns
// Create temporary columns
size_t
col_dim
[
2
];
size_t
col_dim
[
2
];
col_dim
[
0
]
=
nChannels
*
kW
*
kH
;
col_dim
[
0
]
=
nChannels
*
kW
*
kH
;
...
@@ -411,8 +394,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
...
@@ -411,8 +394,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
GA_C_ORDER
,
GA_C_ORDER
,
bottom
->
context
,
bottom
->
context
,
Py_None
);
Py_None
);
if
(
NULL
==
col
)
if
(
NULL
==
col
)
{
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorrMM failed to allocate working memory of %ld x %ld
\n
"
,
"GpuCorrMM failed to allocate working memory of %ld x %ld
\n
"
,
col_dim
[
0
],
col_dim
[
1
]);
col_dim
[
0
],
col_dim
[
1
]);
...
@@ -425,8 +407,6 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
...
@@ -425,8 +407,6 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const
size_t
K_
=
col_dim
[
0
];
const
size_t
K_
=
col_dim
[
0
];
const
size_t
N_
=
col_dim
[
1
];
const
size_t
N_
=
col_dim
[
1
];
const
size_t
M_
=
nFilters
;
const
size_t
M_
=
nFilters
;
const
DTYPE_INPUT_0
one
=
1
.
0
f
;
const
DTYPE_INPUT_0
zero
=
0
.
0
f
;
PyGpuArrayObject
*
output
;
PyGpuArrayObject
*
output
;
if
(
direction
==
0
)
{
// forward pass
if
(
direction
==
0
)
{
// forward pass
...
@@ -435,8 +415,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
...
@@ -435,8 +415,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch
// Iterate over batch
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
// First, im2col
// First, im2col
err
=
im2col
(
max_threads_dim
,
err
=
im2col
(
bottom
->
ga
.
data
,
n
*
bottom_stride
,
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
dH
,
dW
,
col
->
ga
.
data
);
padH
,
padW
,
dH
,
dW
,
col
->
ga
.
data
);
if
(
err
!=
GA_NO_ERROR
)
{
if
(
err
!=
GA_NO_ERROR
)
{
...
@@ -444,15 +424,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
...
@@ -444,15 +424,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return
NULL
;
return
NULL
;
}
}
// Second, gemm
// Second, gemm
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
switch
(
col
->
ga
.
typecode
)
{
N_
,
M_
,
K_
,
one
,
case
GA_FLOAT
:
col
->
ga
.
data
,
0
,
N_
,
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
weight
->
ga
.
data
,
0
,
K_
,
N_
,
M_
,
K_
,
1
,
zero
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
weight
->
ga
.
data
,
0
,
K_
,
0
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
break
;
case
GA_DOUBLE
:
err
=
gpublas_dgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
N_
,
M_
,
K_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
break
;
case
GA_HALF
:
err
=
gpublas_hgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
N_
,
M_
,
K_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
break
;
default:
err
=
GA_UNSUPPORTED_ERROR
;
}
if
(
err
!=
GA_NO_ERROR
)
{
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorrMM encountered an error running sgemm.
\n
"
);
"GpuCorrMM forward encountered an error running gemm: %d"
,
err
);
Py_DECREF
(
col
);
Py_DECREF
(
col
);
return
NULL
;
return
NULL
;
}
}
...
@@ -464,8 +466,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
...
@@ -464,8 +466,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch
// Iterate over batch
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
// First, im2col
// First, im2col
err
=
im2col
(
max_threads_dim
,
err
=
im2col
(
bottom
->
ga
.
data
,
n
*
bottom_stride
,
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
dH
,
dW
,
col
->
ga
.
data
);
padH
,
padW
,
dH
,
dW
,
col
->
ga
.
data
);
if
(
err
!=
GA_NO_ERROR
)
{
if
(
err
!=
GA_NO_ERROR
)
{
...
@@ -476,15 +478,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
...
@@ -476,15 +478,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
// is faster than setting weight to all zeros before the loop.)
err
=
gpublas_sgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
switch
(
col
->
ga
.
typecode
)
{
K_
,
M_
,
N_
,
one
,
case
GA_FLOAT
:
col
->
ga
.
data
,
0
,
N_
,
err
=
gpublas_sgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
K_
,
M_
,
N_
,
1
,
(
n
==
0
)
?
zero
:
one
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
);
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
0
:
1
,
weight
->
ga
.
data
,
0
,
K_
);
break
;
case
GA_DOUBLE
:
err
=
gpublas_dgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
K_
,
M_
,
N_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
0
:
1
,
weight
->
ga
.
data
,
0
,
K_
);
break
;
case
GA_HALF
:
err
=
gpublas_hgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
K_
,
M_
,
N_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
0
:
1
,
weight
->
ga
.
data
,
0
,
K_
);
break
;
default:
err
=
GA_UNSUPPORTED_ERROR
;
}
if
(
err
!=
GA_NO_ERROR
)
{
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorrMM encountered an error running sgemm.
\n
"
);
"GpuCorrMM grad weights encountered an error running gemm: %d"
,
err
);
Py_DECREF
(
col
);
Py_DECREF
(
col
);
return
NULL
;
return
NULL
;
}
}
...
@@ -496,21 +520,42 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
...
@@ -496,21 +520,42 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch
// Iterate over batch
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
// gemm into columns
// gemm into columns
switch
(
top
->
ga
.
typecode
)
{
case
GA_FLOAT
:
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
N_
,
K_
,
M_
,
one
,
N_
,
K_
,
M_
,
1
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
col
->
ga
.
data
,
0
,
N_
);
break
;
case
GA_DOUBLE
:
err
=
gpublas_dgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
N_
,
K_
,
M_
,
1
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
col
->
ga
.
data
,
0
,
N_
);
break
;
case
GA_HALF
:
err
=
gpublas_hgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
N_
,
K_
,
M_
,
1
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
weight
->
ga
.
data
,
0
,
K_
,
zero
,
0
,
col
->
ga
.
data
,
0
,
N_
);
col
->
ga
.
data
,
0
,
N_
);
break
;
default:
err
=
GA_UNSUPPORTED_ERROR
;
}
if
(
err
!=
GA_NO_ERROR
)
{
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorrMM encountered an error running sgemm.
\n
"
);
"GpuCorrMM grad inputs encountered an error running gemm: %d"
,
err
);
Py_DECREF
(
col
);
Py_DECREF
(
col
);
return
NULL
;
return
NULL
;
}
}
// col2im back to the data
// col2im back to the data
err
=
col2im
(
max_threads_dim
,
err
=
col2im
(
col
->
ga
.
data
,
nChannels
,
bottomHeight
,
bottomWidth
,
col
->
ga
.
data
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
dH
,
dW
,
bottom
->
ga
.
data
,
n
*
bottom_stride
);
dH
,
dW
,
bottom
->
ga
.
data
,
n
*
bottom_stride
);
if
(
err
!=
GA_NO_ERROR
)
{
if
(
err
!=
GA_NO_ERROR
)
{
...
...
theano/gpuarray/elemwise.py
浏览文件 @
cea45e8b
...
@@ -613,6 +613,15 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -613,6 +613,15 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
c_support_code
(
self
):
return
"""
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a
%
b) ? 1: 0);
}
"""
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
x
,
=
inp
x
,
=
inp
z
,
=
out
z
,
=
out
...
...
theano/gpuarray/neighbours.py
浏览文件 @
cea45e8b
...
@@ -242,6 +242,15 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -242,6 +242,15 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
flags
=
flags
,
objvar
=
k_var
))
flags
=
flags
,
objvar
=
k_var
))
return
kernels
return
kernels
def
c_support_code
(
self
):
return
"""
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a
%
b) ? 1: 0);
}
"""
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
dtype_ten4
=
node
.
inputs
[
0
]
.
dtype
dtype_ten4
=
node
.
inputs
[
0
]
.
dtype
dtype_neib_shape
=
node
.
inputs
[
1
]
.
dtype
dtype_neib_shape
=
node
.
inputs
[
1
]
.
dtype
...
...
theano/gpuarray/tests/config.py
浏览文件 @
cea45e8b
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
from
nose.plugins.skip
import
SkipTest
from
nose.plugins.skip
import
SkipTest
import
theano.tensor
import
theano.gpuarray
import
theano.gpuarray
if
theano
.
gpuarray
.
pygpu
is
None
:
if
theano
.
gpuarray
.
pygpu
is
None
:
...
@@ -21,3 +22,10 @@ if theano.config.mode == 'FAST_COMPILE':
...
@@ -21,3 +22,10 @@ if theano.config.mode == 'FAST_COMPILE':
else
:
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpuarray'
)
.
excluding
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpuarray'
)
.
excluding
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpuarray'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpuarray'
)
# If using float16, cast reference input to float32
def
ref_cast
(
x
):
if
x
.
type
.
dtype
==
'float16'
:
x
=
theano
.
tensor
.
cast
(
x
,
'float32'
)
return
x
theano/gpuarray/tests/test_dnn.py
浏览文件 @
cea45e8b
...
@@ -17,7 +17,7 @@ from .. import dnn
...
@@ -17,7 +17,7 @@ from .. import dnn
from
..basic_ops
import
GpuAllocEmpty
from
..basic_ops
import
GpuAllocEmpty
from
..type
import
gpuarray_shared_constructor
from
..type
import
gpuarray_shared_constructor
from
.config
import
mode_with_gpu
,
mode_without_gpu
,
test_ctx_name
from
.config
import
mode_with_gpu
,
mode_without_gpu
,
test_ctx_name
,
ref_cast
from
.
import
test_nnet
from
.
import
test_nnet
from
.rnn_support
import
Model
,
GRU
,
LSTM
,
WrapperLayer
from
.rnn_support
import
Model
,
GRU
,
LSTM
,
WrapperLayer
...
@@ -33,13 +33,6 @@ def set_precision(floatX):
...
@@ -33,13 +33,6 @@ def set_precision(floatX):
return
precision
return
precision
# If using float16, cast reference input to float32
def
ref_cast
(
x
):
if
theano
.
config
.
floatX
==
'float16'
:
x
=
T
.
cast
(
x
,
'float32'
)
return
x
def
test_dnn_conv_desc_merge
():
def
test_dnn_conv_desc_merge
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
...
...
theano/gpuarray/tests/test_gemmcorr.py
浏览文件 @
cea45e8b
...
@@ -3,13 +3,14 @@ import unittest
...
@@ -3,13 +3,14 @@ import unittest
import
numpy
import
numpy
import
theano
import
theano
from
theano
import
config
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
from
theano.tensor.nnet.corr
import
CorrMM
,
CorrMM_gradWeights
,
CorrMM_gradInputs
from
theano.tensor.nnet.corr
import
CorrMM
,
CorrMM_gradWeights
,
CorrMM_gradInputs
from
..type
import
gpuarray_shared_constructor
from
..type
import
gpuarray_shared_constructor
from
..blas
import
GpuCorrMM
,
GpuCorrMM_gradWeights
,
GpuCorrMM_gradInputs
from
..blas
import
GpuCorrMM
,
GpuCorrMM_gradWeights
,
GpuCorrMM_gradInputs
from
.config
import
mode_with_gpu
,
mode_without_gpu
from
.config
import
mode_with_gpu
,
mode_without_gpu
,
ref_cast
class
TestCorrMM
(
unittest
.
TestCase
):
class
TestCorrMM
(
unittest
.
TestCase
):
...
@@ -22,15 +23,16 @@ class TestCorrMM(unittest.TestCase):
...
@@ -22,15 +23,16 @@ class TestCorrMM(unittest.TestCase):
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
'float32'
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
conv_ref
=
CorrMM
(
border_mode
=
border_mode
,
conv_ref
=
CorrMM
(
border_mode
=
border_mode
,
filter_dilation
=
filter_dilation
,
filter_dilation
=
filter_dilation
,
subsample
=
subsample
)(
inputs
,
filters
)
subsample
=
subsample
)(
ref_cast
(
inputs
),
ref_cast
(
filters
))
f_ref
=
theano
.
function
([],
conv_ref
,
mode
=
mode_without_gpu
)
f_ref
=
theano
.
function
([],
conv_ref
,
mode
=
mode_without_gpu
)
conv
=
GpuCorrMM
(
border_mode
=
border_mode
,
conv
=
GpuCorrMM
(
border_mode
=
border_mode
,
...
@@ -120,20 +122,20 @@ class TestCorrMM(unittest.TestCase):
...
@@ -120,20 +122,20 @@ class TestCorrMM(unittest.TestCase):
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
dCdH_shape
=
[
dCdH_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
dCdH_shape
=
[
dCdH_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
dCdH_val
=
numpy
.
random
.
random
(
dCdH_shape
)
.
astype
(
'float32'
)
dCdH_val
=
numpy
.
random
.
random
(
dCdH_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
dCdH
=
gpuarray_shared_constructor
(
dCdH_val
)
dCdH
=
gpuarray_shared_constructor
(
dCdH_val
)
shape
=
gpuarray_shared_constructor
(
numpy
.
array
(
filters_shape
[
2
:]))
shape
=
gpuarray_shared_constructor
(
numpy
.
array
(
filters_shape
[
2
:]))
if
(
subsample
==
(
1
,
1
)):
if
(
subsample
==
(
1
,
1
)):
conv_ref
=
CorrMM_gradWeights
(
subsample
=
subsample
)(
conv_ref
=
CorrMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
)
ref_cast
(
inputs
),
ref_cast
(
dCdH
)
)
conv_gemm
=
GpuCorrMM_gradWeights
(
subsample
=
subsample
)(
conv_gemm
=
GpuCorrMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
)
inputs
,
dCdH
)
else
:
else
:
conv_ref
=
CorrMM_gradWeights
(
subsample
=
subsample
)(
conv_ref
=
CorrMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
,
shape
=
shape
)
ref_cast
(
inputs
),
ref_cast
(
dCdH
)
,
shape
=
shape
)
conv_gemm
=
GpuCorrMM_gradWeights
(
subsample
=
subsample
)(
conv_gemm
=
GpuCorrMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
,
shape
=
shape
)
inputs
,
dCdH
,
shape
=
shape
)
...
@@ -167,8 +169,8 @@ class TestCorrMM(unittest.TestCase):
...
@@ -167,8 +169,8 @@ class TestCorrMM(unittest.TestCase):
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
'float32'
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
...
@@ -178,12 +180,13 @@ class TestCorrMM(unittest.TestCase):
...
@@ -178,12 +180,13 @@ class TestCorrMM(unittest.TestCase):
if
(
subsample
==
(
1
,
1
)):
if
(
subsample
==
(
1
,
1
)):
conv_ref
=
CorrMM_gradInputs
(
subsample
=
subsample
)(
conv_ref
=
CorrMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
)
kern
=
ref_cast
(
filters
),
topgrad
=
ref_cast
(
inputs
)
)
conv_gemm
=
GpuCorrMM_gradInputs
(
subsample
=
subsample
)(
conv_gemm
=
GpuCorrMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
)
kern
=
filters
,
topgrad
=
inputs
)
else
:
else
:
conv_ref
=
CorrMM_gradInputs
(
subsample
=
subsample
)(
conv_ref
=
CorrMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
,
shape
=
bottom_shape
)
kern
=
ref_cast
(
filters
),
topgrad
=
ref_cast
(
inputs
),
shape
=
bottom_shape
)
conv_gemm
=
GpuCorrMM_gradInputs
(
subsample
=
subsample
)(
conv_gemm
=
GpuCorrMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
,
shape
=
bottom_shape
)
kern
=
filters
,
topgrad
=
inputs
,
shape
=
bottom_shape
)
...
...
theano/gpuarray/tests/test_gemmcorr3d.py
浏览文件 @
cea45e8b
...
@@ -3,13 +3,14 @@ import unittest
...
@@ -3,13 +3,14 @@ import unittest
import
numpy
import
numpy
import
theano
import
theano
from
theano
import
config
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
from
theano.tensor.nnet.corr3d
import
Corr3dMM
,
Corr3dMM_gradWeights
,
Corr3dMM_gradInputs
from
theano.tensor.nnet.corr3d
import
Corr3dMM
,
Corr3dMM_gradWeights
,
Corr3dMM_gradInputs
from
..type
import
gpuarray_shared_constructor
from
..type
import
gpuarray_shared_constructor
from
..blas
import
GpuCorr3dMM
,
GpuCorr3dMM_gradWeights
,
GpuCorr3dMM_gradInputs
from
..blas
import
GpuCorr3dMM
,
GpuCorr3dMM_gradWeights
,
GpuCorr3dMM_gradInputs
from
.config
import
mode_with_gpu
,
mode_without_gpu
from
.config
import
mode_with_gpu
,
mode_without_gpu
,
ref_cast
class
TestCorr3dMM
(
unittest
.
TestCase
):
class
TestCorr3dMM
(
unittest
.
TestCase
):
...
@@ -22,15 +23,15 @@ class TestCorr3dMM(unittest.TestCase):
...
@@ -22,15 +23,15 @@ class TestCorr3dMM(unittest.TestCase):
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
'float32'
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
conv_ref
=
Corr3dMM
(
border_mode
=
border_mode
,
conv_ref
=
Corr3dMM
(
border_mode
=
border_mode
,
filter_dilation
=
filter_dilation
,
filter_dilation
=
filter_dilation
,
subsample
=
subsample
)(
inputs
,
filters
)
subsample
=
subsample
)(
ref_cast
(
inputs
),
ref_cast
(
filters
)
)
f_ref
=
theano
.
function
([],
conv_ref
,
mode
=
mode_without_gpu
)
f_ref
=
theano
.
function
([],
conv_ref
,
mode
=
mode_without_gpu
)
conv
=
GpuCorr3dMM
(
border_mode
=
border_mode
,
conv
=
GpuCorr3dMM
(
border_mode
=
border_mode
,
...
@@ -120,20 +121,20 @@ class TestCorr3dMM(unittest.TestCase):
...
@@ -120,20 +121,20 @@ class TestCorr3dMM(unittest.TestCase):
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
dCdH_shape
=
[
dCdH_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
dCdH_shape
=
[
dCdH_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
dCdH_val
=
numpy
.
random
.
random
(
dCdH_shape
)
.
astype
(
'float32'
)
dCdH_val
=
numpy
.
random
.
random
(
dCdH_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
dCdH
=
gpuarray_shared_constructor
(
dCdH_val
)
dCdH
=
gpuarray_shared_constructor
(
dCdH_val
)
shape
=
gpuarray_shared_constructor
(
numpy
.
array
(
filters_shape
[
2
:]))
shape
=
gpuarray_shared_constructor
(
numpy
.
array
(
filters_shape
[
2
:]))
if
(
subsample
==
(
1
,
1
,
1
)):
if
(
subsample
==
(
1
,
1
,
1
)):
conv_ref
=
Corr3dMM_gradWeights
(
subsample
=
subsample
)(
conv_ref
=
Corr3dMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
)
ref_cast
(
inputs
),
ref_cast
(
dCdH
)
)
conv_gemm
=
GpuCorr3dMM_gradWeights
(
subsample
=
subsample
)(
conv_gemm
=
GpuCorr3dMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
)
inputs
,
dCdH
)
else
:
else
:
conv_ref
=
Corr3dMM_gradWeights
(
subsample
=
subsample
)(
conv_ref
=
Corr3dMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
,
shape
=
shape
)
ref_cast
(
inputs
),
ref_cast
(
dCdH
)
,
shape
=
shape
)
conv_gemm
=
GpuCorr3dMM_gradWeights
(
subsample
=
subsample
)(
conv_gemm
=
GpuCorr3dMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
,
shape
=
shape
)
inputs
,
dCdH
,
shape
=
shape
)
...
@@ -167,8 +168,8 @@ class TestCorr3dMM(unittest.TestCase):
...
@@ -167,8 +168,8 @@ class TestCorr3dMM(unittest.TestCase):
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
'float32'
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
...
@@ -179,12 +180,12 @@ class TestCorr3dMM(unittest.TestCase):
...
@@ -179,12 +180,12 @@ class TestCorr3dMM(unittest.TestCase):
if
(
subsample
==
(
1
,
1
,
1
)):
if
(
subsample
==
(
1
,
1
,
1
)):
conv_ref
=
Corr3dMM_gradInputs
(
subsample
=
subsample
)(
conv_ref
=
Corr3dMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
)
kern
=
ref_cast
(
filters
),
topgrad
=
ref_cast
(
inputs
)
)
conv_gemm
=
GpuCorr3dMM_gradInputs
(
subsample
=
subsample
)(
conv_gemm
=
GpuCorr3dMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
)
kern
=
filters
,
topgrad
=
inputs
)
else
:
else
:
conv_ref
=
Corr3dMM_gradInputs
(
subsample
=
subsample
)(
conv_ref
=
Corr3dMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
,
shape
=
bottom_shape
)
kern
=
ref_cast
(
filters
),
topgrad
=
ref_cast
(
inputs
)
,
shape
=
bottom_shape
)
conv_gemm
=
GpuCorr3dMM_gradInputs
(
subsample
=
subsample
)(
conv_gemm
=
GpuCorr3dMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
,
shape
=
bottom_shape
)
kern
=
filters
,
topgrad
=
inputs
,
shape
=
bottom_shape
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论