Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
cea45e8b
提交
cea45e8b
authored
11月 08, 2016
作者:
Frédéric Bastien
提交者:
GitHub
11月 08, 2016
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #5174 from abergeron/cormm_f16
Make corrMM work in float16/64
上级
29af0e5b
d79d38c1
隐藏空白字符变更
内嵌
并排
正在显示
10 个修改的文件
包含
329 行增加
和
185 行删除
+329
-185
basic_ops.py
theano/gpuarray/basic_ops.py
+31
-14
blas.py
theano/gpuarray/blas.py
+21
-8
corr3d_gemm.c
theano/gpuarray/corr3d_gemm.c
+119
-73
corr_gemm.c
theano/gpuarray/corr_gemm.c
+103
-58
elemwise.py
theano/gpuarray/elemwise.py
+9
-0
neighbours.py
theano/gpuarray/neighbours.py
+9
-0
config.py
theano/gpuarray/tests/config.py
+8
-0
test_dnn.py
theano/gpuarray/tests/test_dnn.py
+1
-8
test_gemmcorr.py
theano/gpuarray/tests/test_gemmcorr.py
+15
-12
test_gemmcorr3d.py
theano/gpuarray/tests/test_gemmcorr3d.py
+13
-12
没有找到文件。
theano/gpuarray/basic_ops.py
浏览文件 @
cea45e8b
...
...
@@ -173,11 +173,15 @@ class Kernel(object):
fname: str
the name of the function wrapper.
(defaults to name + `_call`)
sname: str
the name of the scheduled call function
(defaults to name _ `_scall`)
"""
def
__init__
(
self
,
code
,
params
,
name
,
flags
,
codevar
=
None
,
binvar
=
None
,
objvar
=
None
,
fname
=
None
):
codevar
=
None
,
binvar
=
None
,
objvar
=
None
,
fname
=
None
,
sname
=
None
):
self
.
code
=
code
self
.
params
=
params
self
.
name
=
name
...
...
@@ -194,6 +198,9 @@ class Kernel(object):
if
fname
is
None
:
fname
=
name
+
'_call'
self
.
fname
=
fname
if
sname
is
None
:
sname
=
name
+
'_scall'
self
.
sname
=
sname
@staticmethod
def
get_flags
(
*
types
):
...
...
@@ -338,22 +345,30 @@ class GpuKernelBase(object):
setargs
=
'
\n
'
.
join
(
setargs
)
return
"""
int {fname}(unsigned int
nd, size_t *gdim, size_t *ldim, size_t
shared,
int {fname}(unsigned int
_nd, size_t *_gdim, size_t *_ldim, size_t _
shared,
{args}) {{
{setargs}
return GpuKernel_call(&{kname},
nd, ldim, gdim,
shared, NULL);
return GpuKernel_call(&{kname},
_nd, _ldim, _gdim, _
shared, NULL);
}}
"""
.
format
(
args
=
args
,
fname
=
k
.
fname
,
setargs
=
setargs
,
kname
=
k
.
objvar
)
def
c_support_code
(
self
):
return
"""
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a
%
b) ? 1: 0);
}
"""
int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
size_t _ls = 0;
size_t _gs = 0;
int _err;
if (_nd != 1) return GA_UNSUPPORTED_ERROR;
_err = GpuKernel_sched(&{kname}, _n[0], &_ls, &_gs);
if (_err != GA_NO_ERROR)
return _err;
{setargs}
return GpuKernel_call(&{kname}, 1, &_ls, &_gs, _shared, NULL);
}}
"""
.
format
(
args
=
args
,
fname
=
k
.
fname
,
setargs
=
setargs
,
sname
=
k
.
sname
,
kname
=
k
.
objvar
)
def
c_support_code_apply
(
self
,
node
,
name
):
kernels
=
self
.
gpu_kernels
(
node
,
name
)
...
...
@@ -428,7 +443,7 @@ int {fname}(unsigned int nd, size_t *gdim, size_t *ldim, size_t shared,
The node that we need the cache version for.
"""
return
(
6
,
self
.
get_params
(
node
)
.
bin_id
)
return
(
7
,
self
.
get_params
(
node
)
.
bin_id
)
def
forward_string_meth
(
name
):
...
...
@@ -466,12 +481,14 @@ class CGpuKernelBase(COp, GpuKernelBase):
kernel_re
=
re
.
compile
(
r'^#kernel ([a-zA-Z_].*?)$'
,
re
.
MULTILINE
)
c_support_code
=
forward_string_meth
(
'c_support_code'
)
c_support_code_apply
=
forward_string_meth
(
'c_support_code_apply'
)
c_support_code_struct
=
forward_string_meth
(
'c_support_code_struct'
)
c_init_code_struct
=
forward_string_meth
(
'c_init_code_struct'
)
c_cleanup_code_struct
=
forward_string_meth
(
'c_cleanup_code_struct'
)
def
c_code_cache_version_apply
(
self
,
node
):
return
GpuKernelBase
.
c_code_cache_version_apply
(
self
,
node
)
def
_type_macros
(
self
,
node
):
define_template
=
"#define
%
s
%
s
\n
"
undef_template
=
"#undef
%
s
\n
"
...
...
theano/gpuarray/blas.py
浏览文件 @
cea45e8b
...
...
@@ -414,7 +414,7 @@ gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace
=
GpuGemmBatch
(
inplace
=
True
)
class
BaseGpuCorrMM
(
CGpuKernelBase
,
BlasOp
):
class
BaseGpuCorrMM
(
CGpuKernelBase
):
"""
Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
`GpuCorrMM_gradInputs`. Cannot be used directly.
...
...
@@ -429,9 +429,9 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1)).
"""
check_broadcast
=
False
__props__
=
(
'border_mode'
,
'subsample'
,
'filter_dilation'
)
_f16_ok
=
True
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
),
filter_dilation
=
(
1
,
1
)):
...
...
@@ -489,9 +489,15 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
def
get_params
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
def
c_headers
(
self
):
return
[
"<gpuarray/array.h>"
,
"<gpuarray/blas.h>"
,
"gpuarray_helper.h"
]
def
c_header_dirs
(
self
):
return
[
os
.
path
.
dirname
(
__file__
)]
def
c_code_cache_version
(
self
):
#
raise this whenever modifying any of the support_code_files
return
(
0
,
2
)
#
Raise this whenever modifying the code below.
return
(
2
,
)
def
c_code_helper
(
self
,
bottom
,
weights
,
top
,
direction
,
sub
,
height
=
None
,
width
=
None
):
"""
...
...
@@ -953,7 +959,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
return
[[
1
],
[
1
],
[
0
],
[
0
]]
# no connection to height, width
class
BaseGpuCorr3dMM
(
CGpuKernelBase
,
BlasOp
):
class
BaseGpuCorr3dMM
(
CGpuKernelBase
):
"""
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly.
...
...
@@ -967,10 +973,11 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
Perform subsampling of the output (default: (1, 1, 1)).
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
"""
"""
check_broadcast
=
False
__props__
=
(
'border_mode'
,
'subsample'
,
'filter_dilation'
)
_f16_ok
=
True
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
,
1
),
filter_dilation
=
(
1
,
1
,
1
)):
...
...
@@ -1028,9 +1035,15 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
def
get_params
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
def
c_headers
(
self
):
return
[
"<gpuarray/array.h>"
,
"<gpuarray/blas.h>"
,
"gpuarray_helper.h"
]
def
c_header_dirs
(
self
):
return
[
os
.
path
.
dirname
(
__file__
)]
def
c_code_cache_version
(
self
):
# raise this whenever modifying
any of the support_code_files
return
(
0
,
2
)
# raise this whenever modifying
the code below.
return
(
2
,
)
def
c_code_helper
(
self
,
bottom
,
weights
,
top
,
direction
,
sub
,
height
=
None
,
width
=
None
,
depth
=
None
):
...
...
theano/gpuarray/corr3d_gemm.c
浏览文件 @
cea45e8b
...
...
@@ -236,11 +236,9 @@ KERNEL void col2im3d_kernel(const ga_size n,
}
}
#section support_code_struct
int
im3d2col
(
const
size_t
max_threads_dim
,
int
im3d2col
(
gpudata
*
data_im
,
const
size_t
data_im_offset
,
const
size_t
channels
,
const
size_t
height
,
const
size_t
width
,
const
size_t
depth
,
const
size_t
kernel_h
,
const
size_t
kernel_w
,
const
size_t
kernel_d
,
...
...
@@ -257,13 +255,10 @@ int im3d2col(const size_t max_threads_dim,
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_kernel_w
)
/
stride_w
+
1
;
size_t
depth_col
=
(
depth
+
2
*
pad_d
-
dil_kernel_d
)
/
stride_d
+
1
;
size_t
num_kernels
=
channels
*
height_col
*
width_col
*
depth_col
;
size_t
threads_per_block
=
max_threads_dim
;
size_t
n_blocks
=
(
num_kernels
+
threads_per_block
-
1
)
/
threads_per_block
;
int
err
;
GpuKernel
*
kernel
;
if
(
dilation_h
!=
1
||
dilation_w
!=
1
||
dilation_d
!=
1
){
err
=
dilated_im3d2col_kernel_call
(
1
,
&
n_blocks
,
&
threads_per_block
,
0
,
if
(
dilation_h
!=
1
||
dilation_w
!=
1
||
dilation_d
!=
1
)
{
err
=
dilated_im3d2col_kernel_scall
(
1
,
&
num_kernels
,
0
,
num_kernels
,
data_im
,
data_im_offset
,
height
,
width
,
depth
,
kernel_h
,
kernel_w
,
kernel_d
,
dilation_h
,
dilation_w
,
dilation_d
,
pad_h
,
pad_w
,
pad_d
,
stride_h
,
stride_w
,
stride_d
,
height_col
,
...
...
@@ -273,10 +268,9 @@ int im3d2col(const size_t max_threads_dim,
"gpuarray error: dilated_im3d2col_kernel: %s."
,
GpuKernel_error
(
&
k_dilated_im3d2col_kernel
,
err
));
}
}
else
{
err
=
im3d2col_kernel_call
(
1
,
&
n_blocks
,
&
threads_per_block
,
0
,
}
else
{
err
=
im3d2col_kernel_scall
(
1
,
&
num_kernels
,
0
,
num_kernels
,
data_im
,
data_im_offset
,
height
,
width
,
depth
,
kernel_h
,
kernel_w
,
kernel_d
,
pad_h
,
pad_w
,
pad_d
,
stride_h
,
stride_w
,
stride_d
,
height_col
,
width_col
,
depth_col
,
...
...
@@ -290,7 +284,7 @@ int im3d2col(const size_t max_threads_dim,
return
err
;
}
int
col2im3d
(
const
size_t
max_threads_dim
,
gpudata
*
data_col
,
const
size_t
channels
,
int
col2im3d
(
gpudata
*
data_col
,
const
size_t
channels
,
const
size_t
height
,
const
size_t
width
,
const
size_t
depth
,
const
size_t
patch_h
,
const
size_t
patch_w
,
const
size_t
patch_d
,
const
size_t
dilation_h
,
const
size_t
dilation_w
,
const
size_t
dilation_d
,
...
...
@@ -304,14 +298,12 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_patch_w
)
/
stride_w
+
1
;
size_t
depth_col
=
(
depth
+
2
*
pad_d
-
dil_patch_d
)
/
stride_d
+
1
;
size_t
num_kernels
=
channels
*
height
*
width
*
depth
;
size_t
threads_per_block
=
max_threads_dim
;
size_t
n_blocks
=
(
num_kernels
+
threads_per_block
-
1
)
/
threads_per_block
;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
int
err
;
if
(
dilation_h
!=
1
||
dilation_w
!=
1
||
dilation_d
!=
1
)
{
err
=
dilated_col2im3d_kernel_call
(
1
,
&
n
_blocks
,
&
threads_per_block
,
0
,
if
(
dilation_h
!=
1
||
dilation_w
!=
1
||
dilation_d
!=
1
)
{
err
=
dilated_col2im3d_kernel_
s
call
(
1
,
&
n
um_kernels
,
0
,
num_kernels
,
data_col
,
height
,
width
,
depth
,
channels
,
patch_h
,
patch_w
,
patch_d
,
dilation_h
,
dilation_w
,
dilation_d
,
pad_h
,
pad_w
,
pad_d
,
stride_h
,
stride_w
,
stride_d
,
height_col
,
width_col
,
depth_col
,
...
...
@@ -323,8 +315,8 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
}
}
else
{
err
=
col2im3d_kernel_call
(
1
,
&
n
_blocks
,
&
threads_per_block
,
0
,
err
=
col2im3d_kernel_
s
call
(
1
,
&
n
um_kernels
,
0
,
num_kernels
,
data_col
,
height
,
width
,
depth
,
channels
,
patch_h
,
patch_w
,
patch_d
,
pad_h
,
pad_w
,
pad_d
,
stride_h
,
stride_w
,
stride_d
,
height_col
,
width_col
,
depth_col
,
data_im
,
data_im_offset
);
...
...
@@ -460,15 +452,6 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
return
NULL
;
}
// Get the max threads per blocks
size_t
max_threads_dim
;
err
=
gpucontext_property
(
bottom
->
context
->
ctx
,
GA_CTX_PROP_MAXLSIZE
,
&
max_threads_dim
);
if
(
err
!=
GA_NO_ERROR
){
PyErr_Format
(
PyExc_RuntimeError
,
"Could not fetch max_threads_dim."
);
return
NULL
;
}
// Create temporary columns
size_t
col_dim
[
2
];
col_dim
[
0
]
=
nChannels
*
kW
*
kH
*
kD
;
...
...
@@ -492,8 +475,6 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
const
size_t
K_
=
col_dim
[
0
];
const
size_t
N_
=
col_dim
[
1
];
const
size_t
M_
=
nFilters
;
const
DTYPE_INPUT_0
one
=
1
.
0
f
;
const
DTYPE_INPUT_0
zero
=
0
.
0
f
;
PyGpuArrayObject
*
output
;
if
(
direction
==
0
)
{
// forward pass
...
...
@@ -502,24 +483,46 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Iterate over batch
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
// First, im3d2col
err
=
im3d2col
(
max_threads_dim
,
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
bottomDepth
,
kH
,
kW
,
kD
,
dilH
,
dilW
,
dilD
,
padH
,
padW
,
padD
,
dH
,
dW
,
dD
,
col
->
ga
.
data
);
err
=
im3d2col
(
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
bottomDepth
,
kH
,
kW
,
kD
,
dilH
,
dilW
,
dilD
,
padH
,
padW
,
padD
,
dH
,
dW
,
dD
,
col
->
ga
.
data
);
if
(
err
!=
GA_NO_ERROR
)
{
Py_DECREF
(
col
);
return
NULL
;
}
// Second, gemm
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
N_
,
M_
,
K_
,
one
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
zero
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
switch
(
col
->
ga
.
typecode
)
{
case
GA_FLOAT
:
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
N_
,
M_
,
K_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
break
;
case
GA_DOUBLE
:
err
=
gpublas_dgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
N_
,
M_
,
K_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
break
;
case
GA_HALF
:
err
=
gpublas_hgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
N_
,
M_
,
K_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
break
;
default:
err
=
GA_UNSUPPORTED_ERROR
;
}
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorr3dMM encountered an error running sgemm.
\n
"
);
"GpuCorr3dMM forward encountered an error running gemm.
"
);
Py_DECREF
(
col
);
return
NULL
;
}
...
...
@@ -531,10 +534,10 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Iterate over batch
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
// First, im3d2col
err
=
im3d2col
(
max_threads_dim
,
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
bottomDepth
,
kH
,
kW
,
kD
,
dilH
,
dilW
,
dilD
,
padH
,
padW
,
padD
,
dH
,
dW
,
dD
,
col
->
ga
.
data
);
err
=
im3d2col
(
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
bottomDepth
,
kH
,
kW
,
kD
,
dilH
,
dilW
,
dilD
,
padH
,
padW
,
padD
,
dH
,
dW
,
dD
,
col
->
ga
.
data
);
if
(
err
!=
GA_NO_ERROR
)
{
Py_DECREF
(
col
);
return
NULL
;
...
...
@@ -543,15 +546,37 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
err
=
gpublas_sgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
K_
,
M_
,
N_
,
one
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
zero
:
one
,
weight
->
ga
.
data
,
0
,
K_
);
switch
(
col
->
ga
.
typecode
)
{
case
GA_FLOAT
:
err
=
gpublas_sgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
K_
,
M_
,
N_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
0
:
1
,
weight
->
ga
.
data
,
0
,
K_
);
break
;
case
GA_DOUBLE
:
err
=
gpublas_dgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
K_
,
M_
,
N_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
0
:
1
,
weight
->
ga
.
data
,
0
,
K_
);
break
;
case
GA_HALF
:
err
=
gpublas_hgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
K_
,
M_
,
N_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
0
:
1
,
weight
->
ga
.
data
,
0
,
K_
);
break
;
default:
err
=
GA_UNSUPPORTED_ERROR
;
}
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorr3dMM encountered an error running sgemm.
\n
"
);
"GpuCorr3dMM grad weights encountered an error running gemm.
"
);
Py_DECREF
(
col
);
return
NULL
;
}
...
...
@@ -562,29 +587,50 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// full convolution: gemm, then col2im3d
// Iterate over batch
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
// gemm into columns
// gemm into columns
switch
(
top
->
ga
.
typecode
)
{
case
GA_FLOAT
:
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
N_
,
K_
,
M_
,
one
,
N_
,
K_
,
M_
,
1
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
zero
,
0
,
col
->
ga
.
data
,
0
,
N_
);
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorr3dMM encountered an error running sgemm.
\n
"
);
Py_DECREF
(
col
);
return
NULL
;
}
// col2im3d back to the data
err
=
col2im3d
(
max_threads_dim
,
col
->
ga
.
data
,
nChannels
,
bottomHeight
,
bottomWidth
,
bottomDepth
,
kH
,
kW
,
kD
,
dilH
,
dilW
,
dilD
,
padH
,
padW
,
padD
,
dH
,
dW
,
dD
,
bottom
->
ga
.
data
,
n
*
bottom_stride
);
if
(
err
!=
GA_NO_ERROR
)
{
Py_DECREF
(
col
);
return
NULL
;
}
break
;
case
GA_DOUBLE
:
err
=
gpublas_dgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
N_
,
K_
,
M_
,
1
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
col
->
ga
.
data
,
0
,
N_
);
break
;
case
GA_HALF
:
err
=
gpublas_hgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
N_
,
K_
,
M_
,
1
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
col
->
ga
.
data
,
0
,
N_
);
break
;
default:
err
=
GA_UNSUPPORTED_ERROR
;
}
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorr3dMM grad inputs encountered an error running gemm."
);
Py_DECREF
(
col
);
return
NULL
;
}
// col2im3d back to the data
err
=
col2im3d
(
col
->
ga
.
data
,
nChannels
,
bottomHeight
,
bottomWidth
,
bottomDepth
,
kH
,
kW
,
kD
,
dilH
,
dilW
,
dilD
,
padH
,
padW
,
padD
,
dH
,
dW
,
dD
,
bottom
->
ga
.
data
,
n
*
bottom_stride
);
if
(
err
!=
GA_NO_ERROR
)
{
Py_DECREF
(
col
);
return
NULL
;
}
}
}
// Free temporary columns
...
...
theano/gpuarray/corr_gemm.c
浏览文件 @
cea45e8b
...
...
@@ -195,8 +195,7 @@ KERNEL void col2im_kernel(const ga_size n,
#section support_code_struct
int
im2col
(
const
size_t
max_threads_dim
,
gpudata
*
data_im
,
const
size_t
data_im_offset
,
const
size_t
channels
,
int
im2col
(
gpudata
*
data_im
,
const
size_t
data_im_offset
,
const
size_t
channels
,
const
size_t
height
,
const
size_t
width
,
const
size_t
kernel_h
,
const
size_t
kernel_w
,
const
size_t
dilation_h
,
const
size_t
dilation_w
,
const
size_t
pad_h
,
const
size_t
pad_w
,
...
...
@@ -209,13 +208,10 @@ int im2col(const size_t max_threads_dim,
size_t
height_col
=
(
height
+
2
*
pad_h
-
dil_kernel_h
)
/
stride_h
+
1
;
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_kernel_w
)
/
stride_w
+
1
;
size_t
num_kernels
=
channels
*
height_col
*
width_col
;
size_t
threads_per_block
=
max_threads_dim
;
size_t
n_blocks
=
(
num_kernels
+
threads_per_block
-
1
)
/
threads_per_block
;
int
err
;
GpuKernel
*
kernel
;
if
(
dilation_h
!=
1
||
dilation_w
!=
1
){
err
=
dilated_im2col_kernel_call
(
1
,
&
n_blocks
,
&
threads_per_block
,
0
,
if
(
dilation_h
!=
1
||
dilation_w
!=
1
)
{
err
=
dilated_im2col_kernel_scall
(
1
,
&
num_kernels
,
0
,
num_kernels
,
data_im
,
data_im_offset
,
height
,
width
,
kernel_h
,
kernel_w
,
dilation_h
,
dilation_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
height_col
,
width_col
,
data_col
);
...
...
@@ -224,10 +220,9 @@ int im2col(const size_t max_threads_dim,
"gpuarray error: dilated_im2col_kernel: %s."
,
GpuKernel_error
(
&
k_dilated_im2col_kernel
,
err
));
}
}
else
{
err
=
im2col_kernel_call
(
1
,
&
n_blocks
,
&
threads_per_block
,
0
,
}
else
{
err
=
im2col_kernel_scall
(
1
,
&
num_kernels
,
0
,
num_kernels
,
data_im
,
data_im_offset
,
height
,
width
,
kernel_h
,
kernel_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
height_col
,
width_col
,
data_col
);
...
...
@@ -240,7 +235,7 @@ int im2col(const size_t max_threads_dim,
return
err
;
}
int
col2im
(
const
size_t
max_threads_dim
,
gpudata
*
data_col
,
const
size_t
channels
,
int
col2im
(
gpudata
*
data_col
,
const
size_t
channels
,
const
size_t
height
,
const
size_t
width
,
const
size_t
patch_h
,
const
size_t
patch_w
,
const
size_t
dilation_h
,
const
size_t
dilation_w
,
const
size_t
pad_h
,
const
size_t
pad_w
,
const
size_t
stride_h
,
...
...
@@ -250,14 +245,12 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
size_t
height_col
=
(
height
+
2
*
pad_h
-
dil_patch_h
)
/
stride_h
+
1
;
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_patch_w
)
/
stride_w
+
1
;
size_t
num_kernels
=
channels
*
height
*
width
;
size_t
threads_per_block
=
max_threads_dim
;
size_t
n_blocks
=
(
num_kernels
+
threads_per_block
-
1
)
/
threads_per_block
;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
int
err
;
if
(
dilation_h
!=
1
||
dilation_w
!=
1
)
{
err
=
dilated_col2im_kernel_call
(
1
,
&
n
_blocks
,
&
threads_per_block
,
0
,
if
(
dilation_h
!=
1
||
dilation_w
!=
1
)
{
err
=
dilated_col2im_kernel_
s
call
(
1
,
&
n
um_kernels
,
0
,
num_kernels
,
data_col
,
height
,
width
,
channels
,
patch_h
,
patch_w
,
dilation_h
,
dilation_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
height_col
,
width_col
,
data_im
,
data_im_offset
);
...
...
@@ -266,10 +259,9 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
"gpuarray error: dilated_col2im_kernel: %s."
,
GpuKernel_error
(
&
k_dilated_col2im_kernel
,
err
));
}
}
else
{
err
=
col2im_kernel_call
(
1
,
&
n_blocks
,
&
threads_per_block
,
0
,
}
else
{
err
=
col2im_kernel_scall
(
1
,
&
num_kernels
,
0
,
num_kernels
,
data_col
,
height
,
width
,
channels
,
patch_h
,
patch_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
height_col
,
width_col
,
data_im
,
data_im_offset
);
...
...
@@ -393,15 +385,6 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return
NULL
;
}
// Get the max threads per blocks
size_t
max_threads_dim
;
err
=
gpucontext_property
(
bottom
->
context
->
ctx
,
GA_CTX_PROP_MAXLSIZE
,
&
max_threads_dim
);
if
(
err
!=
GA_NO_ERROR
){
PyErr_Format
(
PyExc_RuntimeError
,
"Could not fetch max_threads_dim."
);
return
NULL
;
}
// Create temporary columns
size_t
col_dim
[
2
];
col_dim
[
0
]
=
nChannels
*
kW
*
kH
;
...
...
@@ -411,8 +394,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
GA_C_ORDER
,
bottom
->
context
,
Py_None
);
if
(
NULL
==
col
)
{
if
(
NULL
==
col
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorrMM failed to allocate working memory of %ld x %ld
\n
"
,
col_dim
[
0
],
col_dim
[
1
]);
...
...
@@ -425,8 +407,6 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const
size_t
K_
=
col_dim
[
0
];
const
size_t
N_
=
col_dim
[
1
];
const
size_t
M_
=
nFilters
;
const
DTYPE_INPUT_0
one
=
1
.
0
f
;
const
DTYPE_INPUT_0
zero
=
0
.
0
f
;
PyGpuArrayObject
*
output
;
if
(
direction
==
0
)
{
// forward pass
...
...
@@ -435,8 +415,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
// First, im2col
err
=
im2col
(
max_threads_dim
,
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
err
=
im2col
(
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
dH
,
dW
,
col
->
ga
.
data
);
if
(
err
!=
GA_NO_ERROR
)
{
...
...
@@ -444,15 +424,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return
NULL
;
}
// Second, gemm
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
N_
,
M_
,
K_
,
one
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
zero
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
switch
(
col
->
ga
.
typecode
)
{
case
GA_FLOAT
:
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
N_
,
M_
,
K_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
break
;
case
GA_DOUBLE
:
err
=
gpublas_dgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
N_
,
M_
,
K_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
break
;
case
GA_HALF
:
err
=
gpublas_hgemm
(
cb_fortran
,
cb_no_trans
,
cb_no_trans
,
N_
,
M_
,
K_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
);
break
;
default:
err
=
GA_UNSUPPORTED_ERROR
;
}
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorrMM encountered an error running sgemm.
\n
"
);
"GpuCorrMM forward encountered an error running gemm: %d"
,
err
);
Py_DECREF
(
col
);
return
NULL
;
}
...
...
@@ -464,8 +466,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
// First, im2col
err
=
im2col
(
max_threads_dim
,
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
err
=
im2col
(
bottom
->
ga
.
data
,
n
*
bottom_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
dH
,
dW
,
col
->
ga
.
data
);
if
(
err
!=
GA_NO_ERROR
)
{
...
...
@@ -476,15 +478,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
err
=
gpublas_sgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
K_
,
M_
,
N_
,
one
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
zero
:
one
,
weight
->
ga
.
data
,
0
,
K_
);
switch
(
col
->
ga
.
typecode
)
{
case
GA_FLOAT
:
err
=
gpublas_sgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
K_
,
M_
,
N_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
0
:
1
,
weight
->
ga
.
data
,
0
,
K_
);
break
;
case
GA_DOUBLE
:
err
=
gpublas_dgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
K_
,
M_
,
N_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
0
:
1
,
weight
->
ga
.
data
,
0
,
K_
);
break
;
case
GA_HALF
:
err
=
gpublas_hgemm
(
cb_fortran
,
cb_trans
,
cb_no_trans
,
K_
,
M_
,
N_
,
1
,
col
->
ga
.
data
,
0
,
N_
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
(
n
==
0
)
?
0
:
1
,
weight
->
ga
.
data
,
0
,
K_
);
break
;
default:
err
=
GA_UNSUPPORTED_ERROR
;
}
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorrMM encountered an error running sgemm.
\n
"
);
"GpuCorrMM grad weights encountered an error running gemm: %d"
,
err
);
Py_DECREF
(
col
);
return
NULL
;
}
...
...
@@ -496,21 +520,42 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch
for
(
size_t
n
=
0
;
n
<
batchSize
;
n
++
)
{
// gemm into columns
switch
(
top
->
ga
.
typecode
)
{
case
GA_FLOAT
:
err
=
gpublas_sgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
N_
,
K_
,
M_
,
one
,
N_
,
K_
,
M_
,
1
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
col
->
ga
.
data
,
0
,
N_
);
break
;
case
GA_DOUBLE
:
err
=
gpublas_dgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
N_
,
K_
,
M_
,
1
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
0
,
col
->
ga
.
data
,
0
,
N_
);
break
;
case
GA_HALF
:
err
=
gpublas_hgemm
(
cb_fortran
,
cb_no_trans
,
cb_trans
,
N_
,
K_
,
M_
,
1
,
top
->
ga
.
data
,
n
*
top_stride
,
N_
,
weight
->
ga
.
data
,
0
,
K_
,
zero
,
0
,
col
->
ga
.
data
,
0
,
N_
);
break
;
default:
err
=
GA_UNSUPPORTED_ERROR
;
}
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"GpuCorrMM encountered an error running sgemm.
\n
"
);
"GpuCorrMM grad inputs encountered an error running gemm: %d"
,
err
);
Py_DECREF
(
col
);
return
NULL
;
}
// col2im back to the data
err
=
col2im
(
max_threads_dim
,
col
->
ga
.
data
,
nChannels
,
bottomHeight
,
bottomWidth
,
err
=
col2im
(
col
->
ga
.
data
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
dH
,
dW
,
bottom
->
ga
.
data
,
n
*
bottom_stride
);
if
(
err
!=
GA_NO_ERROR
)
{
...
...
theano/gpuarray/elemwise.py
浏览文件 @
cea45e8b
...
...
@@ -613,6 +613,15 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
def
c_headers
(
self
):
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
c_support_code
(
self
):
return
"""
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a
%
b) ? 1: 0);
}
"""
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
x
,
=
inp
z
,
=
out
...
...
theano/gpuarray/neighbours.py
浏览文件 @
cea45e8b
...
...
@@ -242,6 +242,15 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
flags
=
flags
,
objvar
=
k_var
))
return
kernels
def
c_support_code
(
self
):
return
"""
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a
%
b) ? 1: 0);
}
"""
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
dtype_ten4
=
node
.
inputs
[
0
]
.
dtype
dtype_neib_shape
=
node
.
inputs
[
1
]
.
dtype
...
...
theano/gpuarray/tests/config.py
浏览文件 @
cea45e8b
from
__future__
import
absolute_import
,
print_function
,
division
from
nose.plugins.skip
import
SkipTest
import
theano.tensor
import
theano.gpuarray
if
theano
.
gpuarray
.
pygpu
is
None
:
...
...
@@ -21,3 +22,10 @@ if theano.config.mode == 'FAST_COMPILE':
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpuarray'
)
.
excluding
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpuarray'
)
# If using float16, cast reference input to float32
def
ref_cast
(
x
):
if
x
.
type
.
dtype
==
'float16'
:
x
=
theano
.
tensor
.
cast
(
x
,
'float32'
)
return
x
theano/gpuarray/tests/test_dnn.py
浏览文件 @
cea45e8b
...
...
@@ -17,7 +17,7 @@ from .. import dnn
from
..basic_ops
import
GpuAllocEmpty
from
..type
import
gpuarray_shared_constructor
from
.config
import
mode_with_gpu
,
mode_without_gpu
,
test_ctx_name
from
.config
import
mode_with_gpu
,
mode_without_gpu
,
test_ctx_name
,
ref_cast
from
.
import
test_nnet
from
.rnn_support
import
Model
,
GRU
,
LSTM
,
WrapperLayer
...
...
@@ -33,13 +33,6 @@ def set_precision(floatX):
return
precision
# If using float16, cast reference input to float32
def
ref_cast
(
x
):
if
theano
.
config
.
floatX
==
'float16'
:
x
=
T
.
cast
(
x
,
'float32'
)
return
x
def
test_dnn_conv_desc_merge
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
...
...
theano/gpuarray/tests/test_gemmcorr.py
浏览文件 @
cea45e8b
...
...
@@ -3,13 +3,14 @@ import unittest
import
numpy
import
theano
from
theano
import
config
from
theano.tests
import
unittest_tools
as
utt
from
theano.tensor.nnet.corr
import
CorrMM
,
CorrMM_gradWeights
,
CorrMM_gradInputs
from
..type
import
gpuarray_shared_constructor
from
..blas
import
GpuCorrMM
,
GpuCorrMM_gradWeights
,
GpuCorrMM_gradInputs
from
.config
import
mode_with_gpu
,
mode_without_gpu
from
.config
import
mode_with_gpu
,
mode_without_gpu
,
ref_cast
class
TestCorrMM
(
unittest
.
TestCase
):
...
...
@@ -22,15 +23,16 @@ class TestCorrMM(unittest.TestCase):
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
'float32'
)
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
conv_ref
=
CorrMM
(
border_mode
=
border_mode
,
filter_dilation
=
filter_dilation
,
subsample
=
subsample
)(
inputs
,
filters
)
subsample
=
subsample
)(
ref_cast
(
inputs
),
ref_cast
(
filters
))
f_ref
=
theano
.
function
([],
conv_ref
,
mode
=
mode_without_gpu
)
conv
=
GpuCorrMM
(
border_mode
=
border_mode
,
...
...
@@ -120,20 +122,20 @@ class TestCorrMM(unittest.TestCase):
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
dCdH_shape
=
[
dCdH_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
dCdH_val
=
numpy
.
random
.
random
(
dCdH_shape
)
.
astype
(
'float32'
)
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
dCdH_val
=
numpy
.
random
.
random
(
dCdH_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
dCdH
=
gpuarray_shared_constructor
(
dCdH_val
)
shape
=
gpuarray_shared_constructor
(
numpy
.
array
(
filters_shape
[
2
:]))
if
(
subsample
==
(
1
,
1
)):
conv_ref
=
CorrMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
)
ref_cast
(
inputs
),
ref_cast
(
dCdH
)
)
conv_gemm
=
GpuCorrMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
)
else
:
conv_ref
=
CorrMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
,
shape
=
shape
)
ref_cast
(
inputs
),
ref_cast
(
dCdH
)
,
shape
=
shape
)
conv_gemm
=
GpuCorrMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
,
shape
=
shape
)
...
...
@@ -167,8 +169,8 @@ class TestCorrMM(unittest.TestCase):
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
3
,
1
,
2
)]
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
'float32'
)
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
...
...
@@ -178,12 +180,13 @@ class TestCorrMM(unittest.TestCase):
if
(
subsample
==
(
1
,
1
)):
conv_ref
=
CorrMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
)
kern
=
ref_cast
(
filters
),
topgrad
=
ref_cast
(
inputs
)
)
conv_gemm
=
GpuCorrMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
)
else
:
conv_ref
=
CorrMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
,
shape
=
bottom_shape
)
kern
=
ref_cast
(
filters
),
topgrad
=
ref_cast
(
inputs
),
shape
=
bottom_shape
)
conv_gemm
=
GpuCorrMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
,
shape
=
bottom_shape
)
...
...
theano/gpuarray/tests/test_gemmcorr3d.py
浏览文件 @
cea45e8b
...
...
@@ -3,13 +3,14 @@ import unittest
import
numpy
import
theano
from
theano
import
config
from
theano.tests
import
unittest_tools
as
utt
from
theano.tensor.nnet.corr3d
import
Corr3dMM
,
Corr3dMM_gradWeights
,
Corr3dMM_gradInputs
from
..type
import
gpuarray_shared_constructor
from
..blas
import
GpuCorr3dMM
,
GpuCorr3dMM_gradWeights
,
GpuCorr3dMM_gradInputs
from
.config
import
mode_with_gpu
,
mode_without_gpu
from
.config
import
mode_with_gpu
,
mode_without_gpu
,
ref_cast
class
TestCorr3dMM
(
unittest
.
TestCase
):
...
...
@@ -22,15 +23,15 @@ class TestCorr3dMM(unittest.TestCase):
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
'float32'
)
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
conv_ref
=
Corr3dMM
(
border_mode
=
border_mode
,
filter_dilation
=
filter_dilation
,
subsample
=
subsample
)(
inputs
,
filters
)
subsample
=
subsample
)(
ref_cast
(
inputs
),
ref_cast
(
filters
)
)
f_ref
=
theano
.
function
([],
conv_ref
,
mode
=
mode_without_gpu
)
conv
=
GpuCorr3dMM
(
border_mode
=
border_mode
,
...
...
@@ -120,20 +121,20 @@ class TestCorr3dMM(unittest.TestCase):
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
dCdH_shape
=
[
dCdH_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
dCdH_val
=
numpy
.
random
.
random
(
dCdH_shape
)
.
astype
(
'float32'
)
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
dCdH_val
=
numpy
.
random
.
random
(
dCdH_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
dCdH
=
gpuarray_shared_constructor
(
dCdH_val
)
shape
=
gpuarray_shared_constructor
(
numpy
.
array
(
filters_shape
[
2
:]))
if
(
subsample
==
(
1
,
1
,
1
)):
conv_ref
=
Corr3dMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
)
ref_cast
(
inputs
),
ref_cast
(
dCdH
)
)
conv_gemm
=
GpuCorr3dMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
)
else
:
conv_ref
=
Corr3dMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
,
shape
=
shape
)
ref_cast
(
inputs
),
ref_cast
(
dCdH
)
,
shape
=
shape
)
conv_gemm
=
GpuCorr3dMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
,
shape
=
shape
)
...
...
@@ -167,8 +168,8 @@ class TestCorr3dMM(unittest.TestCase):
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
'float32'
)
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
filters_val
=
numpy
.
random
.
random
(
filters_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
...
...
@@ -179,12 +180,12 @@ class TestCorr3dMM(unittest.TestCase):
if
(
subsample
==
(
1
,
1
,
1
)):
conv_ref
=
Corr3dMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
)
kern
=
ref_cast
(
filters
),
topgrad
=
ref_cast
(
inputs
)
)
conv_gemm
=
GpuCorr3dMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
)
else
:
conv_ref
=
Corr3dMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
,
shape
=
bottom_shape
)
kern
=
ref_cast
(
filters
),
topgrad
=
ref_cast
(
inputs
)
,
shape
=
bottom_shape
)
conv_gemm
=
GpuCorr3dMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
,
shape
=
bottom_shape
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论