Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
1d2411c6
提交
1d2411c6
authored
8月 18, 2016
作者:
Gijs van Tulder
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Enable border_mode != valid and filter_dilation in GpuCorr3dMM.
This reuses the implementation of GpuCorr2dMM and its gradient ops.
上级
1d9aff8a
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
499 行增加
和
138 行删除
+499
-138
blas.py
theano/sandbox/cuda/blas.py
+134
-63
corr3d_gemm.cu
theano/sandbox/cuda/corr3d_gemm.cu
+214
-69
test_gemmcorr3d.py
theano/sandbox/cuda/tests/test_gemmcorr3d.py
+151
-6
没有找到文件。
theano/sandbox/cuda/blas.py
浏览文件 @
1d2411c6
...
...
@@ -1396,29 +1396,64 @@ class BaseGpuCorr3dMM(GpuOp):
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly.
Parameters
----------
border_mode : {'valid', 'full', 'half'}
Additionally, the padding size could be directly specified by an integer
or a tuple of three integers
subsample
Perform subsampling of the output (default: (1, 1, 1)).
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
pad
*deprecated*, now you should always use border_mode.
"""
__props__
=
(
'border_mode'
,
'subsample'
,
'pad'
)
check_broadcast
=
False
__props__
=
(
'border_mode'
,
'subsample'
,
'filter_dilation'
)
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
,
1
),
filter_dilation
=
(
1
,
1
,
1
),
pad
=
(
0
,
0
,
0
)):
if
border_mode
!=
"valid"
:
raise
ValueError
(
"border_mode must be 'valid'"
)
if
pad
!=
(
0
,
0
,
0
):
_logger
.
warning
(
'do not use pad for BaseGpuCorr3dMM; please set padding in '
'border_mode parameter, see the docstring for more details'
)
if
border_mode
!=
"valid"
:
raise
ValueError
(
"border_mode must be 'valid' if pad is given"
)
border_mode
=
pad
if
isinstance
(
border_mode
,
integer_types
):
border_mode
=
(
border_mode
,
border_mode
,
border_mode
)
if
isinstance
(
border_mode
,
tuple
):
pad_h
,
pad_w
,
pad_d
=
map
(
int
,
border_mode
)
border_mode
=
(
pad_h
,
pad_w
,
pad_d
)
if
not
((
isinstance
(
border_mode
,
tuple
)
and
min
(
border_mode
)
>=
0
)
or
border_mode
in
(
'valid'
,
'full'
,
'half'
)):
raise
ValueError
(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of three'
' integers'
.
format
(
border_mode
))
self
.
border_mode
=
border_mode
if
len
(
subsample
)
!=
3
:
raise
ValueError
(
"subsample must have three elements"
)
self
.
subsample
=
subsample
if
(
pad
not
in
(
"half"
,
"full"
))
and
(
len
(
pad
)
!=
3
):
raise
ValueError
(
"pad must be 'half', 'full', or have three elements"
)
self
.
pad
=
pad
if
len
(
filter_dilation
)
!=
3
:
raise
ValueError
(
"filter_dilation must have three elements"
)
self
.
subsample
=
tuple
(
subsample
)
self
.
filter_dilation
=
tuple
(
filter_dilation
)
@property
def
pad
(
self
):
if
self
.
border_mode
!=
'valid'
:
return
self
.
border_mode
return
(
0
,
0
,
0
)
def
__str__
(
self
):
return
'
%
s{
%
s,
%
s,
pad=
%
r
}'
%
(
return
'
%
s{
%
s,
%
s,
%
s
}'
%
(
self
.
__class__
.
__name__
,
self
.
border_mode
,
str
(
self
.
subsample
),
s
elf
.
pad
)
s
tr
(
self
.
filter_dilation
)
)
def
flops
(
self
,
inp
,
outp
):
""" Useful with the hack in profiling to print the MFlops"""
...
...
@@ -1440,7 +1475,7 @@ class BaseGpuCorr3dMM(GpuOp):
def
c_code_cache_version
(
self
):
# raise this whenever modifying any of the support_code_files
return
(
0
,
2
3
)
return
(
0
,
2
5
)
def
c_support_code_apply
(
self
,
node
,
nodename
):
# REMEMBER TO RAISE c_code_cache_version when changing any of
...
...
@@ -1503,15 +1538,17 @@ class BaseGpuCorr3dMM(GpuOp):
Ignored otherwise.
"""
if
self
.
border_mode
!=
"valid"
:
raise
ValueError
(
"mode must be 'valid'"
)
dH
,
dW
,
dD
=
self
.
subsample
if
self
.
pad
==
"half"
:
dilH
,
dilW
,
dilD
=
self
.
filter_dilation
if
self
.
border_mode
==
"half"
:
padH
=
padW
=
padD
=
-
1
elif
self
.
pad
==
"full"
:
elif
self
.
border_mode
==
"full"
:
padH
=
padW
=
padD
=
-
2
elif
isinstance
(
self
.
border_mode
,
tuple
):
padH
,
padW
,
padD
=
self
.
border_mode
else
:
padH
,
padW
,
padD
=
self
.
pad
assert
self
.
border_mode
==
"valid"
padH
=
padW
=
padD
=
0
if
direction
==
"forward"
:
direction
=
0
out
=
top
...
...
@@ -1556,6 +1593,9 @@ class BaseGpuCorr3dMM(GpuOp):
int dH =
%(dH)
s;
int dW =
%(dW)
s;
int dD =
%(dD)
s;
int dilH =
%(dilH)
s;
int dilW =
%(dilW)
s;
int dilD =
%(dilD)
s;
int padH =
%(padH)
s;
int padW =
%(padW)
s;
int padD =
%(padD)
s;
...
...
@@ -1585,12 +1625,12 @@ class BaseGpuCorr3dMM(GpuOp):
else if (padH == -2)
{
// vertical full padding, we can infer the kernel height
kH =
2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH
;
kH =
(2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1
;
}
else
{
// explicit padding, we can infer the kernel height
kH =
CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH
;
kH =
(CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1
;
}
if ((dW != 1) || (padW == -1))
{
...
...
@@ -1598,11 +1638,11 @@ class BaseGpuCorr3dMM(GpuOp):
}
else if (padW == -2)
{
kW = 2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW
;
kW = (2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1
;
}
else
{
kW =
CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW
;
kW =
(CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1
;
}
if ((dD != 1) || (padD == -1))
{
...
...
@@ -1610,22 +1650,27 @@ class BaseGpuCorr3dMM(GpuOp):
}
else if (padD == -2)
{
kD = 2 - CudaNdarray_HOST_DIMS(bottom)[4] + (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD
;
kD = (2 - CudaNdarray_HOST_DIMS(bottom)[4] + (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD - 1) / dilD + 1
;
}
else
{
kD =
CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD
;
kD =
(CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD - 1) / dilD+ 1
;
}
}
// Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1;
int dil_kD = (kD - 1) * dilD + 1;
// Auto-padding if requested
if (padH == -1)
{ // vertical half padding
padH = kH / 2;
padH =
dil_
kH / 2;
}
else if (padH == -2)
{ // vertical full padding
padH = kH - 1;
padH =
dil_
kH - 1;
}
else if (padH < 0)
{
...
...
@@ -1633,10 +1678,10 @@ class BaseGpuCorr3dMM(GpuOp):
%(fail)
s
}
if (padW == -1) { // horizontal half padding
padW = kW / 2;
padW =
dil_
kW / 2;
}
else if (padW == -2) { // horizontal full padding
padW = kW - 1;
padW =
dil_
kW - 1;
}
else if (padW < 0)
{
...
...
@@ -1645,11 +1690,11 @@ class BaseGpuCorr3dMM(GpuOp):
}
if (padD == -1)
{ // horizontal half padding
padD = kD / 2;
padD =
dil_
kD / 2;
}
else if (padD == -2)
{ // horizontal full padding
padD = kD - 1;
padD =
dil_
kD - 1;
}
else if (padD < 0)
{
...
...
@@ -1662,16 +1707,16 @@ class BaseGpuCorr3dMM(GpuOp):
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width, depth)
// height
and width: top = (bottom + 2*pad - weight
) / sample + 1
// height
, width and depth: top = (bottom + 2*pad - ((weight-1)*dil + 1)
) / sample + 1
out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH -
CudaNdarray_HOST_DIMS(weights)[2]
) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW -
CudaNdarray_HOST_DIMS(weights)[3]
) / dW + 1;
out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD -
CudaNdarray_HOST_DIMS(weights)[4]
) / dD + 1;
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH -
((CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1)
) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW -
((CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1)
) / dW + 1;
out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD -
((CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1)
) / dD + 1;
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width, depth)
// height, width and depth: weights =
bottom + 2*pad - (top-1) * sample
// height, width and depth: weights =
(bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = CudaNdarray_HOST_DIMS(top)[1];
out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1];
out_dim[2] = kH; // already inferred further above
...
...
@@ -1680,12 +1725,12 @@ class BaseGpuCorr3dMM(GpuOp):
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width, depth)
// height, width and depth: bottom = (top
-1) * sample + weights
- 2*pad
// height, width and depth: bottom = (top
- 1) * sample + (weights-1)*dil + 1
- 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (dH != 1) ?
%(height)
s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH +
CudaNdarray_HOST_DIMS(weights)[2]
- 2*padH;
out_dim[3] = (dW != 1) ?
%(width)
s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW +
CudaNdarray_HOST_DIMS(weights)[3]
- 2*padW;
out_dim[4] = (dD != 1) ?
%(depth)
s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD +
CudaNdarray_HOST_DIMS(weights)[4]
- 2*padD;
out_dim[2] = (dH != 1) ?
%(height)
s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH +
(CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1
- 2*padH;
out_dim[3] = (dW != 1) ?
%(width)
s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW +
(CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1
- 2*padW;
out_dim[4] = (dD != 1) ?
%(depth)
s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD +
(CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1
- 2*padD;
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2
\\
n");
...
...
@@ -1716,7 +1761,8 @@ class BaseGpuCorr3dMM(GpuOp):
}
// Call CUDA code
out2 = corr3dMM(
%(bottom)
s,
%(weights)
s,
%(top)
s, direction, dH, dW, dD, padH, padW, padD);
out2 = corr3dMM(
%(bottom)
s,
%(weights)
s,
%(top)
s, direction, dH, dW, dD,
dilH, dilW, dilD, padH, padW, padD);
if (out2==NULL){
%(fail)
s
}
...
...
@@ -1734,19 +1780,28 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
Currently supports "valid" only; "full" can be simulated by setting
`pad="full"` (at the cost of performance), or by using
`GpuCorrMM_gradInputs`.
The width of a border of implicit zeros to pad the
input with. Must be a tuple with 3 elements giving the width of
the padding on each side, or a single integer to pad the same
on all sides, or a string shortcut setting the padding at runtime:
``'valid'`` for ``(0, 0, 0)`` (valid convolution, no padding), ``'full'``
for ``(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)``
(full convolution), ``'half'`` for ``(kernel_rows // 2,
kernel_columns // 2, kernel_depth // 2)`` (same convolution for
odd-sized kernels). Note that the three widths are each
applied twice, once per side (left and right, top and bottom, front
and back).
subsample
The subsample operation applied to each output image. Should be a tuple
with 3 elements. `(sv, sh, sl)` is equivalent to
`GpuCorrMM(...)(...)[:,:,::sv, ::sh, ::sl]`, but faster.
Set to `(1, 1, 1)` to disable subsampling.
filter_dilation
The filter dilation operation applied to each input image.
Should be a tuple with 3 elements.
Set to `(1, 1, 1)` to disable filter dilation.
pad
The width of a border of implicit zeros to pad the input image with.
Should be a tuple with 3 elements giving the numbers of rows and columns
to pad on each side, or "half" to set the padding
to `(kernel_rows // 2, kernel_columns // 2, kernel_depth // 2)`,
or "full" to set the padding
to `(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)` at runtime.
Set to `(0, 0, 0)` to disable padding.
Deprecated alias for `border_mode`.
Notes
-----
...
...
@@ -1765,8 +1820,10 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
batchsize or number of filters) may also work around the CUBLAS bug.
"""
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
,
1
),
pad
=
(
0
,
0
,
0
)):
super
(
GpuCorr3dMM
,
self
)
.
__init__
(
border_mode
,
subsample
,
pad
)
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
,
1
),
filter_dilation
=
(
1
,
1
,
1
),
pad
=
(
0
,
0
,
0
)):
super
(
GpuCorr3dMM
,
self
)
.
__init__
(
border_mode
,
subsample
,
filter_dilation
,
pad
)
def
make_node
(
self
,
img
,
kern
):
img
=
as_cuda_ndarray_variable
(
img
)
...
...
@@ -1792,14 +1849,12 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
top
=
gpu_contiguous
(
top
)
d_bottom
=
GpuCorr3dMM_gradInputs
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
weights
,
top
,
bottom
.
shape
[
-
3
:])
self
.
filter_dilation
)(
weights
,
top
,
bottom
.
shape
[
-
3
:])
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
top
,
weights
.
shape
[
-
3
:])
self
.
filter_dilation
)(
bottom
,
top
,
weights
.
shape
[
-
3
:])
return
d_bottom
,
d_weights
...
...
@@ -1815,8 +1870,10 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
,
1
),
filter_dilation
=
(
1
,
1
,
1
),
pad
=
(
0
,
0
,
0
)):
super
(
GpuCorr3dMM_gradWeights
,
self
)
.
__init__
(
border_mode
,
subsample
,
pad
)
super
(
GpuCorr3dMM_gradWeights
,
self
)
.
__init__
(
border_mode
,
subsample
,
filter_dilation
,
pad
)
def
make_node
(
self
,
img
,
topgrad
,
shape
=
None
):
img
=
as_cuda_ndarray_variable
(
img
)
...
...
@@ -1828,10 +1885,14 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
raise
TypeError
(
'img must be 5D tensor'
)
if
topgrad
.
type
.
ndim
!=
5
:
raise
TypeError
(
'topgrad must be 5D tensor'
)
if
self
.
subsample
!=
(
1
,
1
,
1
)
or
self
.
pad
==
"half"
:
if
self
.
subsample
!=
(
1
,
1
,
1
)
or
self
.
border_mode
==
"half"
:
if
shape
is
None
:
raise
ValueError
(
'shape must be given if subsample != (1, 1, 1), or pad == "half"'
)
raise
ValueError
(
'shape must be given if subsample != (1, 1, 1)'
' or border_mode == "half"'
)
height_width_depth
=
[
shape
[
0
],
shape
[
1
],
shape
[
2
]]
assert
shape
[
0
]
.
ndim
==
0
assert
shape
[
1
]
.
ndim
==
0
assert
shape
[
2
]
.
ndim
==
0
else
:
height_width_depth
=
[]
...
...
@@ -1850,9 +1911,13 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
bottom
,
top
=
inp
[:
2
]
weights
,
=
grads
weights
=
gpu_contiguous
(
weights
)
d_bottom
=
GpuCorr3dMM_gradInputs
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
weights
,
top
,
bottom
.
shape
[
-
3
:])
d_top
=
GpuCorr3dMM
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
weights
)
d_bottom
=
GpuCorr3dMM_gradInputs
(
self
.
border_mode
,
self
.
subsample
,
self
.
filter_dilation
)(
weights
,
top
,
bottom
.
shape
[
-
3
:])
d_top
=
GpuCorr3dMM
(
self
.
border_mode
,
self
.
subsample
,
self
.
filter_dilation
)(
bottom
,
weights
)
d_height_width_depth
=
(
theano
.
gradient
.
DisconnectedType
()(),)
*
3
if
len
(
inp
)
==
5
else
()
return
(
d_bottom
,
d_top
)
+
d_height_width_depth
...
...
@@ -1875,8 +1940,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
,
1
),
filter_dilation
=
(
1
,
1
,
1
),
pad
=
(
0
,
0
,
0
)):
super
(
GpuCorr3dMM_gradInputs
,
self
)
.
__init__
(
border_mode
,
subsample
,
pad
)
super
(
GpuCorr3dMM_gradInputs
,
self
)
.
__init__
(
border_mode
,
subsample
,
filter_dilation
,
pad
)
def
make_node
(
self
,
kern
,
topgrad
,
shape
=
None
):
kern
=
as_cuda_ndarray_variable
(
kern
)
...
...
@@ -1888,6 +1955,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
if
self
.
subsample
!=
(
1
,
1
,
1
)
and
shape
is
None
:
raise
ValueError
(
'shape must be given if subsample != (1, 1, 1)'
)
height_width_depth
=
[
shape
[
0
],
shape
[
1
],
shape
[
2
]]
if
self
.
subsample
!=
(
1
,
1
,
1
)
else
[]
if
height_width_depth
:
assert
shape
[
0
]
.
ndim
==
0
assert
shape
[
1
]
.
ndim
==
0
assert
shape
[
2
]
.
ndim
==
0
broadcastable
=
[
topgrad
.
type
.
broadcastable
[
0
],
kern
.
type
.
broadcastable
[
1
],
False
,
False
,
False
]
...
...
@@ -1906,12 +1977,12 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
bottom
=
gpu_contiguous
(
bottom
)
d_weights
=
GpuCorr3dMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
top
,
weights
.
shape
[
-
3
:])
self
.
filter_dilation
)(
bottom
,
top
,
weights
.
shape
[
-
3
:])
d_top
=
GpuCorr3dMM
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
weights
)
self
.
filter_dilation
)(
bottom
,
weights
)
d_height_width_depth
=
(
theano
.
gradient
.
DisconnectedType
()(),)
\
*
3
if
len
(
inp
)
==
5
else
()
return
(
d_weights
,
d_top
)
+
d_height_width_depth
...
...
theano/sandbox/cuda/corr3d_gemm.cu
浏览文件 @
1d2411c6
...
...
@@ -52,6 +52,54 @@ inline int GET_BLOCKS(const int N) {
// (Adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// CUDA kernel for the case of dilation
__global__ void dilated_im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_col) {
CUDA_KERNEL_LOOP(index, n) {
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
for (int i = 0; i < kernel_h; ++i)
{
int h_im = h_offset + i * dilation_h;
for (int j = 0; j < kernel_w; ++j)
{
int w_im = w_offset + j * dilation_w;
for (int k = 0; k < kernel_d; ++k)
{
int d_im = d_offset + k * dilation_d;
*data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * dilation_h * (width * depth) +
j * dilation_w * depth +
k * dilation_d] : 0;
data_col_ptr += height_col * width_col * depth_col;
}
}
}
}
}
__global__ void im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
...
...
@@ -62,41 +110,35 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
{
CUDA_KERNEL_LOOP(index, n)
{
int d_out = index % depth_col;
int w_index = index / depth_col;
int w_out = w_index % width_col;
int h_index = w_index / width_col;
int h_out = h_index % height_col;
int channel_in = h_index / height_col;
//channel_in = 1;
int channel_out = channel_in * kernel_h * kernel_w * kernel_d;
int h_in = h_out * stride_h - pad_h;
int w_in = w_out * stride_w - pad_w;
int d_in = d_out * stride_d - pad_d;
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += channel_out * (height_col * width_col * depth_col) +
h_out * (width_col * depth_col) + w_out * depth_col + d_out;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += c
hannel_in
* (height * width * depth) +
h_
in * (width * depth) + w_in * depth + d_in
;
data_im_ptr += c
_im
* (height * width * depth) +
h_
offset * (width * depth) + w_offset * depth + d_offset
;
for (int i = 0; i < kernel_h; ++i)
{
int h
= h_in
+ i;
int h
_im = h_offset
+ i;
for (int j = 0; j < kernel_w; ++j)
{
int w
= w_in
+ j;
int w
_im = w_offset
+ j;
for (int k = 0; k < kernel_d; ++k)
{
int d
= d_in
+ k;
*data_col_ptr = (h
>= 0 && w >= 0 && d
>= 0 &&
h
< height && w < width && d
< depth) ?
data_im_ptr[i * (width * depth) + j *depth + k] : 0;
int d
_im = d_offset
+ k;
*data_col_ptr = (h
_im >= 0 && w_im >= 0 && d_im
>= 0 &&
h
_im < height && w_im < width && d_im
< depth) ?
data_im_ptr[i * (width * depth) + j *
depth + k] : 0;
data_col_ptr += height_col * width_col * depth_col;
}
}
...
...
@@ -107,31 +149,105 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
void im3d2col(const float* data_im, const int channels,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
float* data_col)
{
// We are going to launch channels * height_col * width_col * depth_col kernels, each
// kernel responsible for copying a single-channel grid.
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1;
int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
int dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
int num_kernels = channels * height_col * width_col * depth_col;
im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
dilated_im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
}
else{
im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
}
}
// CUDA kernel for the case of dilation
__global__ void dilated_col2im3d_kernel(
const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_im)
{
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
const int d_im = index % depth + pad_d;
const int w_index = index / depth;
const int w_im = w_index % width + pad_w;
const int h_index = w_index / width;
const int h_im = h_index % height + pad_h;
const int c_im = h_index / height;
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
int kernel_extent_d = (kernel_d - 1) * dilation_d + 1;
// compute the start and end of the output
const int d_col_start = (d_im < kernel_extent_d) ? 0 : (d_im - kernel_extent_d) / stride_d + 1;
const int d_col_end = min(d_im / stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
int d_k = (d_im - d_col * stride_d);
if (h_k % dilation_h == 0 && w_k % dilation_w == 0 && d_k % dilation_d == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
d_k /= dilation_d;
int data_col_index = c_im * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col +
h_k * kernel_w * kernel_d * height_col * width_col * depth_col +
w_k * kernel_d * height_col * width_col * depth_col +
d_k * height_col * width_col * depth_col +
h_col * width_col * depth_col +
w_col * depth_col +
d_col;
val += data_col[data_col_index];
}
}
}
}
data_im[index] = val;
}
}
__global__ void col2im3d_kernel(const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int
patch_h, const int patch_w, const int patch
_d,
const int
kernel_h, const int kernel_w, const int kernel
_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
...
...
@@ -140,59 +256,78 @@ __global__ void col2im3d_kernel(const int n, const float* data_col,
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
int d
= index % depth + pad_d;
int w_index = index / depth;
int w
= w_index % width + pad_w;
int h_index = w_index / width;
int h
= h_index % height + pad_h;
int c
= h_index / height;
const int d_im
= index % depth + pad_d;
const
int w_index = index / depth;
const int w_im
= w_index % width + pad_w;
const
int h_index = w_index / width;
const int h_im
= h_index % height + pad_h;
const int c_im
= h_index / height;
// compute the start and end of the output
int d_col_start = (d < patch_d) ? 0 : (d - patch
_d) / stride_d + 1;
int d_col_end = min(d
/ stride_d + 1, depth_col);
int w_col_start = (w < patch_w) ? 0 : (w - patch
_w) / stride_w + 1;
int w_col_end = min(w
/ stride_w + 1, width_col);
int h_col_start = (h < patch_h) ? 0 : (h - patch
_h) / stride_h + 1;
int h_col_end = min(h
/ stride_h + 1, height_col);
const int d_col_start = (d_im < kernel_d) ? 0 : (d_im - kernel
_d) / stride_d + 1;
const int d_col_end = min(d_im
/ stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_w) ? 0 : (w_im - kernel
_w) / stride_w + 1;
const int w_col_end = min(w_im
/ stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_h) ? 0 : (h_im - kernel
_h) / stride_h + 1;
const int h_col_end = min(h_im
/ stride_h + 1, height_col);
int offset =
(c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col;
(c_im * kernel_h * kernel_w * kernel_d + h_im * kernel_w * kernel_d +
w_im * kernel_d + d_im) * height_col * width_col * depth_col;
int coeff_h_col = (1 - stride_h *
patch_w * patch
_d * height_col) * width_col * depth_col;
int coeff_w_col = (1 - stride_w *
patch
_d * height_col * width_col) * depth_col;
int coeff_h_col = (1 - stride_h *
kernel_w * kernel
_d * height_col) * width_col * depth_col;
int coeff_w_col = (1 - stride_w *
kernel
_d * height_col * width_col) * depth_col;
int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
for (int d_col = d_col_start; d_col < d_col_end; ++d_col)
for (int d_col = d_col_start; d_col < d_col_end; ++d_col)
{
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
}
}
}
data_im[index] = val;
data_im[index] = val;
}
}
void col2im3d(const float* data_col, const int channels,
const int height, const int width, const int depth,
const int patch_h, const int patch_w, const int patch_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
float* data_im)
{
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1;
int dil_patch_h = (patch_h - 1) * dilation_h + 1;
int dil_patch_w = (patch_w - 1) * dilation_w + 1;
int dil_patch_d = (patch_d - 1) * dilation_d + 1;
int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
int num_kernels = channels * height * width * depth;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
dilated_col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
}
else{
col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
}
}
...
...
@@ -210,6 +345,9 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
const int dH = 1,
const int dW = 1,
const int dD = 1,
const int dilH = 1,
const int dilW = 1,
const int dilD = 1,
const int padH = 0,
const int padW = 0,
const int padD = 0)
...
...
@@ -286,10 +424,14 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
"GpuCorr3dMM images and kernel must have the same stack size\n");
return 0;
}
// implicit dilated filter
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
const int dil_kD = (kD - 1) * dilD + 1;
// top: (batchSize, nFilters, topHeight, topWidth, topDepth)
const int topHeight = int((bottomHeight + 2*padH - kH) / dH) + 1;
const int topWidth = int((bottomWidth + 2*padW - kW) / dW) + 1;
const int topDepth = int((bottomDepth + 2*padD - kD) / dD) + 1;
const int topHeight = int((bottomHeight + 2*padH -
dil_
kH) / dH) + 1;
const int topWidth = int((bottomWidth + 2*padW -
dil_
kW) / dW) + 1;
const int topDepth = int((bottomDepth + 2*padD -
dil_
kD) / dD) + 1;
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
...
...
@@ -345,6 +487,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD,
dH, dW, dD,
col->devdata);
...
...
@@ -392,6 +535,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
im3d2col(bottom->devdata + n * bottom_stride, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD,
dH, dW, dD,
col->devdata);
...
...
@@ -461,6 +605,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
col2im3d(col->devdata, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD,
dH, dW, dD, bottom->devdata + n * bottom_stride);
cudaError_t err = cudaGetLastError();
...
...
theano/sandbox/cuda/tests/test_gemmcorr3d.py
浏览文件 @
1d2411c6
from
__future__
import
absolute_import
,
print_function
,
division
import
unittest
import
numpy
from
six.moves
import
xrange
try
:
from
scipy
import
ndimage
except
ImportError
:
ndimage
=
None
import
theano
from
theano.tests
import
unittest_tools
as
utt
...
...
@@ -21,31 +26,127 @@ else:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
# python reference implementation of a 3D convolution
# see also: theano.tensor.nnet.tests.test_conv3d2d
# expects: (batch, 0, channels, 1, 2)
def pyconv3d(signals, filters, border_mode='valid', dilation=(1, 1, 1)):
    """Pure numpy/scipy reference implementation of a 3D convolution.

    See also theano.tensor.nnet.tests.test_conv3d2d, which this mirrors.

    Parameters
    ----------
    signals : ndarray, shape (Ns, Ts, C, Hs, Ws)
        Batch of inputs laid out as (batch, time, channels, height, width).
    filters : ndarray, shape (Nf, Tf, C, Hf, Wf)
        Filter bank; channel count must match ``signals``.
    border_mode : {'valid', 'half', 'full'} or tuple/list of 3 ints
        Zero-padding applied to the three spatial axes of ``signals``.
        A tuple/list gives explicit (time, height, width) padding.
    dilation : tuple of 3 ints
        Filter dilation along (time, height, width).

    Returns
    -------
    ndarray, shape (Ns, To, Nf, Ho, Wo)
        The (possibly padded, dilated) valid convolution, summed over
        channels.  Allocated with numpy's default float64 dtype.

    Raises
    ------
    ValueError
        If ``border_mode`` is an unrecognized string.
    """
    Ns, Ts, C, Hs, Ws = signals.shape
    Nf, Tf, C, Hf, Wf = filters.shape
    Tdil, Hdil, Wdil = dilation

    # extent of the implicitly dilated filters
    Tfdil = (Tf - 1) * Tdil + 1
    Hfdil = (Hf - 1) * Hdil + 1
    Wfdil = (Wf - 1) * Wdil + 1

    # translate border_mode into an explicit amount of zero-padding
    if border_mode == 'full':
        Tpad, Hpad, Wpad = Tfdil - 1, Hfdil - 1, Wfdil - 1
    elif border_mode == 'half':
        Tpad, Hpad, Wpad = Tfdil // 2, Hfdil // 2, Wfdil // 2
    elif isinstance(border_mode, (tuple, list)):
        Tpad, Hpad, Wpad = map(int, border_mode)
    elif border_mode == 'valid':
        Tpad = Hpad = Wpad = 0
    else:
        # previously any unknown string silently meant "no padding";
        # fail loudly instead so typos are caught
        raise ValueError('invalid border_mode: %r' % (border_mode,))

    if Tpad > 0 or Hpad > 0 or Wpad > 0:
        # zero-pad the signals; follow the input dtype instead of
        # hard-coding float32 so float64 inputs also work unchanged
        signals_padded = numpy.zeros((Ns, Ts + 2 * Tpad, C,
                                      Hs + 2 * Hpad, Ws + 2 * Wpad),
                                     signals.dtype)
        signals_padded[:, Tpad:(Ts + Tpad), :,
                       Hpad:(Hs + Hpad),
                       Wpad:(Ws + Wpad)] = signals
        Ns, Ts, C, Hs, Ws = signals_padded.shape
        signals = signals_padded

    Tfdil2 = Tfdil // 2
    Hfdil2 = Hfdil // 2
    Wfdil2 = Wfdil // 2

    # materialize the dilated filter bank by spreading the taps out
    dilated_filters = numpy.zeros((Nf, Tfdil, C, Hfdil, Wfdil),
                                  dtype=filters.dtype)
    dilated_filters[:, ::Tdil, :, ::Hdil, ::Wdil] = filters

    # perform valid convolution on the (padded) signals
    rval = numpy.zeros((Ns, Ts - Tfdil + 1, Nf,
                        Hs - Hfdil + 1, Ws - Wfdil + 1))
    for ns in range(Ns):
        for nf in range(Nf):
            for c in range(C):
                s_i = signals[ns, :, c, :, :]
                f_i = dilated_filters[nf, :, c, :, :]
                r_i = rval[ns, :, nf, :, :]
                # scipy.signal.convolve performs valid convolution,
                # but is quite slow.  scipy.ndimage.convolve is faster
                # but only supports 'same' convolution; origin must be
                # -1 for even filters and 0 for odd filters so that the
                # crop below yields exactly the 'valid' region.
                # cval=1 (not 0) presumably makes a mis-aligned crop
                # show up as a wrong value rather than pass silently.
                o_i = ndimage.convolve(s_i, f_i, mode='constant', cval=1,
                                       origin=(f_i.shape[0] % 2 - 1,
                                               f_i.shape[1] % 2 - 1,
                                               f_i.shape[2] % 2 - 1))
                # crop to get the result of 'valid' convolution
                o_i = o_i[Tfdil2:(r_i.shape[0] + Tfdil2),
                          Hfdil2:(r_i.shape[1] + Hfdil2),
                          Wfdil2:(r_i.shape[2] + Wfdil2)]
                # the result should be equal to 'valid' convolution
                # utt.assert_allclose(o_i, signal.convolve(s_i, f_i, mode='valid'))
                r_i += o_i
    return rval
class
TestCorr3DMM
(
unittest
.
TestCase
):
def run_conv_valid(self, inputs_shape, filters_shape,
                   border_mode='valid',
                   filter_dilation=(1, 1, 1),
                   subsample=(1, 1, 1),
                   verify_grad=False):
    """Compare GpuCorr3dMM against a CPU reference on random data.

    Parameters
    ----------
    inputs_shape : tuple/list (batch, time, height, width, channels)
    filters_shape : tuple/list (nfilters, time, height, width, channels)
    border_mode : str or tuple of 3 ints, forwarded to GpuCorr3dMM
    filter_dilation : tuple of 3 ints, forwarded to GpuCorr3dMM
    subsample : tuple of 3 ints, forwarded to GpuCorr3dMM
    verify_grad : bool
        Also run utt.verify_grad on the op if True.

    NOTE(review): this span arrived as an interleaved before/after diff;
    the body below is the resolved post-change version.
    """
    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    filters_val = numpy.random.random(filters_shape).astype('float32')
    inputs = shared(inputs_val)
    filters = shared(filters_val)
    bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))

    # Pick a reference implementation that supports the requested
    # parameter combination; skip when none exists.
    if filter_dilation == (1, 1, 1) and border_mode in ('valid', (0, 0, 0)):
        conv_ref = theano.tensor.nnet.conv3D(V=inputs, W=filters,
                                             b=bias, d=subsample)
        f_ref = theano.function([], conv_ref)
        res_ref = f_ref()
    elif subsample == (1, 1, 1):
        if ndimage is None:
            raise SkipTest('This test needs SciPy.')
        # input = b012c
        # pyconv3d wants = b0c12 = (0, 1, 4, 2, 3)
        # pyconv3d outputs = b0c12 = (0, 1, 3, 4, 2)
        # filters are flipped because pyconv3d convolves while
        # GpuCorr3dMM correlates
        res_ref = pyconv3d(
            signals=inputs_val.transpose(0, 1, 4, 2, 3),
            filters=filters_val.transpose(0, 1, 4, 2, 3)[:, ::-1, :,
                                                         ::-1, ::-1],
            dilation=filter_dilation,
            border_mode=border_mode).transpose(0, 1, 3, 4, 2)
    else:
        raise SkipTest('No reference implementation that combines '
                       'border_mode and subsampling.')

    # GpuCorr3dMM wants bc012; dimshuffle there and back again
    conv = GpuCorr3dMM(border_mode=border_mode,
                       filter_dilation=filter_dilation,
                       subsample=subsample)(inputs.dimshuffle(0, 4, 1, 2, 3),
                                            filters.dimshuffle(0, 4, 1, 2, 3))
    conv = conv.dimshuffle(0, 2, 3, 4, 1)
    f = theano.function([], conv, mode=mode_with_gpu)
    res = f()
    utt.assert_allclose(res_ref, res)

    if verify_grad:
        utt.verify_grad(GpuCorr3dMM(border_mode=border_mode,
                                    filter_dilation=filter_dilation,
                                    subsample=subsample),
                        [inputs_val.transpose(0, 4, 1, 2, 3),
                         filters_val.transpose(0, 4, 1, 2, 3)])
def test_valid(self):
    """Plain valid-mode forward pass with default dilation/subsampling."""
    in_shape = (16, 20, 12, 16, 1)
    w_shape = (10, 6, 12, 4, 1)
    self.run_conv_valid(inputs_shape=in_shape, filters_shape=w_shape)
...
...
@@ -68,6 +169,50 @@ class TestCorr3DMM(unittest.TestCase):
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
subsample
=
(
1
,
2
,
3
))
def test_border_mode(self):
    """Exercise every supported border_mode with one fixed problem size."""
    in_shape = (16, 20, 12, 15, 1)
    w_shape = (10, 6, 12, 4, 1)
    # symbolic modes first, then explicit per-axis padding tuples
    for mode in ('valid', 'half', 'full', (0, 0, 0), (1, 2, 3), (3, 2, 1)):
        self.run_conv_valid(inputs_shape=in_shape,
                            filters_shape=w_shape,
                            border_mode=mode)
def test_filter_dilation(self):
    """Single-axis dilations crossed with each symbolic border mode."""
    inputs_shape = [16, 20, 12, 15, 1]
    filters_shape = [10, 6, 5, 4, 1]
    dilations = ((2, 1, 1), (1, 2, 1), (1, 1, 2))
    for dil in dilations:
        for mode in ('valid', 'half', 'full'):
            self.run_conv_valid(inputs_shape=inputs_shape,
                                filters_shape=filters_shape,
                                filter_dilation=dil,
                                border_mode=mode)
def test_verify_gradients(self):
    """Gradient checks on a deliberately small example (verify_grad is slow)."""
    inputs_shape = [2, 7, 9, 6, 1]
    filters_shape = [1, 3, 3, 2, 1]
    dilations = ((2, 1, 1), (1, 2, 1), (1, 1, 2))
    for dil in dilations:
        for mode in ('valid', 'half', 'full', (2, 1, 3)):
            self.run_conv_valid(inputs_shape=inputs_shape,
                                filters_shape=filters_shape,
                                filter_dilation=dil,
                                border_mode=mode,
                                verify_grad=True)
def
run_gradweight
(
self
,
inputs_shape
,
filters_shape
,
dCdH_shape
,
subsample
=
(
1
,
1
,
1
)):
inputs_val
=
numpy
.
random
.
random
(
inputs_shape
)
.
astype
(
'float32'
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论