Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
a725adf3
提交
a725adf3
authored
8月 18, 2014
作者:
f0k
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Refactored GpuCorrMM to be split into separate ops for the forward pass and the two backward passes
上级
e76a29d9
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
375 行增加
和
227 行删除
+375
-227
blas.py
theano/sandbox/cuda/blas.py
+223
-111
conv_gemm.cu
theano/sandbox/cuda/conv_gemm.cu
+134
-103
opt.py
theano/sandbox/cuda/opt.py
+13
-8
test_conv_cuda_ndarray.py
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+5
-5
没有找到文件。
theano/sandbox/cuda/blas.py
浏览文件 @
a725adf3
...
@@ -8,6 +8,7 @@ from theano.compat.six import StringIO
...
@@ -8,6 +8,7 @@ from theano.compat.six import StringIO
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda
import
GpuOp
from
theano.sandbox.cuda
import
GpuOp
from
theano.sandbox.cuda
import
as_cuda_ndarray_variable
from
theano.sandbox.cuda
import
as_cuda_ndarray_variable
from
theano.sandbox.cuda.basic_ops
import
gpu_contiguous
class
GpuDot22
(
GpuOp
):
class
GpuDot22
(
GpuOp
):
...
@@ -500,60 +501,22 @@ gpu_ger_no_inplace = GpuGer(inplace=False)
...
@@ -500,60 +501,22 @@ gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace
=
GpuGer
(
inplace
=
True
)
gpu_ger_inplace
=
GpuGer
(
inplace
=
True
)
class
GpuCorrMM
(
GpuOp
):
class
BaseGpuCorrMM
(
GpuOp
):
"""GPU correlation/convolution implementation using Matrix Multiplication.
"""Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
`GpuCorrMM_gradInputs`. Cannot be used directly."""
:note: It doesn't implement the grad. So you shouldn't use it directly, but
def
__init__
(
self
,
border_mode
=
"valid"
,
use :func:`conv2d <theano.tensor.nnet.conv.conv2d>` and then enable the
Theano flag ``optimizer_including=conv_gemm`` to automatically replace
all convolution operations with `GpuCorrMM`.
"""
def
__init__
(
self
,
border_mode
,
subsample
=
(
1
,
1
),
subsample
=
(
1
,
1
),
pad
=
(
0
,
0
)):
pad
=
(
0
,
0
)):
"""
if
border_mode
!=
"valid"
:
:param border_mode: "valid" or "full"
raise
ValueError
(
"border_mode must be 'valid'"
)
:param subsample: the subsample operation applied to each output image.
Should be a tuple with 2 elements.
(sv, sh) is equivalent to GpuCorrMM(...)(...)[:,:,::sv, ::sh]
If border_mode="full", this is instead treated as an upsampling
operation applied to each input image.
Set to (1, 1) to disable downsampling/upsampling.
:param pad: the width of a border of implicit zeros to pad the input
image with. Should be a tuple with 2 elements giving the numbers of
rows and columns to pad on each side, or "auto" to set the padding
to (kernel_rows - 1, kernel_columns - 1) at runtime.
If border_mode="full", this is instead treated as the width of a
border to crop from the output image.
Set to (0, 0) to disable padding/cropping.
:note: The border_mode changes the meaning of several parameters.
If border_mode="valid", the Op does a valid correlation of a padded
input image and subsamples it. (To perform a convolution instead,
you will need to flip the kernels.)
If border_mode="full", the Op does a full convolution of an
upsampled input image and crops it. (This can be used as a backward
pass of the valid correlation done with border_mode="valid".)
Combined with pad="auto", you can use border_mode="valid" to
simulate a full correlation with subsampling, or border_mode="full"
to simulate a valid convolution with upsampling.
:note: Currently, the Op requires a very specific memory layout.
For border_mode="valid", inputs, filters and outputs must be
C-contiguous. For border_mode="full", the same applies, except that
the strides of the first two dimensions of the filters (output and
input channels) must be swapped compared to C-contiguity.
"""
self
.
border_mode
=
border_mode
self
.
border_mode
=
border_mode
if
len
(
subsample
)
!=
2
:
raise
ValueError
(
"subsample must have two elements"
)
self
.
subsample
=
subsample
self
.
subsample
=
subsample
#if (border_mode == "full") and (subsample != (1,1)):
if
(
pad
!=
"auto"
)
and
(
len
(
pad
)
!=
2
):
# raise NotImplementedError(
raise
ValueError
(
"pad must be 'auto' or have two elements"
)
# "GpuCorrMM doesn't support subsampling for border_mode='full'")
self
.
pad
=
pad
self
.
pad
=
pad
#if (border_mode == "full") and (pad != (0,0)):
# raise NotImplementedError(
# "GpuCorrMM doesn't support padding for border_mode='full'")
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
\
return
type
(
self
)
==
type
(
other
)
\
...
@@ -576,34 +539,19 @@ class GpuCorrMM(GpuOp):
...
@@ -576,34 +539,19 @@ class GpuCorrMM(GpuOp):
str
(
self
.
subsample
),
str
(
self
.
subsample
),
self
.
pad
)
self
.
pad
)
def
make_node
(
self
,
img
,
kern
):
def
flops
(
self
,
inp
,
outp
):
img
=
as_cuda_ndarray_variable
(
img
)
""" Useful with the hack in profilemode to print the MFlops"""
kern
=
as_cuda_ndarray_variable
(
kern
)
# if the output shape is correct, then this gives the correct
if
img
.
type
.
ndim
!=
4
:
# flops for any direction, sampling, padding, and border mode
raise
TypeError
(
'img must be 4D tensor'
)
inputs
,
filters
=
inp
if
kern
.
type
.
ndim
!=
4
:
outputs
,
=
outp
raise
TypeError
(
'kern must be 4D tensor'
)
assert
inputs
[
1
]
==
filters
[
1
]
# nb mul and add by output pixel
broadcastable
=
[
img
.
type
.
broadcastable
[
0
],
kern
.
type
.
broadcastable
[
0
],
flops
=
filters
[
2
]
*
filters
[
3
]
*
2
False
,
False
]
# nb flops by output image
return
Apply
(
self
,
[
img
,
kern
],
[
CudaNdarrayType
(
broadcastable
)()])
flops
*=
outputs
[
2
]
*
outputs
[
3
]
# nb patch multiplied
def
flops
(
self
,
inputs
,
outputs
):
flops
*=
inputs
[
1
]
*
filters
[
0
]
*
inputs
[
0
]
images
,
kerns
=
inputs
out
,
=
outputs
assert
images
[
1
]
==
kerns
[
1
]
flops
=
0
if
self
.
border_mode
==
"valid"
:
# nb mul and add by output pixel
flops
=
kerns
[
2
]
*
kerns
[
3
]
*
2
# nb flops by output image
flops
*=
out
[
2
]
*
out
[
3
]
# nb patch multiplied
flops
*=
images
[
1
]
*
kerns
[
0
]
*
images
[
0
]
else
:
flops
=
(
images
[
0
]
*
kerns
[
0
]
*
images
[
1
]
*
kerns
[
2
]
*
kerns
[
3
]
*
images
[
2
]
*
images
[
3
]
*
2
)
return
flops
return
flops
def
c_headers
(
self
):
def
c_headers
(
self
):
...
@@ -621,61 +569,98 @@ class GpuCorrMM(GpuOp):
...
@@ -621,61 +569,98 @@ class GpuCorrMM(GpuOp):
for
f
in
files
]
for
f
in
files
]
return
reduce
(
str
.
__add__
,
codes
)
return
reduce
(
str
.
__add__
,
codes
)
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
def
c_code
(
self
,
bottom
,
weights
,
top
,
direction
,
sub
):
img
,
kern
=
inp
# This is the shared code for GpuCorrMM (direction="forward"),
out
,
=
out_
# GpuCorrMM_gradWeights (direction="backprop weights"), and
dx
=
self
.
subsample
[
0
]
# GpuCorrMM_gradInputs (direction="backprop inputs").
dy
=
self
.
subsample
[
1
]
# Depending on the direction, one of bottom, weights, top will
# receive the output, while the other two serve as inputs.
if
self
.
border_mode
!=
"valid"
:
raise
ValueError
(
"mode must be 'valid'"
)
dH
,
dW
=
self
.
subsample
if
self
.
pad
==
"auto"
:
if
self
.
pad
==
"auto"
:
padH
=
padW
=
-
1
padH
=
padW
=
-
1
else
:
else
:
padH
=
self
.
pad
[
0
]
padH
,
padW
=
self
.
pad
padW
=
self
.
pad
[
1
]
if
direction
==
"forward"
:
if
self
.
border_mode
==
"valid"
:
direction
=
0
bmode
=
1
out
=
top
elif
self
.
border_mode
==
"full"
:
elif
direction
==
"backprop weights"
:
bmode
=
0
direction
=
1
else
:
out
=
weights
raise
ValueError
(
"mode must be one of 'full' or 'valid'"
)
elif
direction
==
"backprop inputs"
:
direction
=
2
out
=
bottom
sub
=
sub
.
copy
()
sub
=
sub
.
copy
()
sub
.
update
(
locals
())
sub
.
update
(
locals
())
return
"""
return
"""
//Mandatory args
//
Mandatory args
int
mode =
%(bmode)
s;
int
direction =
%(direction)
s; // forward, bprop weights, bprop inputs
//Optional args
//
Optional args
int d
x =
%(dx
)
s;
int d
H =
%(dH
)
s;
int d
y =
%(dy
)
s;
int d
W =
%(dW
)
s;
int padH =
%(padH)
s;
int padH =
%(padH)
s;
int padW =
%(padW)
s;
int padW =
%(padW)
s;
CudaNdarray * img =
%(img)
s;
CudaNdarray * bottom =
%(bottom)
s;
CudaNdarray * kern =
%(kern)
s;
CudaNdarray * weights =
%(weights)
s;
CudaNdarray * top =
%(top)
s;
CudaNdarray * out2 = NULL;
CudaNdarray * out2 = NULL;
//Auto-padding if requested
// Obtain or infer kernel width and height
int kH, kW;
if (direction != 1) {
kH = CudaNdarray_HOST_DIMS(weights)[2];
kW = CudaNdarray_HOST_DIMS(weights)[3];
}
else {
kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
}
// Auto-padding if requested
if (padH < 0) {
if (padH < 0) {
padH =
CudaNdarray_HOST_DIMS(kern)[2]
- 1;
padH =
kH
- 1;
}
}
if (padW < 0) {
if (padW < 0) {
padW =
CudaNdarray_HOST_DIMS(kern)[3]
- 1;
padW =
kW
- 1;
}
}
// Infer output shape
int out_dim[4];
int out_dim[4];
out_dim[0] = CudaNdarray_HOST_DIMS(img)[0];
switch(direction) {
out_dim[1] = CudaNdarray_HOST_DIMS(kern)[0];
case 0: // forward pass
if (mode == 1) // valid correlation with padding and subsampling
// output is top: (batchsize, num_filters, height, width)
{
// height and width: top = (bottom + 2*pad - weight) / sample + 1
out_dim[2] = ceil_intdiv(CudaNdarray_HOST_DIMS(img)[2] + 2*padH - CudaNdarray_HOST_DIMS(kern)[2] + 1, dx);
out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[3] = ceil_intdiv(CudaNdarray_HOST_DIMS(img)[3] + 2*padW - CudaNdarray_HOST_DIMS(kern)[3] + 1, dy);
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
}
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1;
else // full convolution with upsampling and cropping
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1;
{
break;
out_dim[2] = (CudaNdarray_HOST_DIMS(img)[2] - 1) * dx + CudaNdarray_HOST_DIMS(kern)[2] - 2*padH;
case 1: // backprop wrt. weights
out_dim[3] = (CudaNdarray_HOST_DIMS(img)[3] - 1) * dy + CudaNdarray_HOST_DIMS(kern)[3] - 2*padW;
// output is weights: (num_filters, num_channels, height, width)
// height and width: weights = bottom + 2*pad - (top - 1) * sample
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[2] = kH; // already inferred further above
out_dim[3] = kW; // how convenient
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + weights - 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH;
out_dim[3] = (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW;
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2
\\
n");
%(fail)
s
}
}
// Prepare output array
if ( !(
%(out)
s
if ( !(
%(out)
s
&&
%(out)
s->nd==4
&&
%(out)
s->nd==4
&& CudaNdarray_is_c_contiguous(
%(out)
s)
&& CudaNdarray_is_c_contiguous(
%(out)
s)
...
@@ -688,7 +673,8 @@ class GpuCorrMM(GpuOp):
...
@@ -688,7 +673,8 @@ class GpuCorrMM(GpuOp):
%(out)
s = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
%(out)
s = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
}
}
out2 = corrMM(
%(img)
s,
%(kern)
s,
%(out)
s, mode, dx, dy, padH, padW);
// Call CUDA code
out2 = corrMM(
%(bottom)
s,
%(weights)
s,
%(top)
s, direction, dH, dW, padH, padW);
if (out2==NULL){
if (out2==NULL){
%(fail)
s
%(fail)
s
}
}
...
@@ -697,6 +683,132 @@ class GpuCorrMM(GpuOp):
...
@@ -697,6 +683,132 @@ class GpuCorrMM(GpuOp):
"""
%
sub
"""
%
sub
class
GpuCorrMM
(
BaseGpuCorrMM
):
"""GPU correlation implementation using Matrix Multiplication.
:note: You can either enable the Theano flag `optimizer_including=conv_gemm`
to automatically replace all convolution operations with `GpuCorrMM`
or one of its gradients, or you can use it as a replacement for
:func:`conv2d <theano.tensor.nnet.conv.conv2d>`, called as
`GpuCorrMM(subsample=...)(image, filters)`. The latter is currently
faster, but note that it computes a correlation -- if you need to
compute a convolution, flip the filters as `filters[:,:,::-1,::-1]`.
"""
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
),
pad
=
(
0
,
0
)):
"""
:param border_mode: currently supports "valid" only; "full" can be
simulated by setting `pad="auto"` (at the cost of performance), or
by using `GpuCorrMM_gradInputs`
:param subsample: the subsample operation applied to each output image.
Should be a tuple with 2 elements.
`(sv, sh)` is equivalent to `GpuCorrMM(...)(...)[:,:,::sv, ::sh]`,
but faster.
Set to `(1, 1)` to disable subsampling.
:param pad: the width of a border of implicit zeros to pad the input
image with. Should be a tuple with 2 elements giving the numbers of
rows and columns to pad on each side, or "auto" to set the padding
to `(kernel_rows - 1, kernel_columns - 1)` at runtime.
Set to `(0, 0)` to disable padding.
:note: Currently, the Op requires the inputs, filters and outputs to be
C-contiguous. Use :func:`gpu_contiguous
<theano.sandbox.cuda.basic_ops.gpu_contiguous>` on these arguments
if needed.
"""
super
(
GpuCorrMM
,
self
)
.
__init__
(
border_mode
,
subsample
,
pad
)
def
make_node
(
self
,
img
,
kern
):
img
=
as_cuda_ndarray_variable
(
img
)
kern
=
as_cuda_ndarray_variable
(
kern
)
if
img
.
type
.
ndim
!=
4
:
raise
TypeError
(
'img must be 4D tensor'
)
if
kern
.
type
.
ndim
!=
4
:
raise
TypeError
(
'kern must be 4D tensor'
)
broadcastable
=
[
img
.
type
.
broadcastable
[
0
],
kern
.
type
.
broadcastable
[
0
],
False
,
False
]
return
Apply
(
self
,
[
img
,
kern
],
[
CudaNdarrayType
(
broadcastable
)()])
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
bottom
,
weights
=
inp
top
,
=
out_
direction
=
"forward"
return
super
(
GpuCorrMM
,
self
)
.
c_code
(
bottom
,
weights
,
top
,
direction
,
sub
)
def
grad
(
self
,
inp
,
grads
):
bottom
,
weights
=
inp
top
,
=
grads
top
=
gpu_contiguous
(
top
)
d_bottom
=
GpuCorrMM_gradInputs
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
weights
,
top
)
d_weights
=
GpuCorrMM_gradWeights
(
self
.
border_mode
,
self
.
subsample
,
self
.
pad
)(
bottom
,
top
)
return
d_bottom
,
d_weights
class
GpuCorrMM_gradWeights
(
BaseGpuCorrMM
):
"""Gradient wrt. filters for `GpuCorrMM`.
:note: You will not want to use this directly, but rely on Theano's
automatic differentiation or graph optimization to use it as needed."""
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
),
pad
=
(
0
,
0
)):
super
(
GpuCorrMM_gradWeights
,
self
)
.
__init__
(
border_mode
,
subsample
,
pad
)
def
make_node
(
self
,
img
,
topgrad
):
img
=
as_cuda_ndarray_variable
(
img
)
topgrad
=
as_cuda_ndarray_variable
(
topgrad
)
if
img
.
type
.
ndim
!=
4
:
raise
TypeError
(
'img must be 4D tensor'
)
if
topgrad
.
type
.
ndim
!=
4
:
raise
TypeError
(
'topgrad must be 4D tensor'
)
broadcastable
=
[
topgrad
.
type
.
broadcastable
[
1
],
img
.
type
.
broadcastable
[
1
],
False
,
False
]
return
Apply
(
self
,
[
img
,
topgrad
],
[
CudaNdarrayType
(
broadcastable
)()])
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
bottom
,
top
=
inp
weights
,
=
out_
direction
=
"backprop weights"
return
super
(
GpuCorrMM_gradWeights
,
self
)
.
c_code
(
bottom
,
weights
,
top
,
direction
,
sub
)
class
GpuCorrMM_gradInputs
(
BaseGpuCorrMM
):
"""Gradient wrt. inputs for `GpuCorrMM`.
:note: You will not want to use this directly, but rely on Theano's
automatic differentiation or graph optimization to use it as needed."""
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
),
pad
=
(
0
,
0
)):
super
(
GpuCorrMM_gradInputs
,
self
)
.
__init__
(
border_mode
,
subsample
,
pad
)
def
make_node
(
self
,
kern
,
topgrad
):
kern
=
as_cuda_ndarray_variable
(
kern
)
topgrad
=
as_cuda_ndarray_variable
(
topgrad
)
if
kern
.
type
.
ndim
!=
4
:
raise
TypeError
(
'kern must be 4D tensor'
)
if
topgrad
.
type
.
ndim
!=
4
:
raise
TypeError
(
'topgrad must be 4D tensor'
)
broadcastable
=
[
topgrad
.
type
.
broadcastable
[
0
],
kern
.
type
.
broadcastable
[
1
],
False
,
False
]
return
Apply
(
self
,
[
kern
,
topgrad
],
[
CudaNdarrayType
(
broadcastable
)()])
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
weights
,
top
=
inp
bottom
,
=
out_
direction
=
"backprop inputs"
return
super
(
GpuCorrMM_gradInputs
,
self
)
.
c_code
(
bottom
,
weights
,
top
,
direction
,
sub
)
##
##
# Not really a BLAS operation, but whatever.
# Not really a BLAS operation, but whatever.
#
#
...
...
theano/sandbox/cuda/conv_gemm.cu
浏览文件 @
a725adf3
...
@@ -161,18 +161,18 @@ void col2im(const float* data_col, const int channels,
...
@@ -161,18 +161,18 @@ void col2im(const float* data_col, const int channels,
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
CudaNdarray* corrMM(
const CudaNdarray *input,
CudaNdarray* corrMM(
CudaNdarray *const bottom,
CudaNdarray *weight,
CudaNdarray *
const
weight,
CudaNdarray *
output
,
CudaNdarray *
const top
,
int mode
,
const int direction
,
int dH = 1,
const
int dH = 1,
int dW = 1,
const
int dW = 1,
int padH = 0,
const
int padH = 0,
int padW = 0)
const
int padW = 0)
{
{
if (
input
->nd != 4)
if (
bottom
->nd != 4)
{
{
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires
input
of 4D");
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires
bottom
of 4D");
}
}
if (weight->nd != 4)
if (weight->nd != 4)
...
@@ -180,83 +180,75 @@ CudaNdarray* corrMM(const CudaNdarray *input,
...
@@ -180,83 +180,75 @@ CudaNdarray* corrMM(const CudaNdarray *input,
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
}
}
if (
output
->nd != 4)
if (
top
->nd != 4)
{
{
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires
output
of 4D");
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires
top
of 4D");
}
}
// Extract some shape information for later and check shape consistency
// Extract some shape information for later and check shape consistency
//
inputs: (batchSize, nInputPlane, inputHeight, input
Width)
//
bottom: (batchSize, nChannels, bottomHeight, bottom
Width)
const int batchSize = CudaNdarray_HOST_DIMS(
input
)[0];
const int batchSize = CudaNdarray_HOST_DIMS(
bottom
)[0];
const int n
InputPlane = CudaNdarray_HOST_DIMS(input
)[1];
const int n
Channels = CudaNdarray_HOST_DIMS(bottom
)[1];
const int
inputHeight = CudaNdarray_HOST_DIMS(input
)[2];
const int
bottomHeight = CudaNdarray_HOST_DIMS(bottom
)[2];
const int
inputWidth = CudaNdarray_HOST_DIMS(input
)[3];
const int
bottomWidth = CudaNdarray_HOST_DIMS(bottom
)[3];
//
filters: (nOutputPlane, nInputPlane
, rows, columns)
//
weights: (nFilters, nChannels
, rows, columns)
const int n
OutputPlane
= CudaNdarray_HOST_DIMS(weight)[0];
const int n
Filters
= CudaNdarray_HOST_DIMS(weight)[0];
const int kH = CudaNdarray_HOST_DIMS(weight)[2];
const int kH = CudaNdarray_HOST_DIMS(weight)[2];
const int kW = CudaNdarray_HOST_DIMS(weight)[3];
const int kW = CudaNdarray_HOST_DIMS(weight)[3];
if (n
InputPlane
!= CudaNdarray_HOST_DIMS(weight)[1]) {
if (n
Channels
!= CudaNdarray_HOST_DIMS(weight)[1]) {
PyErr_SetString(PyExc_ValueError,
PyErr_SetString(PyExc_ValueError,
"GpuCorrMM images and kernel must have the same stack size\n");
"GpuCorrMM images and kernel must have the same stack size\n");
return NULL;
return NULL;
}
}
// outputs: (batchSize, nOutputPlane, outputHeight, outputWidth)
// top: (batchSize, nFilters, topHeight, topWidth)
int outputHeight, outputWidth;
const int topHeight = (bottomHeight + 2*padH - kH) / dH + 1;
if (mode == 1) { // valid correlation with padding and subsampling
const int topWidth = (bottomWidth + 2*padW - kW) / dW + 1;
outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
}
topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
else if (mode == 0) { // full convolution with upsampling and cropping
topWidth != CudaNdarray_HOST_DIMS(top)[3]) {
// these would be the shapes for a standard full convolution:
//outputHeight = (inputHeight + 2*padH + kH - 2) / dH + 1;
//outputWidth = (inputWidth + 2*padW + kW - 2) / dW + 1;
// but here, dH and dW are *upsampling* factors, and padding is reversed
// (because the implementation was meant as a backward pass for a CNN)
outputHeight = (inputHeight - 1) * dH + kH - 2*padH;
outputWidth = (inputWidth - 1) * dW + kW - 2*padW;
}
if (batchSize != CudaNdarray_HOST_DIMS(output)[0] ||
nOutputPlane != CudaNdarray_HOST_DIMS(output)[1] ||
outputHeight != CudaNdarray_HOST_DIMS(output)[2] ||
outputWidth != CudaNdarray_HOST_DIMS(output)[3]) {
PyErr_Format(PyExc_ValueError,
PyErr_Format(PyExc_ValueError,
"GpuCorrMM output parameter has wrong shape %d %d %d %d, expected %d %d %d %d\n",
"GpuCorrMM shape inconsistency: From bottom and weights, "
CudaNdarray_HOST_DIMS(output)[0], CudaNdarray_HOST_DIMS(output)[1],
"top shape should be %d %d %d %d, but is %d %d %d %d.\n",
CudaNdarray_HOST_DIMS(output)[2], CudaNdarray_HOST_DIMS(output)[3],
batchSize, nFilters, topHeight, topWidth,
batchSize, nOutputPlane, outputHeight, outputWidth);
CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3]);
return NULL;
return NULL;
}
}
if (mode == 1) { // valid correlation: im2col, then gemm
// Create temporary columns
// Create temporary columns (col_data)
int col_dim[2];
int col_dim[2];
col_dim[0] = nChannels * kW * kH;
col_dim[0] = nInputPlane * kW * kH;
col_dim[1] = topHeight * topWidth;
col_dim[1] = outputHeight * outputWidth;
CudaNdarray* col = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
CudaNdarray* col_data = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
// Define some useful variables
const int bottom_stride = CudaNdarray_HOST_STRIDES(bottom)[0];
const int top_stride = CudaNdarray_HOST_STRIDES(top)[0];
const int K_ = col_dim[0];
const int N_ = col_dim[1];
const int M_ = nFilters;
const float one = 1.0f;
const float zero = 0.0f;
// Define some useful variables
CudaNdarray *output;
const int ip_stride = CudaNdarray_HOST_STRIDES(input)[0];
if (direction == 0) { // forward pass
const int op_stride = CudaNdarray_HOST_STRIDES(output)[0];
output = top;
const int K_ = col_dim[0];
// valid correlation: im2col, then gemm
const int N_ = col_dim[1];
const int M_ = nOutputPlane;
const float alpha = 1.0f;
const float beta = 0.0f;
// Iterate over batch
// Iterate over batch
for (int n = 0; n < batchSize; n++) {
for (int n = 0; n < batchSize; n++) {
// First, im2col
// First, im2col
im2col(
input->devdata + n * ip_stride, nInputPlane, input
Height,
im2col(
bottom->devdata + n * bottom_stride, nChannels, bottom
Height,
inputWidth, kH, kW, padH, padW, dH, dW, col_data
->devdata);
bottomWidth, kH, kW, padH, padW, dH, dW, col
->devdata);
// Second, gemm
// Second, gemm
cublasStatus_t status = cublasSgemm(handle,
cublasStatus_t status = cublasSgemm(handle,
CUBLAS_OP_N, CUBLAS_OP_N,
CUBLAS_OP_N, CUBLAS_OP_N,
N_, M_, K_,
N_, M_, K_,
&
alpha
,
&
one
,
col
_data
->devdata, N_,
col->devdata, N_,
weight->devdata, K_,
weight->devdata, K_,
&
beta
,
&
zero
,
output->devdata + n *
op_stride, N_);
top->devdata + n * t
op_stride, N_);
if (status != CUBLAS_STATUS_SUCCESS) {
if (status != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUBLAS error: %s\n",
"GpuCorrMM encountered a CUBLAS error: %s\n",
...
@@ -264,17 +256,11 @@ CudaNdarray* corrMM(const CudaNdarray *input,
...
@@ -264,17 +256,11 @@ CudaNdarray* corrMM(const CudaNdarray *input,
return NULL;
return NULL;
}
}
}
}
// Free temporary columns
Py_DECREF(col_data);
/*
/*
// Original caffe code for comparison
// Original caffe code for comparison
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is for grouped convolution; we can ignore groups
// Note that this is for grouped convolution; we can ignore groups here,
const Dtype* bottom_data = bottom[i]->gpu_data();
// but the group-related offsets help explain what M_, N_ and K_ are
Dtype* top_data = (*top)[i]->mutable_gpu_data();
Dtype* col_data = col_buffer_.mutable_gpu_data();
const Dtype* weight = this->blobs_[0]->gpu_data();
int weight_offset = M_ * K_;
int weight_offset = M_ * K_;
int col_offset = K_ * N_;
int col_offset = K_ * N_;
int top_offset = M_ * N_;
int top_offset = M_ * N_;
...
@@ -300,33 +286,81 @@ CudaNdarray* corrMM(const CudaNdarray *input,
...
@@ -300,33 +286,81 @@ CudaNdarray* corrMM(const CudaNdarray *input,
}
}
*/
*/
}
}
else if (mode == 0) { // full convolution: gemm, then col2im
else if (direction == 1) { // backprop wrt. weights
// Create temporary columns (col_diff)
output = weight;
int col_dim[2];
// valid convolution: im2col, then gemm
col_dim[0] = nOutputPlane * kW * kH;
// Initialize target with zeros as we will accumulate into it
col_dim[1] = inputHeight * inputWidth;
// (all kernels run on the null stream, so we don't need to synchronize)
CudaNdarray* col_diff = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
cudaError_t err = cudaMemsetAsync(weight->devdata, 0,
sizeof(float) * M_ * K_);
// Define some useful variables
if (err != cudaSuccess) {
const int ip_stride = CudaNdarray_HOST_STRIDES(input)[0];
PyErr_Format(PyExc_RuntimeError,
const int op_stride = CudaNdarray_HOST_STRIDES(output)[0];
"GpuCorrMM encountered a CUDA error: %s\n",
const int K_ = col_dim[0];
cudaGetErrorString(err));
const int N_ = col_dim[1];
return NULL;
const int M_ = nInputPlane;
}
const float alpha = 1.0f;
// Iterate over batch
const float beta = 0.0f;
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
// Second, gemm
cublasStatus_t status = cublasSgemm(handle,
CUBLAS_OP_T, CUBLAS_OP_N,
K_, M_, N_,
&one,
col->devdata, N_,
top->devdata + n * top_stride, N_,
&one,
weight->devdata, K_);
if (status != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUBLAS error: %s\n",
cublasGetErrorString(status));
return NULL;
}
}
/*
// Original caffe code for comparison
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is for grouped convolution; we can ignore groups
for (int n = 0; n < num_; ++n) {
// Since we saved memory in the forward pass by not storing all col
// data, we will need to recompute them.
im2col_gpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
stride_h_, stride_w_, col_data);
// gradient w.r.t. weight. Note that we will accumulate diffs.
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
(Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
col_data + col_offset * g, (Dtype)1.,
weight_diff + weight_offset * g);
== (see https://github.com/BVLC/caffe/blob/master/src/caffe/util/math_functions.cu#L16)
cublasSgemm(CUBLAS_OP_T, CUBLAS_OP_N, K_, M_, N_,
1.0,
col_data + col_offset * g, N_,
top_diff + top[i]->offset(n) + top_offset * g, N_,
1.0,
weight_diff + weight_offset * g, K_);
}
}
*/
}
else if (direction == 2) { // backprop wrt. inputs
output = bottom;
// full convolution: gemm, then col2im
// Iterate over batch
// Iterate over batch
for (int n = 0; n < batchSize; n++) {
for (int n = 0; n < batchSize; n++) {
// gemm into columns
// gemm into columns
cublasStatus_t status = cublasSgemm(handle,
cublasStatus_t status = cublasSgemm(handle,
CUBLAS_OP_N, CUBLAS_OP_T,
CUBLAS_OP_N, CUBLAS_OP_T,
N_, K_, M_,
N_, K_, M_,
&
alpha
,
&
one
,
input->devdata + n * i
p_stride, N_,
top->devdata + n * to
p_stride, N_,
weight->devdata, K_,
weight->devdata, K_,
&
beta
,
&
zero
,
col
_diff
->devdata, N_);
col->devdata, N_);
if (status != CUBLAS_STATUS_SUCCESS) {
if (status != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUBLAS error: %s\n",
"GpuCorrMM encountered a CUBLAS error: %s\n",
...
@@ -334,22 +368,15 @@ CudaNdarray* corrMM(const CudaNdarray *input,
...
@@ -334,22 +368,15 @@ CudaNdarray* corrMM(const CudaNdarray *input,
return NULL;
return NULL;
}
}
// col2im back to the data
// col2im back to the data
col2im(col
_diff->devdata, nOutputPlane, outputHeight, output
Width,
col2im(col
->devdata, nChannels, bottomHeight, bottom
Width,
kH, kW, padH, padW, dH, dW,
output->devdata + n * op
_stride);
kH, kW, padH, padW, dH, dW,
bottom->devdata + n * bottom
_stride);
}
}
// Free temporary columns
Py_DECREF(col_diff);
/*
/*
// Original caffe code for comparison
// Original caffe code for comparison
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is the backward pass of a valid convolution, so
// top_diff is the input, bottom_diff is the output, weights are weights
Dtype* col_data = col_buffer_.mutable_gpu_data();
Dtype* col_diff = col_buffer_.mutable_gpu_diff();
Dtype* bottom_diff = (*bottom)[i]->mutable_gpu_diff();
for (int n = 0; n < num_; ++n) {
for (int n = 0; n < num_; ++n) {
// gradient w.r.t. bottom data, if necessary
// gradient w.r.t. bottom data, if necessary
if (propagate_down[i]) {
for (int g = 0; g < group_; ++g) {
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
(Dtype)1., weight + weight_offset * g,
(Dtype)1., weight + weight_offset * g,
...
@@ -367,9 +394,13 @@ CudaNdarray* corrMM(const CudaNdarray *input,
...
@@ -367,9 +394,13 @@ CudaNdarray* corrMM(const CudaNdarray *input,
col2im_gpu(col_diff, channels_, height_, width_,
col2im_gpu(col_diff, channels_, height_, width_,
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
bottom_diff + (*bottom)[i]->offset(n));
bottom_diff + (*bottom)[i]->offset(n));
}
}
}
*/
*/
}
}
// Free temporary columns
Py_DECREF(col);
return output;
return output;
}
}
theano/sandbox/cuda/opt.py
浏览文件 @
a725adf3
...
@@ -25,7 +25,8 @@ from theano.sandbox.cuda.basic_ops import (
...
@@ -25,7 +25,8 @@ from theano.sandbox.cuda.basic_ops import (
GpuIncSubtensor
,
gpu_alloc
,
GpuAlloc
,
gpu_shape
)
GpuIncSubtensor
,
gpu_alloc
,
GpuAlloc
,
gpu_shape
)
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda.blas
import
(
gpu_dot22
,
gpu_dot22scalar
,
from
theano.sandbox.cuda.blas
import
(
gpu_dot22
,
gpu_dot22scalar
,
gpu_gemm_inplace
,
gpu_gemm_no_inplace
,
GpuConv
,
GpuCorrMM
)
gpu_gemm_inplace
,
gpu_gemm_no_inplace
,
GpuConv
,
GpuCorrMM
,
GpuCorrMM_gradInputs
,
GpuCorrMM_gradWeights
)
from
theano.sandbox.cuda.blas
import
gpu_gemv_inplace
from
theano.sandbox.cuda.blas
import
gpu_gemv_inplace
from
theano.sandbox.cuda.blas
import
gpu_gemv_no_inplace
from
theano.sandbox.cuda.blas
import
gpu_gemv_no_inplace
from
theano.sandbox.cuda.blas
import
gpu_ger_inplace
from
theano.sandbox.cuda.blas
import
gpu_ger_inplace
...
@@ -1354,19 +1355,23 @@ def local_conv_gemm(node):
...
@@ -1354,19 +1355,23 @@ def local_conv_gemm(node):
border_mode
=
node
.
op
.
border_mode
border_mode
=
node
.
op
.
border_mode
subsample
=
node
.
op
.
subsample
subsample
=
node
.
op
.
subsample
pad
=
(
0
,
0
)
pad
=
(
0
,
0
)
if
(
border_mode
==
'full'
)
and
(
(
subsample
!=
(
1
,
1
))
or
(
pad
!=
(
0
,
0
)
)):
if
(
border_mode
==
'full'
)
and
(
subsample
!=
(
1
,
1
)):
# need to simulate this via a padded valid convolution
# need to simulate this via a padded valid convolution
pad
=
'auto'
pad
=
'auto'
border_mode
=
'valid'
border_mode
=
'valid'
if
(
border_mode
==
'valid'
):
if
(
border_mode
==
'valid'
):
# need to flip the kernel for valid convolution
# need to flip the kernel for valid convolution
kern
=
gpu_contiguous
(
kern
[:,
:,
::
-
1
,
::
-
1
])
kern
=
kern
[:,
:,
::
-
1
,
::
-
1
]
# call GpuCorrMM
# TODO: call GpuCorrMM_gradWeights instead if appropriate
return
[
GpuCorrMM
(
'valid'
,
subsample
,
pad
)(
gpu_contiguous
(
img
),
gpu_contiguous
(
kern
))]
elif
(
border_mode
==
'full'
):
elif
(
border_mode
==
'full'
):
# need to
bring kernel into correct memory layout
for full convolution
# need to
dimshuffle the kernel
for full convolution
kern
=
gpu_contiguous
(
kern
.
dimshuffle
(
1
,
0
,
2
,
3
))
.
dimshuffle
(
1
,
0
,
2
,
3
)
kern
=
kern
.
dimshuffle
(
1
,
0
,
2
,
3
)
# need C-contiguous i
nputs
# call GpuCorrMM_gradI
nputs
img
=
gpu_contiguous
(
img
)
return
[
GpuCorrMM_gradInputs
(
'valid'
,
subsample
,
pad
)(
return
[
GpuCorrMM
(
border_mode
,
subsample
,
pad
)(
img
,
kern
)]
gpu_contiguous
(
kern
),
gpu_contiguous
(
img
)
)]
gpu_optimizer
.
register
(
"conv_gemm"
,
local_conv_gemm
)
gpu_optimizer
.
register
(
"conv_gemm"
,
local_conv_gemm
)
...
...
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
浏览文件 @
a725adf3
...
@@ -186,7 +186,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
...
@@ -186,7 +186,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
if
cls
is
not
None
:
if
cls
is
not
None
:
assert
any
([
isinstance
(
node
.
op
,
cls
)
assert
any
([
isinstance
(
node
.
op
,
cls
)
for
node
in
f
.
maker
.
fgraph
.
toposort
()]),
f
.
maker
.
fgraph
.
toposort
(
)
for
node
in
f
.
maker
.
fgraph
.
toposort
()]),
"Cannot find class
%
r in
%
r"
%
(
cls
,
f
.
maker
.
fgraph
.
toposort
()
)
gpuval
=
f
(
img
,
kern
)
gpuval
=
f
(
img
,
kern
)
t2
=
time
.
time
()
t2
=
time
.
time
()
for
i
in
range
(
nb_iter
):
for
i
in
range
(
nb_iter
):
...
@@ -284,7 +284,7 @@ def exec_conv(version, shapes, verbose, random, mode,
...
@@ -284,7 +284,7 @@ def exec_conv(version, shapes, verbose, random, mode,
cls
=
cls
)
cls
=
cls
)
except
Exception
,
e
:
except
Exception
,
e
:
print
ver
,
id
,
(
ishape
,
kshape
,
subshape
,
istride
,
kstride
)
print
ver
,
id
,
(
ishape
,
kshape
,
subshape
,
istride
,
kstride
)
print
e
print
"Exception"
,
type
(
e
),
e
pass
pass
if
not
ret
:
if
not
ret
:
failed_version
.
add
(
ver
)
failed_version
.
add
(
ver
)
...
@@ -634,7 +634,7 @@ def test_valid(conv_gemm=False):
...
@@ -634,7 +634,7 @@ def test_valid(conv_gemm=False):
if
conv_gemm
:
if
conv_gemm
:
# Test the GpuCorrMM version
# Test the GpuCorrMM version
mode
=
theano_mode
.
including
(
"conv_gemm"
)
mode
=
theano_mode
.
including
(
"conv_gemm"
)
cls
=
cuda
.
blas
.
GpuCorrMM
cls
=
cuda
.
blas
.
Base
GpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
# dummy version; not used by GpuCorrMM so one version is enough
version
=
[
-
1
]
version
=
[
-
1
]
# Add tests with strided inputs by still square images and filters.
# Add tests with strided inputs by still square images and filters.
...
@@ -713,7 +713,7 @@ def test_full(conv_gemm=False):
...
@@ -713,7 +713,7 @@ def test_full(conv_gemm=False):
if
conv_gemm
:
if
conv_gemm
:
# Test the GpuCorrMM version
# Test the GpuCorrMM version
mode
=
theano_mode
.
including
(
"conv_gemm"
)
mode
=
theano_mode
.
including
(
"conv_gemm"
)
cls
=
cuda
.
blas
.
GpuCorrMM
cls
=
cuda
.
blas
.
Base
GpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
# dummy version; not used by GpuCorrMM so one version is enough
version
=
[
-
1
]
version
=
[
-
1
]
else
:
else
:
...
@@ -753,7 +753,7 @@ def test_subsample(conv_gemm=False):
...
@@ -753,7 +753,7 @@ def test_subsample(conv_gemm=False):
if
conv_gemm
:
if
conv_gemm
:
# Test the GpuCorrMM version
# Test the GpuCorrMM version
mode
=
theano_mode
.
including
(
"conv_gemm"
)
mode
=
theano_mode
.
including
(
"conv_gemm"
)
cls
=
cuda
.
blas
.
GpuCorrMM
cls
=
cuda
.
blas
.
Base
GpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
# dummy version; not used by GpuCorrMM so one version is enough
version_valid
=
version_full
=
[
-
1
]
version_valid
=
version_full
=
[
-
1
]
else
:
else
:
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论