Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
0037c724
提交
0037c724
authored
8月 14, 2014
作者:
abergeron
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #2023 from stencilman/conv_gemm
Conv gemm non-square kernel support
上级
74e83a72
b5e340ba
显示空白字符变更
内嵌
并排
正在显示
7 个修改的文件
包含
125 行增加
和
118 行删除
+125
-118
conv.txt
doc/library/tensor/nnet/conv.txt
+8
-9
blas.py
theano/sandbox/cuda/blas.py
+16
-11
caffe_common.hpp
theano/sandbox/cuda/caffe_common.hpp
+0
-6
conv_gemm.cu
theano/sandbox/cuda/conv_gemm.cu
+43
-43
opt.py
theano/sandbox/cuda/opt.py
+2
-3
test_conv_cuda_ndarray.py
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+53
-45
Conv3D.py
theano/tensor/nnet/Conv3D.py
+3
-1
没有找到文件。
doc/library/tensor/nnet/conv.txt
浏览文件 @
0037c724
...
@@ -53,19 +53,18 @@ TODO: Give examples for how to use these things! They are pretty complicated.
...
@@ -53,19 +53,18 @@ TODO: Give examples for how to use these things! They are pretty complicated.
Also, there is restrictions on which shape are supported.
Also, there is restrictions on which shape are supported.
- :func:`GpuCorrMM <theano.sandbox.cuda.blas.GpuCorrMM>`
- :func:`GpuCorrMM <theano.sandbox.cuda.blas.GpuCorrMM>`
This is a GPU-only version of a correlation that computes correlations
This is a GPU-only version of a correlation that computes correlations
as `caffe <https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu>`
.
as `caffe <https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu>`
_.
For each element in a batch, it first creates a
For each element in a batch, it first creates a
Toeplitz<http://en.wikipedia.org/wiki/Toeplitz_matrix> matrix in a cuda kernel.
`Toeplitz <http://en.wikipedia.org/wiki/Toeplitz_matrix>`_ matrix in a cuda kernel.
Then, it performs a `
gemm` call to multiply this Toeplitz matrix and to the kernel.
Then, it performs a `
`gemm`` call to multiply this Toeplitz matrix and the kernel.
It need extra memory
for this, which is
the size of the Toeplitz matrix. Precisely,
It need extra memory
equal to
the size of the Toeplitz matrix. Precisely,
the dimensions of this
Toeplitz matrix is equal to
the dimensions of this
2D Toeplitz matrix is equal to
(no of channels * filter width * filter height, output width * output height)
.
``(no of channels * filter width * filter height, output width * output height)``
.
You can enable it for call to conv2d 2d by setting
'THEANO_FLAGS=optimizer_including=conv_gemm'
You can enable it for call to conv2d 2d by setting
``THEANO_FLAGS=optimizer_including=conv_gemm``
in your environment. This is not enabled by default because it
in your environment. This is not enabled by default because it
uses some extra memory.
It don't support strides for now and requires square kernels
.
uses some extra memory.
MM mean matrix multiply
.
.. autofunction:: theano.tensor.nnet.conv.conv2d
.. autofunction:: theano.tensor.nnet.conv.conv2d
.. autofunction:: theano.tensor.nnet.Conv3D.conv3D
.. autofunction:: theano.tensor.nnet.Conv3D.conv3D
.. autofunction:: theano.tensor.nnet.conv3d2d.conv3d
.. autofunction:: theano.tensor.nnet.conv3d2d.conv3d
.. autofunction:: theano.sandbox.cuda.fftconv.conv2d_fft
.. autofunction:: theano.sandbox.cuda.fftconv.conv2d_fft
.. autofunction:: theano.sandbox.cuda.blas.GpuCorrMM
theano/sandbox/cuda/blas.py
浏览文件 @
0037c724
...
@@ -501,16 +501,21 @@ gpu_ger_inplace = GpuGer(inplace=True)
...
@@ -501,16 +501,21 @@ gpu_ger_inplace = GpuGer(inplace=True)
class
GpuCorrMM
(
GpuOp
):
class
GpuCorrMM
(
GpuOp
):
"""
"""GPU correlation implementation using Matrix Multiply.
Author: Arjun Jain
Implement the caffe convolution
:note: It don't implement the grad. So you should use it by
enabling the Theano flag ``optimizer_including=conv_gemm`` and
use :func:`conv2d <theano.tensor.nnet.conv.conv2d>`.
"""
"""
def
__init__
(
self
,
border_mode
,
def
__init__
(
self
,
border_mode
,
subsample
=
(
1
,
1
),
subsample
=
(
1
,
1
),
pad
=
0
):
pad
=
0
):
"""
"""
:param border_mode: "valid" or "full"
:param border_mode: "valid" or "full"
:param subsample: not yet supported
:param subsample: the subsample operation applied on each output image.
Should be a tuple with 2 elements.
(sv, sh) is equivalent to GpuCorrMM(...)(...)[:,:,::sv, ::sh]
:param pad: not yet supported
:param pad: not yet supported
"""
"""
self
.
border_mode
=
border_mode
self
.
border_mode
=
border_mode
...
@@ -519,9 +524,6 @@ class GpuCorrMM(GpuOp):
...
@@ -519,9 +524,6 @@ class GpuCorrMM(GpuOp):
if
pad
!=
0
:
if
pad
!=
0
:
raise
NotImplementedError
(
raise
NotImplementedError
(
"GpuCorrMM don't implement the pad parameter"
)
"GpuCorrMM don't implement the pad parameter"
)
if
subsample
!=
(
1
,
1
):
raise
NotImplementedError
(
"GpuCorrMM we don't implement the subsample parameter"
)
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
\
return
type
(
self
)
==
type
(
other
)
\
...
@@ -557,7 +559,6 @@ class GpuCorrMM(GpuOp):
...
@@ -557,7 +559,6 @@ class GpuCorrMM(GpuOp):
return
Apply
(
self
,
[
img
,
kern
],
[
CudaNdarrayType
(
broadcastable
)()])
return
Apply
(
self
,
[
img
,
kern
],
[
CudaNdarrayType
(
broadcastable
)()])
def
flops
(
self
,
inputs
,
outputs
):
def
flops
(
self
,
inputs
,
outputs
):
""" Useful with the hack in profilemode to print the MFlops"""
images
,
kerns
=
inputs
images
,
kerns
=
inputs
out
,
=
outputs
out
,
=
outputs
assert
images
[
1
]
==
kerns
[
1
]
assert
images
[
1
]
==
kerns
[
1
]
...
@@ -611,7 +612,9 @@ class GpuCorrMM(GpuOp):
...
@@ -611,7 +612,9 @@ class GpuCorrMM(GpuOp):
//Optional args
//Optional args
int dx =
%(dx)
s;
int dx =
%(dx)
s;
int dy =
%(dy)
s;
int dy =
%(dy)
s;
int pad = 0;
int padH = 0;
int padW = 0;
CudaNdarray * img =
%(img)
s;
CudaNdarray * img =
%(img)
s;
CudaNdarray * kern =
%(kern)
s;
CudaNdarray * kern =
%(kern)
s;
CudaNdarray * out2 = NULL;
CudaNdarray * out2 = NULL;
...
@@ -630,7 +633,9 @@ class GpuCorrMM(GpuOp):
...
@@ -630,7 +633,9 @@ class GpuCorrMM(GpuOp):
{
{
logical_rows = CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1;
logical_rows = CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1;
logical_cols = CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1;
logical_cols = CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1;
pad = CudaNdarray_HOST_DIMS(kern)[2] - 1;
padH = CudaNdarray_HOST_DIMS(kern)[2] - 1;
padW = CudaNdarray_HOST_DIMS(kern)[3] - 1;
}
}
out_dim[2] = ceil_intdiv(logical_rows, dx);
out_dim[2] = ceil_intdiv(logical_rows, dx);
out_dim[3] = ceil_intdiv(logical_cols, dy);
out_dim[3] = ceil_intdiv(logical_cols, dy);
...
@@ -648,7 +653,7 @@ class GpuCorrMM(GpuOp):
...
@@ -648,7 +653,7 @@ class GpuCorrMM(GpuOp):
}
}
out2 = corrMM(
%(img)
s,
%(kern)
s,
%(out)
s,
pad
);
out2 = corrMM(
%(img)
s,
%(kern)
s,
%(out)
s,
dx, dy, padH, padW
);
if (out2==NULL){
if (out2==NULL){
%(fail)
s
%(fail)
s
}
}
...
...
theano/sandbox/cuda/caffe_common.hpp
浏览文件 @
0037c724
...
@@ -30,12 +30,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
...
@@ -30,12 +30,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <cuda.h>
#include <cuda.h>
#include <driver_types.h> // cuda driver types
#include <driver_types.h> // cuda driver types
// CUDA: grid stride looping
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// CUDA: thread number configuration.
// CUDA: thread number configuration.
// Use 1024 threads per block, which requires cuda sm_2x or above,
// Use 1024 threads per block, which requires cuda sm_2x or above,
// or fall back to attempt compatibility (best of luck to you).
// or fall back to attempt compatibility (best of luck to you).
...
...
theano/sandbox/cuda/conv_gemm.cu
浏览文件 @
0037c724
...
@@ -22,30 +22,44 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
...
@@ -22,30 +22,44 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
*/
// Reference code: https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
#undef _GLIBCXX_ATOMIC_BUILTINS
#undef _GLIBCXX_ATOMIC_BUILTINS
#include <Python.h>
#include <Python.h>
#include "cuda_ndarray.cuh"
#include "cuda_ndarray.cuh"
#include "caffe_common.hpp"
#include "caffe_common.hpp"
// CUDA: grid stride looping
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// Use 1024 threads per block, which requires cuda sm_2x or above
const int CUDA_NUM_THREADS = 1024;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
// Kernel for fast unfold+copy
// Kernel for fast unfold+copy
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
// Reference code: https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
__global__ void im2col_kernel(const int n, const float* data_im,
__global__ void im2col_kernel(const int n, const float* data_im,
const int height, const int width, const int ksize
, const int pad
,
const int height, const int width, const int ksize
_h, const int ksize_w, const int pad_h
,
const int stride
, const int height_col, const int width_col,
const int pad_w, const int stride_h, const int stride_w
, const int height_col, const int width_col,
float* data_col) {
float* data_col) {
CUDA_KERNEL_LOOP(index, n) {
CUDA_KERNEL_LOOP(index, n) {
int w_out = index % width_col;
int w_out = index % width_col;
index /= width_col;
index /= width_col;
int h_out = index % height_col;
int h_out = index % height_col;
int channel_in = index / height_col;
int channel_in = index / height_col;
int channel_out = channel_in * ksize
* ksize
;
int channel_out = channel_in * ksize
_h * ksize_w
;
int h_in = h_out * stride
- pad
;
int h_in = h_out * stride
_h - pad_h
;
int w_in = w_out * stride
- pad
;
int w_in = w_out * stride
_w - pad_w
;
data_col += (channel_out * height_col + h_out) * width_col + w_out;
data_col += (channel_out * height_col + h_out) * width_col + w_out;
data_im += (channel_in * height + h_in) * width + w_in;
data_im += (channel_in * height + h_in) * width + w_in;
for (int i = 0; i < ksize; ++i) {
for (int i = 0; i < ksize
_h
; ++i) {
for (int j = 0; j < ksize; ++j) {
for (int j = 0; j < ksize
_w
; ++j) {
int h = h_in + i;
int h = h_in + i;
int w = w_in + j;
int w = w_in + j;
*data_col = (h >= 0 && w >= 0 && h < height && w < width) ?
*data_col = (h >= 0 && w >= 0 && h < height && w < width) ?
...
@@ -57,18 +71,17 @@ __global__ void im2col_kernel(const int n, const float* data_im,
...
@@ -57,18 +71,17 @@ __global__ void im2col_kernel(const int n, const float* data_im,
}
}
void im2col(const float* data_im, const int channels,
void im2col(const float* data_im, const int channels,
const int height, const int width, const int ksize
, const int pad
,
const int height, const int width, const int ksize
_h, const int ksize_w, const int pad_h
,
const int stride
, float* data_col) {
const int pad_w, const int stride_h, const int stride_w
, float* data_col) {
// We are going to launch channels * height_col * width_col kernels, each
// We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid.
// kernel responsible for copying a single-channel grid.
int height_col = (height + 2 * pad
- ksize) / stride
+ 1;
int height_col = (height + 2 * pad
_h - ksize_h) / stride_h
+ 1;
int width_col = (width + 2 * pad
- ksize) / stride
+ 1;
int width_col = (width + 2 * pad
_w - ksize_w) / stride_w
+ 1;
int num_kernels = channels * height_col * width_col;
int num_kernels = channels * height_col * width_col;
// Launch
// Launch
im2col_kernel <<<
CAFFE_GET_BLOCKS(num_kernels), CAFFE_
CUDA_NUM_THREADS>>> (
im2col_kernel <<<
GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>> (
num_kernels, data_im, height, width, ksize
,
num_kernels, data_im, height, width, ksize_h, ksize_w
,
pad, stride
,
pad_h, pad_w, stride_h, stride_w
,
height_col, width_col, data_col
height_col, width_col, data_col
);
);
}
}
...
@@ -79,7 +92,10 @@ void im2col(const float* data_im, const int channels,
...
@@ -79,7 +92,10 @@ void im2col(const float* data_im, const int channels,
CudaNdarray* corrMM(const CudaNdarray *input,
CudaNdarray* corrMM(const CudaNdarray *input,
CudaNdarray *weight,
CudaNdarray *weight,
CudaNdarray *output,
CudaNdarray *output,
int padding = 0)
int dH = 1,
int dW = 1,
int padH = 0,
int padW = 0)
{
{
cublasStatus_t status;
cublasStatus_t status;
...
@@ -94,30 +110,12 @@ CudaNdarray* corrMM(const CudaNdarray *input,
...
@@ -94,30 +110,12 @@ CudaNdarray* corrMM(const CudaNdarray *input,
PyErr_SetString(PyExc_ValueError, "required weight of 4D");
PyErr_SetString(PyExc_ValueError, "required weight of 4D");
}
}
// TODO: stride(dW, dH) and padding as function parameter
int dH = 1;
int dW = 1;
int kH = CudaNdarray_HOST_DIMS(weight)[2];
int kH = CudaNdarray_HOST_DIMS(weight)[2];
int kW = CudaNdarray_HOST_DIMS(weight)[3];
int kW = CudaNdarray_HOST_DIMS(weight)[3];
int nInputPlane = CudaNdarray_HOST_DIMS(input)[1];
int nInputPlane = CudaNdarray_HOST_DIMS(input)[1];
// filters: (number of filters, nInputPlane, rows, columns)
// filters: (number of filters, nInputPlane, rows, columns)
int nOutputPlane = CudaNdarray_HOST_DIMS(weight)[0];
int nOutputPlane = CudaNdarray_HOST_DIMS(weight)[0];
long batchSize = CudaNdarray_HOST_DIMS(input)[0];
long batchSize = CudaNdarray_HOST_DIMS(input)[0];
if (CudaNdarray_HOST_DIMS(input)[2] != CudaNdarray_HOST_DIMS(input)[3]){
PyErr_Format(PyExc_ValueError,
"GpuCorrMM support only square images. Got %dx%d images\n",
CudaNdarray_HOST_DIMS(input)[2],
CudaNdarray_HOST_DIMS(input)[3]
);
return NULL;
}
if (kW != kH){
PyErr_Format(PyExc_ValueError,
"GpuCorrMM support only square kernel. Got %dx%d kernel\n",
kW, kH
);
return NULL;
}
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(weight)[1]){
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(weight)[1]){
PyErr_SetString(PyExc_ValueError,
PyErr_SetString(PyExc_ValueError,
"GpuCorrMM images and kernel must have the same stack size\n"
"GpuCorrMM images and kernel must have the same stack size\n"
...
@@ -126,18 +124,20 @@ CudaNdarray* corrMM(const CudaNdarray *input,
...
@@ -126,18 +124,20 @@ CudaNdarray* corrMM(const CudaNdarray *input,
}
}
long inputHeight = CudaNdarray_HOST_DIMS(input)[2];
long inputHeight = CudaNdarray_HOST_DIMS(input)[2];
long inputWidth = CudaNdarray_HOST_DIMS(input)[3];
long inputWidth = CudaNdarray_HOST_DIMS(input)[3];
long outputWidth = (inputWidth + 2*pad
ding
- kW) / dW + 1;
long outputWidth = (inputWidth + 2*pad
W
- kW) / dW + 1;
long outputHeight = (inputHeight + 2*pad
ding
- kH) / dH + 1;
long outputHeight = (inputHeight + 2*pad
H
- kH) / dH + 1;
// check output, size (batchSize, nOutputPlane,
// check output, size (batchSize, nOutputPlane,
// outputHeight, outputWidth);
// outputHeight, outputWidth);
if (batchSize != CudaNdarray_HOST_DIMS(output)[0] ||
if (batchSize != CudaNdarray_HOST_DIMS(output)[0] ||
nOutputPlane != CudaNdarray_HOST_DIMS(output)[1] ||
nOutputPlane != CudaNdarray_HOST_DIMS(output)[1] ||
outputHeight != CudaNdarray_HOST_DIMS(output)[2] ||
outputHeight != CudaNdarray_HOST_DIMS(output)[2] ||
outputWidth != CudaNdarray_HOST_DIMS(output)[3]){
outputWidth != CudaNdarray_HOST_DIMS(output)[3]){
PyErr_SetString(PyExc_ValueError,
PyErr_Format(
"GpuCorrMM outputs parameter don't have the good shape\n"
PyExc_ValueError,
);
"GpuCorrMM outputs parameter don't have the good shape %d %d %d %d, %d %d %d %d\n",
batchSize, nOutputPlane, outputHeight, outputWidth,
CudaNdarray_HOST_DIMS(output)[0], CudaNdarray_HOST_DIMS(output)[1],
CudaNdarray_HOST_DIMS(output)[2], CudaNdarray_HOST_DIMS(output)[3]);
return NULL;
return NULL;
}
}
// Create temporary columns
// Create temporary columns
...
@@ -158,7 +158,7 @@ CudaNdarray* corrMM(const CudaNdarray *input,
...
@@ -158,7 +158,7 @@ CudaNdarray* corrMM(const CudaNdarray *input,
// 1. Extract columns:
// 1. Extract columns:
im2col(
im2col(
input->devdata + elt*ip_stride,
input->devdata + elt*ip_stride,
nInputPlane, input
Width, inputHeight, kW, padding
, dW,
nInputPlane, input
Height, inputWidth, kH, kW, padH, padW, dH
, dW,
columns->devdata
columns->devdata
);
);
...
...
theano/sandbox/cuda/opt.py
浏览文件 @
0037c724
...
@@ -1286,13 +1286,12 @@ def local_gpu_downsample_factor_max_grad(node):
...
@@ -1286,13 +1286,12 @@ def local_gpu_downsample_factor_max_grad(node):
@local_optimizer
([
GpuConv
])
@local_optimizer
([
GpuConv
])
def
local_conv_gemm
(
node
):
def
local_conv_gemm
(
node
):
if
(
isinstance
(
node
.
op
,
GpuConv
)
and
if
(
isinstance
(
node
.
op
,
GpuConv
)
and
node
.
op
.
border_mode
in
[
'full'
,
'valid'
]
and
node
.
op
.
border_mode
in
[
'full'
,
'valid'
]):
node
.
op
.
subsample
==
(
1
,
1
)):
img
,
kern
=
node
.
inputs
img
,
kern
=
node
.
inputs
img
=
gpu_contiguous
(
img
)
img
=
gpu_contiguous
(
img
)
kern
=
kern
[:,
:,
::
-
1
,
::
-
1
]
kern
=
kern
[:,
:,
::
-
1
,
::
-
1
]
kern
=
gpu_contiguous
(
kern
)
kern
=
gpu_contiguous
(
kern
)
return
[
GpuCorrMM
(
node
.
op
.
border_mode
)(
img
,
kern
)]
return
[
GpuCorrMM
(
node
.
op
.
border_mode
,
node
.
op
.
subsample
)(
img
,
kern
)]
gpu_optimizer
.
register
(
"conv_gemm"
,
local_conv_gemm
)
gpu_optimizer
.
register
(
"conv_gemm"
,
local_conv_gemm
)
...
...
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
浏览文件 @
0037c724
...
@@ -114,6 +114,7 @@ def py_conv_scipy(img, kern, mode, subsample):
...
@@ -114,6 +114,7 @@ def py_conv_scipy(img, kern, mode, subsample):
for
b
in
xrange
(
out
.
shape
[
0
]):
for
b
in
xrange
(
out
.
shape
[
0
]):
for
k
in
xrange
(
out
.
shape
[
1
]):
for
k
in
xrange
(
out
.
shape
[
1
]):
for
s
in
xrange
(
img
.
shape
[
1
]):
for
s
in
xrange
(
img
.
shape
[
1
]):
#convolve2d or correlate
out
[
b
,
k
,
:,
:]
+=
convolve2d
(
img
[
b
,
s
,
:,
:],
out
[
b
,
k
,
:,
:]
+=
convolve2d
(
img
[
b
,
s
,
:,
:],
kern
[
k
,
s
,
:,
:],
kern
[
k
,
s
,
:,
:],
mode
)
mode
)
...
@@ -261,7 +262,6 @@ def exec_conv(version, shapes, verbose, random, mode,
...
@@ -261,7 +262,6 @@ def exec_conv(version, shapes, verbose, random, mode,
failed_version
=
set
()
failed_version
=
set
()
failed_id
=
[]
failed_id
=
[]
# I put -1 in case we forget to add version in the test to.
for
ver
in
version
:
for
ver
in
version
:
for
id
,
(
ishape
,
kshape
,
subshape
,
for
id
,
(
ishape
,
kshape
,
subshape
,
istride
,
kstride
)
in
enumerate
(
shapes
):
istride
,
kstride
)
in
enumerate
(
shapes
):
...
@@ -615,7 +615,7 @@ def test_valid_9_10():
...
@@ -615,7 +615,7 @@ def test_valid_9_10():
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
)
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
)
def
test_valid
():
def
test_valid
(
conv_gemm
=
False
):
seed_rng
()
seed_rng
()
shapes
=
get_valid_shapes
()
shapes
=
get_valid_shapes
()
...
@@ -624,7 +624,6 @@ def test_valid():
...
@@ -624,7 +624,6 @@ def test_valid():
# I put -2 to test the reference version.
# I put -2 to test the reference version.
version
=
[
-
2
,
-
1
,
6
]
version
=
[
-
2
,
-
1
,
6
]
verbose
=
0
verbose
=
0
# version=[1]
random
=
True
random
=
True
print_
=
False
print_
=
False
...
@@ -632,26 +631,25 @@ def test_valid():
...
@@ -632,26 +631,25 @@ def test_valid():
if
ones
:
if
ones
:
random
=
False
random
=
False
# exec_conv(version, shapes, verbose, random, 'valid',
if
conv_gemm
:
# print_=print_, ones=ones, rtol=1.1e-5)
# Test the GpuCorrMM version
mode
=
theano_mode
.
including
(
"conv_gemm"
)
mode
=
theano_mode
.
including
(
"conv_gemm"
)
cls
=
cuda
.
blas
.
GpuCorrMM
version
=
[
-
1
]
version
=
[
-
1
]
# dummy version; not used by GpuCorrMM so one version is enough
# Remove case not supported
# Add tests with strided inputs by still square images and filters.
# Add tests with strided inputs by still square images and filters.
shapes
+=
get_shapes2
(
scales_img
=
(
2
,
2
),
img_stride
=
(
2
,
2
))
shapes
+=
get_shapes2
(
scales_img
=
(
2
,
2
),
img_stride
=
(
2
,
2
))
shapes
+=
get_shapes2
(
scales_kern
=
(
2
,
2
),
kern_stride
=
(
2
,
2
))
shapes
+=
get_shapes2
(
scales_kern
=
(
2
,
2
),
kern_stride
=
(
2
,
2
))
# Keep only tests with square images and filters even with inputs strides
else
:
shapes
=
[
shp
for
shp
in
shapes
if
(
mode
=
cls
=
None
shp
[
0
][
2
]
/
shp
[
3
][
0
]
==
shp
[
0
][
3
]
/
shp
[
3
][
1
]
and
shp
[
1
][
2
]
/
shp
[
4
][
0
]
==
shp
[
1
][
3
]
/
shp
[
4
][
1
])]
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
,
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
,
theano_mode
=
mode
,
cls
=
cuda
.
blas
.
GpuCorrMM
)
theano_mode
=
mode
,
cls
=
cls
)
def
test_gemm_valid
():
test_valid
(
conv_gemm
=
True
)
def
test_full
():
def
test_full
(
conv_gemm
=
False
):
seed_rng
()
seed_rng
()
shapes
=
get_basic_shapes
()
shapes
=
get_basic_shapes
()
shapes
+=
get_shapes2
()
shapes
+=
get_shapes2
()
...
@@ -708,24 +706,24 @@ def test_full():
...
@@ -708,24 +706,24 @@ def test_full():
# shapes=shapes[:277]
# shapes=shapes[:277]
version
=
[
-
2
,
-
1
,
0
,
1
,
2
,
3
,
4
,
5
]
version
=
[
-
2
,
-
1
,
0
,
1
,
2
,
3
,
4
,
5
]
verbose
=
0
verbose
=
0
# version=[4]
random
=
True
random
=
True
# exec_conv(version, shapes, verbose, random, 'full')
if
conv_gemm
:
# Test the GpuCorrMM version
# Test the GpuCorrMM version
mode
=
theano_mode
.
including
(
"conv_gemm"
)
mode
=
theano_mode
.
including
(
"conv_gemm"
)
cls
=
cuda
.
blas
.
GpuCorrMM
shapes
=
[
shp
for
shp
in
shapes
if
shp
[
1
][
2
]
==
shp
[
1
][
3
]]
version
=
[
-
1
]
# dummy version; not used by GpuCorrMM so one version is enough
shapes
=
[
shp
for
shp
in
shapes
if
shp
[
0
][
2
]
==
shp
[
0
][
3
]]
else
:
shapes
=
shapes
[
0
:
10
]
mode
=
cls
=
None
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'full'
,
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'full'
,
theano_mode
=
mode
,
cls
=
c
uda
.
blas
.
GpuCorrMM
)
theano_mode
=
mode
,
cls
=
c
ls
)
def
test_gemm_full
():
test_full
(
conv_gemm
=
True
)
def
test_subsample
():
def
test_subsample
(
conv_gemm
=
False
):
seed_rng
()
seed_rng
()
# implement when
shapes
=
[((
1
,
1
,
1
,
1
),
(
1
,
1
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
shapes
=
[((
1
,
1
,
1
,
1
),
(
1
,
1
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
1
,
1
,
1
,
1
),
(
1
,
1
,
1
,
1
),
(
2
,
2
),
(
1
,
1
),
(
1
,
1
)),
((
1
,
1
,
1
,
1
),
(
1
,
1
,
1
,
1
),
(
2
,
2
),
(
1
,
1
),
(
1
,
1
)),
((
4
,
2
,
10
,
10
),
(
3
,
2
,
2
,
2
),
(
1
,
3
),
(
1
,
1
),
(
1
,
1
)),
((
4
,
2
,
10
,
10
),
(
3
,
2
,
2
,
2
),
(
1
,
3
),
(
1
,
1
),
(
1
,
1
)),
...
@@ -747,10 +745,23 @@ def test_subsample():
...
@@ -747,10 +745,23 @@ def test_subsample():
if
ones
:
if
ones
:
random
=
False
random
=
False
if
conv_gemm
:
# Test the GpuCorrMM version
mode
=
theano_mode
.
including
(
"conv_gemm"
)
cls
=
cuda
.
blas
.
GpuCorrMM
version_valid
=
version_full
=
[
-
1
]
# dummy version; not used by GpuCorrMM so one version is enough
else
:
mode
=
cls
=
None
exec_conv
(
version_valid
,
shapes
,
verbose
,
random
,
'valid'
,
exec_conv
(
version_valid
,
shapes
,
verbose
,
random
,
'valid'
,
print_
=
print_
,
ones
=
ones
)
print_
=
print_
,
ones
=
ones
,
theano_mode
=
mode
,
cls
=
cls
)
exec_conv
(
version_full
,
shapes
,
verbose
,
random
,
'full'
,
exec_conv
(
version_full
,
shapes
,
verbose
,
random
,
'full'
,
print_
=
print_
,
ones
=
ones
)
print_
=
print_
,
ones
=
ones
,
theano_mode
=
mode
,
cls
=
cls
)
def
test_gemm_subsample
():
test_subsample
(
conv_gemm
=
True
)
class
TestConv2DGPU
(
unittest
.
TestCase
):
class
TestConv2DGPU
(
unittest
.
TestCase
):
...
@@ -825,23 +836,28 @@ class TestConv2DGPU(unittest.TestCase):
...
@@ -825,23 +836,28 @@ class TestConv2DGPU(unittest.TestCase):
def
test_gemm
():
def
test_gemm
_directly
():
"""
"""
input: (batch size, channels, rows, columns)
input: (batch size, channels, rows, columns)
filters: (number of filters, channels, rows, columns)
filters: (number of filters, channels, rows, columns)
"""
"""
for
mode
in
[
'
valid'
,
'full
'
]:
for
mode
in
[
'
full'
,
'valid
'
]:
print
'Testing mode: '
+
mode
print
'Testing mode: '
+
mode
for
bs
in
range
(
1
,
5
):
for
bs
in
range
(
1
,
5
):
for
ch
in
range
(
1
,
4
):
for
ch
in
range
(
1
,
4
):
for
nf
in
range
(
1
,
4
):
for
nf
in
range
(
1
,
4
):
for
rImg
in
range
(
5
,
9
):
for
rImg1
in
range
(
5
,
9
):
for
rFlt
in
range
(
2
,
4
):
for
rImg2
in
range
(
5
,
9
):
ishape
=
(
bs
,
ch
,
rImg
,
rImg
)
for
rFlt1
in
range
(
2
,
4
):
kshape
=
(
nf
,
ch
,
rFlt
,
rFlt
)
for
rFlt2
in
range
(
2
,
4
):
for
subsx
in
range
(
1
,
3
):
for
subsy
in
range
(
1
,
3
):
ishape
=
(
bs
,
ch
,
rImg1
,
rImg2
)
kshape
=
(
nf
,
ch
,
rFlt1
,
rFlt2
)
print
"ishape: "
,
ishape
print
"ishape: "
,
ishape
print
"kshape: "
,
kshape
print
"kshape: "
,
kshape
subsample
=
(
1
,
1
)
subsample
=
(
subsx
,
subsy
)
print
"subsample: "
,
subsample
npy_img
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
ishape
),
dtype
=
'float32'
)
npy_img
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
ishape
),
dtype
=
'float32'
)
npy_kern
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
kshape
),
dtype
=
'float32'
)
npy_kern
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
kshape
),
dtype
=
'float32'
)
...
@@ -849,24 +865,16 @@ def test_gemm():
...
@@ -849,24 +865,16 @@ def test_gemm():
i
=
cuda_tensor4
()
i
=
cuda_tensor4
()
k
=
cuda_tensor4
()
k
=
cuda_tensor4
()
t2
=
None
t0
=
time
.
time
()
cpuval
=
py_conv
(
npy_img
,
npy_kern
,
mode
,
subsample
)
cpuval
=
py_conv
(
npy_img
,
npy_kern
,
mode
,
subsample
)
t1
=
time
.
time
()
op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuCorrMM
(
border_mode
=
mode
,
\
subsample
=
subsample
)(
i
,
k
)
op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuCorrMM
(
border_mode
=
mode
)(
i
,
k
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
for
k
in
range
(
npy_kern
.
shape
[
0
]):
npy_kern
=
npy_kern
[:,:,::
-
1
,::
-
1
]
for
s
in
range
(
npy_kern
.
shape
[
1
]):
npy_kern
[
k
,
s
,:,:]
=
numpy
.
rot90
(
npy_kern
[
k
,
s
,:,:],
2
)
gpuval
=
f
(
npy_img
,
npy_kern
)
gpuval
=
f
(
npy_img
,
npy_kern
)
t2
=
time
.
time
()
gpuval
=
numpy
.
asarray
(
gpuval
)
gpuval
=
numpy
.
asarray
(
gpuval
)
rval
=
numpy
.
allclose
(
cpuval
,
gpuval
,
rtol
=
1e-4
)
rval
=
numpy
.
allclose
(
cpuval
,
gpuval
,
rtol
=
1e-4
)
assert
(
rval
==
True
)
assert
(
rval
==
True
)
...
...
theano/tensor/nnet/Conv3D.py
浏览文件 @
0037c724
...
@@ -40,7 +40,9 @@ from theano.gradient import grad_undefined
...
@@ -40,7 +40,9 @@ from theano.gradient import grad_undefined
#the output function is only defined when dr, dc, dt are natural numbers.
#the output function is only defined when dr, dc, dt are natural numbers.
class
Conv3D
(
theano
.
Op
):
class
Conv3D
(
theano
.
Op
):
""" 3D "convolution" of multiple filters on a minibatch (does not flip the kernel, moves kernel with a user specified stride) """
""" 3D `convolution` of multiple filters on a minibatch
:note: does not flip the kernel, moves kernel with a user specified stride
"""
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
return
type
(
self
)
==
type
(
other
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论