Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
b998dc61
提交
b998dc61
authored
8月 21, 2017
作者:
Frédéric Bastien
提交者:
GitHub
8月 21, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #6302 from borisfom/tensor_op
Tensor op, cache
上级
c470bd38
e51b6a22
隐藏空白字符变更
内嵌
并排
正在显示
8 个修改的文件
包含
1103 行增加
和
541 行删除
+1103
-541
conv_desc.c
theano/gpuarray/c_code/conv_desc.c
+15
-0
cudnn_helper.h
theano/gpuarray/c_code/cudnn_helper.h
+9
-1
dnn_conv_base.c
theano/gpuarray/c_code/dnn_conv_base.c
+255
-1
dnn_fwd.c
theano/gpuarray/c_code/dnn_fwd.c
+225
-168
dnn_gi.c
theano/gpuarray/c_code/dnn_gi.c
+225
-199
dnn_gw.c
theano/gpuarray/c_code/dnn_gw.c
+169
-140
dnn.py
theano/gpuarray/dnn.py
+38
-26
test_dnn.py
theano/gpuarray/tests/test_dnn.py
+167
-6
没有找到文件。
theano/gpuarray/c_code/conv_desc.c
浏览文件 @
b998dc61
#section support_code_apply
/* Record the requested group count on a cuDNN convolution descriptor.
 * Grouped convolutions are only supported by the descriptor from cuDNN 7
 * onward; with older cuDNN this is a no-op.
 * Returns 0 on success, -1 on failure (with a Python exception set). */
static int c_set_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
  cudnnStatus_t status = cudnnSetConvolutionGroupCount(desc, groups);
  if (CUDNN_STATUS_SUCCESS != status) {
    PyErr_Format(PyExc_RuntimeError,
                 "error setting groups for convolution : %s",
                 cudnnGetErrorString(status));
    return -1;
  }
#endif
  return 0;
}
int
APPLY_SPECIFIC
(
conv_desc
)(
PyArrayObject
*
filt_shp
,
cudnnConvolutionDescriptor_t
*
desc
,
PARAMS_TYPE
*
params
)
{
...
...
@@ -43,5 +56,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
"descriptor: %s"
,
cudnnGetErrorString
(
err
));
return
-
1
;
}
if
(
c_set_groups_for_conv
(
*
desc
,
params
->
num_groups
)
==
-
1
)
return
-
1
;
return
0
;
}
theano/gpuarray/c_code/cudnn_helper.h
浏览文件 @
b998dc61
...
...
@@ -11,6 +11,14 @@ static inline int cudnnGetVersion() {
}
#endif
#if CUDNN_MAJOR < 7
enum
cudnnMathType_t
{
CUDNN_DEFAULT_MATH
=
0
,
CUDNN_TENSOR_OP_MATH
=
1
};
#endif
/* a common struct for all 3 CUDNN enums */
struct
AlgoRec
{
int
algo
;
size_t
wsSize
;
cudnnMathType_t
mathType
;
};
#endif
theano/gpuarray/c_code/dnn_conv_base.c
浏览文件 @
b998dc61
...
...
@@ -3,6 +3,43 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t
APPLY_SPECIFIC
(
output
);
cudnnFilterDescriptor_t
APPLY_SPECIFIC
(
kerns
);
/** Validate the group count against a cuDNN convolution descriptor.
 *
 * With cuDNN >= 7 the descriptor itself carries the group count: when
 * groups > 1 we verify the descriptor agrees with `groups` and return 1,
 * meaning the caller must NOT split tensors per group itself.
 * With cuDNN < 7 there is no descriptor support, so `groups` is returned
 * unchanged and the caller performs the per-group splitting.
 *
 * Returns -1 on error with a Python exception set.
 */
static int c_get_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
  if (groups > 1) {
    int desc_groups;
    cudnnStatus_t err = cudnnGetConvolutionGroupCount(desc, &desc_groups);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting groups for convolution : %s",
                   cudnnGetErrorString(err));
      return -1;
    }
    if (groups != desc_groups) {
      /* Fix: a group-count mismatch is a bad value, not an out-of-memory
       * condition, so raise ValueError instead of MemoryError. */
      PyErr_SetString(PyExc_ValueError,
                      "groups specified different from convolution descriptor");
      return -1;
    }
  }
  return 1;
#else
  return groups;
#endif
}
/* Set the math type (e.g. tensor-op math) on a convolution descriptor.
 * Math types exist only from cuDNN 7; earlier versions make this a no-op.
 * Returns 0 on success, -1 on failure (with a Python exception set). */
static int c_set_math_type_for_conv(cudnnConvolutionDescriptor_t desc,
                                    cudnnMathType_t mathtype) {
#if CUDNN_MAJOR >= 7
  // CUDNN7: need to set math type
  cudnnStatus_t status = cudnnSetConvolutionMathType(desc, mathtype);
  if (CUDNN_STATUS_SUCCESS != status) {
    PyErr_Format(PyExc_RuntimeError,
                 "error setting math type for convolution : %s",
                 cudnnGetErrorString(status));
    return -1;
  }
#endif
  return 0;
}
#section init_code_struct
cudnnStatus_t
APPLY_SPECIFIC
(
err
);
...
...
@@ -20,7 +57,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
FAIL
;
}
if
((
APPLY_SPECIFIC
(
err
)
=
cudnnCreateFilterDescriptor
(
&
APPLY_SPECIFIC
(
kerns
)))
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_MemoryError
,
"could not allocate filter descriptor: %s"
,
PyErr_Format
(
PyExc_MemoryError
,
"could not allocate filter descriptor: %s"
,
cudnnGetErrorString
(
APPLY_SPECIFIC
(
err
)));
FAIL
;
}
...
...
@@ -33,3 +70,220 @@ if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor
(
APPLY_SPECIFIC
(
output
));
if
(
APPLY_SPECIFIC
(
kerns
)
!=
NULL
)
cudnnDestroyFilterDescriptor
(
APPLY_SPECIFIC
(
kerns
));
#section support_code
#include <sstream>
#include <string>
#if __cplusplus < 201103L
#include <tr1/unordered_map>
typedef
std
::
tr1
::
unordered_map
<
std
::
string
,
AlgoRec
>
AlgoCache
;
#else
#include <unordered_map>
typedef
std
::
unordered_map
<
std
::
string
,
AlgoRec
>
AlgoCache
;
#endif
#include "pthread.h"
#line 87 "dnn_conv_base.c"
/* Process-wide cache of chosen convolution algorithms, shared by all conv
 * ops; guarded by algoMutex.
 * Fix: a statically allocated pthread mutex must be initialized with
 * PTHREAD_MUTEX_INITIALIZER (POSIX does not guarantee zero-initialization
 * is a valid mutex). */
pthread_mutex_t algoMutex = PTHREAD_MUTEX_INITIALIZER;
AlgoCache algoCache;
/* If `err` is not CUDNN_STATUS_SUCCESS, set a Python RuntimeError built
 * from `msg` and the cuDNN error string. Always returns `err` unchanged
 * so the call can be used inline. */
static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char *msg) {
  if (CUDNN_STATUS_SUCCESS != err)
    PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s: %s",
                 msg, cudnnGetErrorString(err));
  return err;
}
/* Query the largest free GPU memory block for context `c`.
 * On query failure a Python error is set (callers are expected to test
 * PyErr_Occurred()); a 4Mb guess is still returned when no size is known. */
static size_t c_get_largest_free_block_size(PyGpuContextObject *c) {
  size_t maxfree = 0;
  int rc = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &maxfree);
  if (GA_NO_ERROR != rc) {
    PyErr_Format(PyExc_RuntimeError,
                 "Error when trying to find the "
                 "memory information on the GPU");
  }
  // Guess 4Mb if the info is not available
  if (maxfree == 0)
    maxfree = 4 * 1024 * 1024;
  return maxfree;
}
/** Check if convolution output tensor has expected dimensions
    depending on given inputs and number of groups.
    return 0 if everything is ok, non-0 on error.
**/
static int dnn_check_convolution_output(cudnnConvolutionDescriptor_t convDesc,
                                        cudnnTensorDescriptor_t inputDesc,
                                        cudnnFilterDescriptor_t filterDesc,
                                        size_t tensorNdim,
                                        PyGpuArrayObject *output,
                                        int groups) {
  int expected[5] = {0};
  cudnnStatus_t status = cudnnGetConvolutionNdForwardOutputDim(
      convDesc, inputDesc, filterDesc, tensorNdim, expected);
  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "error computing convolution output dim: %s",
                 cudnnGetErrorString(status));
    return 1;
  }
  if (tensorNdim == 4) {
    /* cuDNN reports per-group channel counts, so the channel axis of the
       actual output is divided by `groups` before comparing. */
    bool mismatch =
        (PyGpuArray_DIMS(output)[0] != expected[0]) ||
        (PyGpuArray_DIMS(output)[1] / groups != expected[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected[3]);
    if (mismatch) {
      PyErr_Format(PyExc_ValueError,
                   "impossible convolution output dim: expected %dx%dx%dx%d"
                   " but received %ldx%ldx%ldx%ld",
                   expected[0], expected[1] * groups,
                   expected[2], expected[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      return 1;
    }
  } else if (tensorNdim == 5) {
    bool mismatch =
        (PyGpuArray_DIMS(output)[0] != expected[0]) ||
        (PyGpuArray_DIMS(output)[1] / groups != expected[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected[4]);
    if (mismatch) {
      PyErr_Format(PyExc_ValueError,
                   "impossible convolution output dim: expected %dx%dx%dx%dx%d"
                   " but received %ldx%ldx%ldx%ldx%ld",
                   expected[0], expected[1] * groups,
                   expected[2], expected[3], expected[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      return 1;
    }
  }
  return 0;
}
/* Render `size` ints from `res` as a comma-separated string, e.g. "1,2,3".
 * An empty string is returned when size <= 0. */
static std::string shape(int *res, int size) {
  std::ostringstream out;
  for (int i = 0; i < size; ++i) {
    if (i > 0)
      out << ',';
    out << res[i];
  }
  return out.str();
}
/* Render a tensor descriptor as "<dims>,<strides>" (both comma-separated).
 * Returns "" if the cuDNN query failed (Python error already set by
 * checkCudnnStatus). */
static std::string shape(cudnnTensorDescriptor_t t) {
  // cuDNN can handle up to CUDNN_DIM_MAX dimensions.
  int dims[CUDNN_DIM_MAX];
  int strides[CUDNN_DIM_MAX];
  int nbDims;
  cudnnDataType_t dtype;
  checkCudnnStatus(cudnnGetTensorNdDescriptor(t, CUDNN_DIM_MAX, &dtype,
                                              &nbDims, dims, strides),
                   "error getting tensor description");
  if (PyErr_Occurred())
    return "";
  return shape(dims, nbDims) + "," + shape(strides, nbDims);
}
/* Render a filter descriptor's dimensions as a comma-separated string and
 * report its data type through `type`.
 * Returns "" if the cuDNN query failed (Python error already set). */
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t *type) {
  cudnnTensorFormat_t fmt;
  int dims[CUDNN_DIM_MAX];
  int nbDims;
  checkCudnnStatus(cudnnGetFilterNdDescriptor(t, CUDNN_DIM_MAX, type,
                                              &fmt, &nbDims, dims),
                   "error getting filter description");
  if (PyErr_Occurred())
    return "";
  return shape(dims, nbDims);
}
/* Render a convolution descriptor as a textual key fragment:
 * "-mode conv|cross -pad ... -subsample ... -dilation ...".
 * Returns "" if the cuDNN query failed (Python error already set). */
static std::string shape(cudnnConvolutionDescriptor_t convDesc) {
  int nDim;
  cudnnConvolutionMode_t mode;
  cudnnDataType_t computeType;
  int padA[5];
  int strideA[5];
  int dilationA[5];
  checkCudnnStatus(cudnnGetConvolutionNdDescriptor(convDesc, 5, &nDim,
                                                   &padA[0], &strideA[0],
                                                   &dilationA[0], &mode,
                                                   &computeType),
                   "error getting convolution description");
  if (PyErr_Occurred())
    return "";
  std::string out("-mode ");
  out += (mode == CUDNN_CONVOLUTION) ? "conv" : "cross";
  out += " -pad ";
  out += shape(padA, nDim);
  out += " -subsample ";
  out += shape(strideA, nDim);
  out += " -dilation ";
  out += shape(dilationA, nDim);
  return out;
}
/* True when all three device pointers satisfy the alignment mask used for
 * cache keying (mask depends on the data type).
 * there have to be entries for both aligned and not. */
static bool all_aligned(cudnnDataType_t type, void *in, void *out, void *filter) {
  const size_t alignMask = (type == CUDNN_DATA_HALF) ? 0x7F : 0xFF;
  const size_t combined = (size_t)in | (size_t)out | (size_t)filter;
  return (combined & alignMask) == 0;
}
/** Build the textual cache key describing one convolution configuration:
 *  group count, input dims/strides, filter dims and convolution settings,
 *  plus an "[unaligned]" tag when any device pointer misses the alignment
 *  mask (aligned and unaligned configs must not share cache entries).
 *
 *  Also validates the output tensor dimensions via
 *  dnn_check_convolution_output.
 *  Returns "" on any failure (Python error set by the helpers).
 *
 *  Fix: removed the unused local `expected_output_dims` (the output-dim
 *  query already happens inside dnn_check_convolution_output).
 */
static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc,
                                  PyGpuArrayObject *input,
                                  cudnnFilterDescriptor_t filterDesc,
                                  PyGpuArrayObject *filter,
                                  cudnnConvolutionDescriptor_t convDesc,
                                  PyGpuArrayObject *output,
                                  int groups) {
  cudnnDataType_t dType;
  std::ostringstream s;

  if (dnn_check_convolution_output(convDesc, inputDesc, filterDesc,
                                   PyGpuArray_NDIM(filter), output,
                                   groups) != 0)
    return "";

  std::string shapeInput = shape(inputDesc);
  std::string shapeFilter = shape(filterDesc, &dType);
  std::string shapeConvDesc = shape(convDesc);
  if (shapeInput.empty() || shapeFilter.empty() || shapeConvDesc.empty())
    return "";

  s << "-g " << groups
    << " -dim " << shapeInput
    << " -filt " << shapeFilter
    << " " << shapeConvDesc;

  // there have to be entries for both aligned and not.
  if (!all_aligned(dType, PyGpuArray_DEV_DATA(input),
                   PyGpuArray_DEV_DATA(output),
                   PyGpuArray_DEV_DATA(filter))) {
    s << " [unaligned]";
  }
  return s.str();
}
/* Insert or overwrite the algorithm record for `key` in the global
 * algorithm cache, under the cache mutex. */
static void dnn_conv_update_cache(const std::string &key, const AlgoRec &entry) {
  pthread_mutex_lock(&algoMutex);
  algoCache[key] = entry;
  pthread_mutex_unlock(&algoMutex);
}
/* Look up `key` in the global algorithm cache under the cache mutex.
 * Returns a pointer to the cached record, or NULL (0) when absent.
 * NOTE(review): the pointer refers into the cache; entries are only ever
 * added/overwritten here, never erased, in the visible code. */
static const AlgoRec *dnn_conv_check_cache(const std::string &key) {
  const AlgoRec *found = 0;
  pthread_mutex_lock(&algoMutex);
  AlgoCache::iterator it = algoCache.find(key);
  if (it != algoCache.end())
    found = &it->second;
  pthread_mutex_unlock(&algoMutex);
  return found;
}
theano/gpuarray/c_code/dnn_fwd.c
浏览文件 @
b998dc61
#section init_code_struct
prev_algo
.
algo
=
PARAMS
->
conv_algo
;
prev_algo
.
mathType
=
CUDNN_DEFAULT_MATH
;
reuse_algo
=
0
;
prev_algo
=
PARAMS
->
conv_algo
;
memset
(
prev_img_dims
,
0
,
sizeof
(
prev_img_dims
));
memset
(
prev_kern_dims
,
0
,
sizeof
(
prev_kern_dims
));
hash_prefix
=
std
::
string
(
"FWD|GPU#"
);
#section support_code_struct
#line 9 "dnn_fwd.c"
int
reuse_algo
;
AlgoRec
prev_algo
;
std
::
string
hash_prefix
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
/** Check given algorithm against inputs and convolution descriptor,
    change algorithm inplace to a fallback algorithm if checkings fail.
    Return 0 on success, non-0 on error (Python exception set).

    Fix: removed four dead locals (`reuse_algo`, `prev_algo`,
    `prev_img_dims`, `prev_kern_dims`) that shadowed the struct-level state
    of the same names and were never used here. **/
int dnn_conv_fwd_fallback(cudnnConvolutionFwdAlgo_t *_algo,
                          const PyGpuArrayObject *input,
                          const PyGpuArrayObject *kerns,
                          cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionFwdAlgo_t algo = *_algo;

  /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
  if (PyGpuArray_NDIM(input) == 5 &&
      !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)) {
#ifdef DEBUG
    if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
      return 1;
    fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n",
            algorithm_name);
#endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }

  // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
  // Issue should be resolved for cuDNN > V6.0.
  // NB: In cuDNN V7, issue is resolved for 2D convolutions only.
  if ((cudnnGetVersion() < 6100 || PyGpuArray_NDIM(input) == 5) &&
      algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
      PyGpuArray_DIM(input, 0) > 65536) {
#ifdef DEBUG
    fprintf(stderr,
            "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
            "will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n");
#endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  /* NB:
     TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
     New conditions apply and may depend on number of dimensions (2D or 3D)
     e.g. for FFT_TILING.
     TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
  */
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) &&
      PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                                        dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
        fprintf(stderr, "(replacing fwd algo fft with none)\n");
#endif
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
        fprintf(stderr, "(replacing fwd algo fft_tiling with none)\n");
#endif
      }
    }
  }
  *_algo = algo;
  return 0;
}
int
APPLY_SPECIFIC
(
conv_fwd
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
kerns
,
...
...
@@ -24,6 +117,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
void
*
beta_p
;
float
af
=
alpha
,
bf
=
beta
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
bool
use_cached
=
0
;
if
(
PyGpuArray_DIMS
(
input
)[
1
]
!=
PyGpuArray_DIMS
(
kerns
)[
1
]
*
params
->
num_groups
)
{
PyErr_SetString
(
PyExc_ValueError
,
...
...
@@ -73,65 +167,76 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return
0
;
}
if
(
c_set_tensor_for_conv
(
input
,
APPLY_SPECIFIC
(
input
),
params
->
num_groups
)
==
-
1
)
int
groups
=
c_get_groups_for_conv
(
desc
,
params
->
num_groups
);
if
(
groups
==
-
1
)
return
1
;
if
(
c_set_tensor_for_conv
(
input
,
APPLY_SPECIFIC
(
input
),
groups
)
==
-
1
)
return
1
;
if
(
c_set_filter
(
kerns
,
APPLY_SPECIFIC
(
kerns
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_filter
(
kerns
,
APPLY_SPECIFIC
(
kerns
),
groups
)
==
-
1
)
return
1
;
if
(
c_set_tensor_for_conv
(
*
output
,
APPLY_SPECIFIC
(
output
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_tensor_for_conv
(
*
output
,
APPLY_SPECIFIC
(
output
),
groups
)
==
-
1
)
return
1
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
input
,
0
)
/
params
->
num_
groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
kerns
,
0
)
*
PyGpuArray_DIM
(
kerns
,
0
)
/
params
->
num_
groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
*
output
,
0
)
/
params
->
num_
groups
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
input
,
0
)
/
groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
kerns
,
0
)
*
PyGpuArray_DIM
(
kerns
,
0
)
/
groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
*
output
,
0
)
/
groups
;
cudnnConvolutionFwdAlgo_t
algo
=
params
->
conv_algo
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
size_t
worksize
=
0
;
cudnnMathType_t
mathtype
=
CUDNN_DEFAULT_MATH
;
std
::
string
hashkey
;
cuda_enter
(
c
->
ctx
);
size_t
maxfree
=
c_get_largest_free_block_size
(
c
);
if
(
PyErr_Occurred
())
return
1
;
if
(
params
->
choose_algo
)
{
if
(
!
params
->
choose_once
)
{
reuse_algo
=
1
;
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
input
,
i
)
==
prev_img_dims
[
i
]);
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
kerns
,
i
)
==
prev_kern_dims
[
i
]);
}
}
if
(
!
reuse_algo
)
{
size_t
free
;
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
);
if
(
err2
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
"memory information on the GPU"
);
char
pci_id
[
16
];
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_PCIBUSID
,
pci_id
);
// check out cache
hashkey
=
dnn_conv_shape
(
APPLY_SPECIFIC
(
input
),
input
,
APPLY_SPECIFIC
(
kerns
),
kerns
,
desc
,
*
output
,
groups
);
if
(
hashkey
.
empty
())
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
hashkey
=
hash_prefix
+
pci_id
+
(
params
->
choose_time
?
" -t "
:
" "
)
+
hashkey
;
const
AlgoRec
*
cached
=
dnn_conv_check_cache
(
hashkey
);
if
(
cached
)
{
prev_algo
=
*
cached
;
use_cached
=
1
;
}
}
// Guess 4Mb if the info is not available
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
if
(
reuse_algo
||
use_cached
)
{
algo
=
(
cudnnConvolutionFwdAlgo_t
)
prev_algo
.
algo
;
worksize
=
prev_algo
.
wsSize
;
mathtype
=
prev_algo
.
mathType
;
}
else
{
if
(
params
->
choose_time
)
{
int
count
;
cudnnConvolutionFwdAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
max
free
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate GPU memory for FindEx"
);
cuda_exit
(
c
->
ctx
);
return
-
1
;
}
// set the 'tensor math ok' flag
c_set_math_type_for_conv
(
desc
,
CUDNN_TENSOR_OP_MATH
);
// We don't sync the buffer as we don't care about the values.
err
=
cudnnFindConvolutionForwardAlgorithmEx
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
desc
,
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
*
output
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
max
free
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
...
...
@@ -141,138 +246,56 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit
(
c
->
ctx
);
return
1
;
}
algo
=
choice
.
algo
;
#ifdef DEBUG
if
(
count
==
0
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv fwd algorithm found"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting best-timed FWD algo: %s"
,
cudnnGetErrorString
(
choice
.
status
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
// Else, count is necessarly 1 for current implementation.
#endif
algo
=
choice
.
algo
;
prev_algo
.
algo
=
(
int
)
algo
;
prev_algo
.
wsSize
=
worksize
=
choice
.
memory
;
#if CUDNN_MAJOR >= 7
prev_algo
.
mathType
=
mathtype
=
choice
.
mathType
;
#endif
}
else
{
err
=
cudnnGetConvolutionForwardAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
desc
,
APPLY_SPECIFIC
(
output
),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
prev_algo
=
algo
;
}
else
{
algo
=
prev_algo
;
}
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
return
1
;
// NB: This is printed only when algorithm is chosen at runtime.
if
(
reuse_algo
)
fprintf
(
stderr
,
"(reused %s)
\n
"
,
algorithm_name
);
else
fprintf
(
stderr
,
"(using %s)
\n
"
,
algorithm_name
);
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
else
{
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
prev_img_dims
[
i
]
=
PyGpuArray_DIM
(
input
,
i
);
prev_kern_dims
[
i
]
=
PyGpuArray_DIM
(
kerns
,
i
);
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
maxfree
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
prev_algo
.
algo
=
algo
;
// no tensor_op returned from Get()
prev_algo
.
mathType
=
mathtype
=
CUDNN_DEFAULT_MATH
;
}
}
}
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
if
(
PyGpuArray_NDIM
(
input
)
==
5
&&
!
(
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
||
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
||
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
))
{
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
return
1
;
fprintf
(
stderr
,
"(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
,
algorithm_name
);
#endif
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
// Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
// Issue should be resolved for cuDNN > V6.0.
// NB: In cuDNN V7, issue is resolved for 2D convolutionss only.
if
((
cudnnGetVersion
()
<
6100
||
PyGpuArray_NDIM
(
input
)
==
5
)
&&
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
&&
PyGpuArray_DIM
(
input
,
0
)
>
65536
)
{
#ifdef DEBUG
fprintf
(
stderr
,
"(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
"will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
);
#endif
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
/* NB:
TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
New conditions apply and may depend on number of dimensions (2D or 3D)
e.g. for FFT_TILING.
TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
*/
if
((
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT
||
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
)
&&
PyGpuArray_NDIM
(
input
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
dilation
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
dilation
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting convolution properties: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
if
(
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT
)
{
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
}
else
{
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
)
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
}
if
(
c_set_math_type_for_conv
(
desc
,
mathtype
)
==
-
1
||
dnn_conv_fwd_fallback
(
&
algo
,
input
,
kerns
,
desc
)
!=
0
)
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
// if FindEx was used (choose_time), workspace size is set.
if
(
!
(
reuse_algo
||
use_cached
||
params
->
choose_time
))
{
size_t
worksize
;
gpudata
*
workspace
;
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
...
...
@@ -280,19 +303,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
APPLY_SPECIFIC
(
output
),
algo
,
&
worksize
);
if
(
err
==
CUDNN_STATUS_NOT_SUPPORTED
)
{
// Fallback to none algo if not supported
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
fprintf
(
stderr
,
"(%s error getting worksize: "
"fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
,
algorithm_name
);
}
fprintf
(
stderr
,
"(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
,
algorithm_name
);
#endif
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
...
...
@@ -303,13 +324,47 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
}
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
if
(
params
->
choose_algo
&&
(
!
params
->
choose_once
||
!
reuse_algo
))
{
// algo may have changed due to fallback, we must update it.
prev_algo
.
algo
=
algo
;
// save worksize for next time/cache
prev_algo
.
wsSize
=
worksize
;
// Add to the cache if we choose on shape change, or first time if we choose once.
dnn_conv_update_cache
(
hashkey
,
prev_algo
);
}
#ifdef DEBUG
if
(
params
->
choose_algo
)
{
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
fprintf
(
stderr
,
"(using %s%s %s%s%s, ws:%ld, hash:%s)
\n
"
,
algorithm_name
,
mathtype
==
CUDNN_TENSOR_OP_MATH
?
"[T]"
:
""
,
params
->
choose_time
?
"(timed)"
:
""
,
reuse_algo
?
"(reused)"
:
""
,
use_cached
?
"(cache)"
:
""
,
worksize
,
hashkey
.
c_str
()
);
}
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
{
gpudata
*
workspace
=
0
;
/*
* This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module
...
...
@@ -318,8 +373,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if
(
worksize
!=
0
)
{
workspace
=
gpudata_alloc
(
c
->
ctx
,
worksize
,
NULL
,
0
,
NULL
);
if
(
workspace
==
NULL
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"Could not allocate working memory"
);
PyErr_SetString
(
PyExc_RuntimeError
,
"Could not allocate working memory"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
...
...
@@ -329,16 +383,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait
(
kerns
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_wait
((
*
output
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
for
(
int
g
=
0
;
g
<
params
->
num_
groups
;
g
++
)
{
err
=
cudnnConvolutionForward
(
params
->
handle
,
alpha_p
,
APPLY_SPECIFIC
(
input
),
((
char
*
)
PyGpuArray_DEV_DATA
(
input
))
+
input_offset
*
g
,
APPLY_SPECIFIC
(
kerns
),
((
char
*
)
PyGpuArray_DEV_DATA
(
kerns
))
+
kern_offset
*
g
,
desc
,
algo
,
worksize
==
0
?
NULL
:
*
(
void
**
)
workspace
,
worksize
,
beta_p
,
APPLY_SPECIFIC
(
output
),
((
char
*
)
PyGpuArray_DEV_DATA
(
*
output
))
+
output_offset
*
g
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
err
=
cudnnConvolutionForward
(
params
->
handle
,
alpha_p
,
APPLY_SPECIFIC
(
input
),
((
char
*
)
PyGpuArray_DEV_DATA
(
input
))
+
input_offset
*
g
,
APPLY_SPECIFIC
(
kerns
),
((
char
*
)
PyGpuArray_DEV_DATA
(
kerns
))
+
kern_offset
*
g
,
desc
,
algo
,
worksize
==
0
?
NULL
:
*
(
void
**
)
workspace
,
worksize
,
beta_p
,
APPLY_SPECIFIC
(
output
),
((
char
*
)
PyGpuArray_DEV_DATA
(
*
output
))
+
output_offset
*
g
);
}
if
(
worksize
!=
0
)
...
...
@@ -348,12 +402,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_record
(
kerns
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_record
((
*
output
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
}
cuda_exit
(
c
->
ctx
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error doing operation: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error doing
cuDNN conv FWD
operation: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
return
0
;
}
theano/gpuarray/c_code/dnn_gi.c
浏览文件 @
b998dc61
#section init_code_struct
prev_algo
.
algo
=
PARAMS
->
conv_algo
;
prev_algo
.
mathType
=
CUDNN_DEFAULT_MATH
;
reuse_algo
=
0
;
prev_algo
=
PARAMS
->
conv_algo
;
memset
(
prev_kern_dims
,
0
,
sizeof
(
prev_kern_dims
));
memset
(
prev_top_dims
,
0
,
sizeof
(
prev_top_dims
));
hash_prefix
=
std
::
string
(
"GI|GPU#"
);
#section support_code_struct
#line 9 "dnn_gi.c"
int
reuse_algo
;
AlgoRec
prev_algo
;
std
::
string
hash_prefix
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
/** Check given algorithm against inputs and convolution descriptor,
    change algorithm inplace to a fallback algorithm if checkings fail.
    Return 0 on success, non-0 on error (Python exception set).

    Fix: removed four dead locals (`reuse_algo`, `prev_algo`,
    `prev_kern_dims`, `prev_top_dims`) that shadowed the struct-level state
    of the same names and were never used here. **/
int dnn_conv_gi_fallback(cudnnConvolutionBwdDataAlgo_t *_algo,
                         const PyGpuArrayObject *input,
                         const PyGpuArrayObject *kerns,
                         cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionBwdDataAlgo_t algo = *_algo;

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
       algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) &&
      PyGpuArray_NDIM(kerns) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                                        upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
        fprintf(stderr, "(replacing gradinput algo fft with none)\n");
#endif
      }
    } else {
      // algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
        fprintf(stderr, "(replacing gradinput algo fft_tiling with none)\n");
#endif
      }
    }
  }
  *_algo = algo;
  return 0;
}
int
APPLY_SPECIFIC
(
conv_gi
)(
PyGpuArrayObject
*
kerns
,
PyGpuArrayObject
*
output
,
...
...
@@ -23,6 +82,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
void
*
beta_p
;
float
af
=
alpha
,
bf
=
beta
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
bool
use_cached
=
0
;
if
(
PyGpuArray_DIMS
(
im
)[
1
]
!=
PyGpuArray_DIMS
(
kerns
)[
1
]
*
params
->
num_groups
)
{
PyErr_SetString
(
PyExc_ValueError
,
"images and kernel must have the same "
...
...
@@ -72,233 +132,200 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return
0
;
}
if
(
c_set_tensor_for_conv
(
output
,
APPLY_SPECIFIC
(
output
),
params
->
num_groups
)
==
-
1
)
int
groups
=
c_get_groups_for_conv
(
desc
,
params
->
num_groups
);
if
(
groups
==
-
1
)
return
1
;
if
(
c_set_
filter
(
kerns
,
APPLY_SPECIFIC
(
kerns
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_
tensor_for_conv
(
output
,
APPLY_SPECIFIC
(
output
),
groups
)
==
-
1
)
return
1
;
if
(
c_set_
tensor_for_conv
(
*
input
,
APPLY_SPECIFIC
(
input
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_
filter
(
kerns
,
APPLY_SPECIFIC
(
kerns
),
groups
)
==
-
1
)
return
1
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
*
input
,
0
)
/
params
->
num_groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
kerns
,
0
)
*
PyGpuArray_DIM
(
kerns
,
0
)
/
params
->
num_groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
output
,
0
)
/
params
->
num_groups
;
cudnnConvolutionBwdDataAlgo_t
algo
=
params
->
conv_algo
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
cuda_enter
(
c
->
ctx
);
int
expected_output_dims
[
5
]
=
{
0
};
err
=
cudnnGetConvolutionNdForwardOutputDim
(
desc
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_NDIM
(
im
),
expected_output_dims
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error computing convolution output dim: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
if
(
c_set_tensor_for_conv
(
*
input
,
APPLY_SPECIFIC
(
input
),
groups
)
==
-
1
)
return
1
;
}
if
(
PyGpuArray_NDIM
(
im
)
==
4
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
/
params
->
num_groups
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
],
expected_output_dims
[
2
],
expected_output_dims
[
3
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
]);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
else
if
(
PyGpuArray_NDIM
(
im
)
==
5
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
])
||
(
PyGpuArray_DIMS
(
output
)[
4
]
!=
expected_output_dims
[
4
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
],
expected_output_dims
[
2
],
expected_output_dims
[
3
],
expected_output_dims
[
4
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
],
PyGpuArray_DIMS
(
output
)[
4
]);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
if
(
params
->
choose_algo
)
{
if
(
!
params
->
choose_once
)
{
reuse_algo
=
1
;
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
kerns
);
i
++
)
{
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
kerns
,
i
)
==
prev_kern_dims
[
i
]);
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
output
,
i
)
==
prev_top_dims
[
i
]);
}
}
if
(
0
!=
dnn_check_convolution_output
(
desc
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_NDIM
(
kerns
),
output
,
groups
))
return
1
;
if
(
!
reuse_algo
)
{
size_t
free
;
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
)
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
*
input
,
0
)
/
groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
kerns
,
0
)
*
PyGpuArray_DIM
(
kerns
,
0
)
/
groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
output
,
0
)
/
groups
;
if
(
err2
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
"memory information on the GPU"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
cudnnConvolutionBwdDataAlgo_t
algo
=
params
->
conv_algo
;
size_t
worksize
=
0
;
cudnnMathType_t
mathtype
=
CUDNN_DEFAULT_MATH
;
// Guess 4Mb if the info is not available
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
std
::
string
hashkey
;
if
(
params
->
choose_time
)
{
int
count
;
cudnnConvolutionBwdDataAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
size_t
maxfree
=
c_get_largest_free_block_size
(
c
);
if
(
PyErr_Occurred
())
return
1
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
return
-
1
;
}
cuda_enter
(
c
->
ctx
);
err
=
cudnnFindConvolutionBackwardDataAlgorithmEx
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
*
input
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
gpudata_release
(
tmpmem
);
if
(
params
->
choose_algo
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
if
(
!
reuse_algo
)
{
char
pci_id
[
16
];
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_PCIBUSID
,
pci_id
);
// check out cache
hashkey
=
dnn_conv_shape
(
APPLY_SPECIFIC
(
input
),
*
input
,
APPLY_SPECIFIC
(
kerns
),
kerns
,
desc
,
output
,
groups
);
if
(
hashkey
.
empty
())
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
hashkey
=
hash_prefix
+
pci_id
+
(
params
->
choose_time
?
" -t "
:
" "
)
+
hashkey
;
const
AlgoRec
*
cached
=
dnn_conv_check_cache
(
hashkey
);
if
(
cached
)
{
prev_algo
=
*
cached
;
use_cached
=
1
;
}
}
algo
=
choice
.
algo
;
#ifdef DEBUG
if
(
count
==
0
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv gradinput algorithm found"
);
if
(
reuse_algo
||
use_cached
)
{
algo
=
(
cudnnConvolutionBwdDataAlgo_t
)
prev_algo
.
algo
;
worksize
=
prev_algo
.
wsSize
;
mathtype
=
prev_algo
.
mathType
;
}
else
{
if
(
params
->
choose_time
)
{
int
count
;
cudnnConvolutionBwdDataAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
// set the 'tensor math ok' flag
c_set_math_type_for_conv
(
desc
,
CUDNN_TENSOR_OP_MATH
);
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
maxfree
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
cuda_exit
(
c
->
ctx
);
return
-
1
;
}
err
=
cudnnFindConvolutionBackwardDataAlgorithmEx
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
*
input
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
maxfree
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting best-timed gradinput algo: %s"
,
cudnnGetErrorString
(
choice
.
status
));
}
#ifdef DEBUG
if
(
count
==
0
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv gradinput algorithm found"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting best-timed gradinput algo: %s"
,
cudnnGetErrorString
(
choice
.
status
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
// Else, count is necessarly 1 for current implementation.
#endif
algo
=
choice
.
algo
;
prev_algo
.
algo
=
(
int
)
algo
;
prev_algo
.
wsSize
=
worksize
=
choice
.
memory
;
#if CUDNN_MAJOR >= 7
prev_algo
.
mathType
=
mathtype
=
choice
.
mathType
;
#endif
}
else
{
err
=
cudnnGetConvolutionBackwardDataAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
maxfree
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
// Else, count is necessarly 1 for current implementation.
#endif
}
else
{
err
=
cudnnGetConvolutionBackwardDataAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
prev_algo
.
algo
=
algo
;
// no tensor_op returned from Get()
prev_algo
.
mathType
=
mathtype
=
CUDNN_DEFAULT_MATH
;
}
}
prev_algo
=
algo
;
}
else
{
algo
=
prev_algo
;
}
}
if
(
c_set_math_type_for_conv
(
desc
,
mathtype
)
==
-
1
||
dnn_conv_gi_fallback
(
&
algo
,
*
input
,
kerns
,
desc
)
!=
0
)
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
#ifdef DEBUG
char
algorithm_name
[
128
];
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t
(
algo
,
algorithm_name
))
// if FindEx was used (choose_time), workspace size is set.
if
(
!
(
reuse_algo
||
use_cached
||
params
->
choose_time
))
{
err
=
cudnnGetConvolutionBackwardDataWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
algo
,
&
worksize
);
if
(
err
==
CUDNN_STATUS_NOT_SUPPORTED
)
{
// Fallback to none algo if not supported
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
// NB: This is printed only when algorithm is chosen at runtime.
if
(
reuse_algo
)
fprintf
(
stderr
,
"(reused %s)
\n
"
,
algorithm_name
);
else
fprintf
(
stderr
,
"(using %s)
\n
"
,
algorithm_name
);
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
else
{
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
kerns
);
i
++
)
{
prev_kern_dims
[
i
]
=
PyGpuArray_DIM
(
kerns
,
i
);
prev_top_dims
[
i
]
=
PyGpuArray_DIM
(
output
,
i
);
}
fprintf
(
stderr
,
"(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_BWD_DATA_ALGO_0)
\n
"
,
algorithm_name
);
#endif
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
;
err
=
cudnnGetConvolutionBackwardDataWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
algo
,
&
worksize
);
}
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if
((
algo
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
||
algo
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT
)
&&
PyGpuArray_NDIM
(
kerns
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
upscale
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
upscale
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting convolution properties: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
// !(reuse_algo || use_cached || params->choose_time)
if
(
algo
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT
)
{
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
*
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
*
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
;
}
}
else
{
// algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
)
{
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
;
}
}
}
if
(
params
->
choose_algo
&&
(
!
params
->
choose_once
||
!
reuse_algo
))
{
// algo may have changed due to fallback, we must update it.
prev_algo
.
algo
=
algo
;
// save worksize for next time/cache
prev_algo
.
wsSize
=
worksize
;
size_t
worksize
;
gpudata
*
workspace
;
// Add to the cache
dnn_conv_update_cache
(
hashkey
,
prev_algo
);
}
err
=
cudnnGetConvolutionBackwardDataWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
algo
,
&
worksize
);
#ifdef DEBUG
if
(
params
->
choose_algo
)
{
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
// NB: This is printed only when algorithm is chosen at runtime.
fprintf
(
stderr
,
"(using %s %s%s%s%s, ws:%ld, hash:%s)
\n
"
,
algorithm_name
,
params
->
choose_time
?
"(timed)"
:
""
,
reuse_algo
?
"(reused)"
:
""
,
use_cached
?
"(cache)"
:
""
,
mathtype
==
CUDNN_TENSOR_OP_MATH
?
"(tensor op)"
:
""
,
worksize
,
hashkey
.
c_str
()
);
}
#endif
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
gpudata
*
workspace
=
0
;
if
(
worksize
!=
0
)
{
workspace
=
gpudata_alloc
(
c
->
ctx
,
worksize
,
NULL
,
0
,
NULL
);
if
(
workspace
==
NULL
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"Could not allocate working memory"
);
PyErr_SetString
(
PyExc_RuntimeError
,
"Could not allocate working memory"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
...
...
@@ -308,8 +335,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait
(
output
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_wait
((
*
input
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
for
(
int
g
=
0
;
g
<
params
->
num_groups
;
g
++
)
{
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
err
=
cudnnConvolutionBackwardData
(
params
->
handle
,
alpha_p
,
...
...
@@ -330,7 +356,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit
(
c
->
ctx
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error doing operation: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error doing
cuDNN conv gradinput
operation: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
...
...
theano/gpuarray/c_code/dnn_gw.c
浏览文件 @
b998dc61
#section init_code_struct
prev_algo
.
algo
=
PARAMS
->
conv_algo
;
prev_algo
.
mathType
=
CUDNN_DEFAULT_MATH
;
reuse_algo
=
0
;
prev_algo
=
PARAMS
->
conv_algo
;
memset
(
prev_img_dims
,
0
,
sizeof
(
prev_img_dims
));
memset
(
prev_top_dims
,
0
,
sizeof
(
prev_top_dims
));
hash_prefix
=
std
::
string
(
"GW|GPU#"
);
#section support_code_struct
#line 9 "dnn_gw.c"
int
reuse_algo
;
AlgoRec
prev_algo
;
std
::
string
hash_prefix
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
/** Check given algorithm against inputs and convolution descriptor,
change algorithm inplace to a fallback algorithm if checkings fail.
Return 0 on success, non-0 on error. **/
int
dnn_conv_gw_fallback
(
cudnnConvolutionBwdFilterAlgo_t
*
_algo
,
const
PyGpuArrayObject
*
input
,
const
PyGpuArrayObject
*
kerns
,
cudnnConvolutionDescriptor_t
desc
)
{
cudnnConvolutionBwdFilterAlgo_t
algo
=
*
_algo
;
int
reuse_algo
;
cudnnConvolutionBwdFilterAlgo_t
prev_algo
;
size_t
prev_img_dims
[
5
];
size_t
prev_top_dims
[
5
];
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if
(
algo
==
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT
&&
PyGpuArray_NDIM
(
input
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
upscale
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
cudnnStatus_t
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
upscale
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting convolution properties: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
;
#ifdef DEBUG
fprintf
(
stderr
,
"(replacing gradweight algo fft with none)
\n
"
);
#endif
}
}
*
_algo
=
algo
;
return
0
;
}
int
APPLY_SPECIFIC
(
conv_gw
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
output
,
...
...
@@ -23,6 +69,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
void
*
beta_p
;
float
af
=
alpha
,
bf
=
beta
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
bool
use_cached
=
0
;
if
(
PyGpuArray_DIMS
(
input
)[
1
]
!=
PyGpuArray_DIMS
(
km
)[
1
]
*
params
->
num_groups
)
{
PyErr_SetString
(
PyExc_ValueError
,
...
...
@@ -72,99 +119,71 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return
0
;
}
if
(
c_set_tensor_for_conv
(
input
,
APPLY_SPECIFIC
(
input
),
params
->
num_groups
)
==
-
1
)
int
groups
=
c_get_groups_for_conv
(
desc
,
params
->
num_groups
);
if
(
groups
==
-
1
)
return
1
;
if
(
c_set_tensor_for_conv
(
output
,
APPLY_SPECIFIC
(
output
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_tensor_for_conv
(
input
,
APPLY_SPECIFIC
(
input
),
groups
)
==
-
1
)
return
1
;
if
(
c_set_filter
(
*
kerns
,
APPLY_SPECIFIC
(
kerns
),
params
->
num_groups
)
==
-
1
)
if
(
c_set_tensor_for_conv
(
output
,
APPLY_SPECIFIC
(
output
),
groups
)
==
-
1
)
return
1
;
if
(
c_set_filter
(
*
kerns
,
APPLY_SPECIFIC
(
kerns
),
groups
)
==
-
1
)
return
1
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
input
,
0
)
/
params
->
num_groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
*
kerns
,
0
)
*
PyGpuArray_DIM
(
*
kerns
,
0
)
/
params
->
num_groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
output
,
0
)
/
params
->
num_groups
;
if
(
0
!=
dnn_check_convolution_output
(
desc
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_NDIM
(
*
kerns
),
output
,
groups
))
return
1
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
input
,
0
)
/
groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
*
kerns
,
0
)
*
PyGpuArray_DIM
(
*
kerns
,
0
)
/
groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
output
,
0
)
/
groups
;
cudnnConvolutionBwdFilterAlgo_t
algo
=
params
->
conv_algo
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
size_t
worksize
=
0
;
cudnnMathType_t
mathtype
=
CUDNN_DEFAULT_MATH
;
cuda_enter
(
c
->
ctx
)
;
std
::
string
hashkey
;
int
expected_output_dims
[
5
]
=
{
0
};
err
=
cudnnGetConvolutionNdForwardOutputDim
(
desc
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_NDIM
(
input
),
expected_output_dims
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error computing convolution output dim: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
if
(
PyGpuArray_NDIM
(
input
)
==
4
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
/
params
->
num_groups
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %ldx%ldx%dx%ld"
" but received gradient with shape %ldx%ldx%dx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
],
expected_output_dims
[
2
],
expected_output_dims
[
3
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
]);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
else
if
(
PyGpuArray_NDIM
(
input
)
==
5
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
])
||
(
PyGpuArray_DIMS
(
output
)[
4
]
!=
expected_output_dims
[
4
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
],
expected_output_dims
[
2
],
expected_output_dims
[
3
],
expected_output_dims
[
4
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
],
PyGpuArray_DIMS
(
output
)[
4
]);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
size_t
maxfree
=
c_get_largest_free_block_size
(
c
);
if
(
PyErr_Occurred
())
return
1
;
cuda_enter
(
c
->
ctx
);
if
(
params
->
choose_algo
)
{
if
(
!
params
->
choose_once
)
{
reuse_algo
=
1
;
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
input
,
i
)
==
prev_img_dims
[
i
]);
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
output
,
i
)
==
prev_top_dims
[
i
]);
}
}
if
(
!
reuse_algo
)
{
size_t
free
;
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
);
if
(
err2
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
"memory information on the GPU"
);
char
pci_id
[
16
];
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_PCIBUSID
,
pci_id
);
// check out cache
hashkey
=
dnn_conv_shape
(
APPLY_SPECIFIC
(
input
),
input
,
APPLY_SPECIFIC
(
kerns
),
*
kerns
,
desc
,
output
,
groups
);
if
(
hashkey
.
empty
())
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
hashkey
=
hash_prefix
+
pci_id
+
(
params
->
choose_time
?
" -t "
:
" "
)
+
hashkey
;
const
AlgoRec
*
cached
=
dnn_conv_check_cache
(
hashkey
);
if
(
cached
)
{
prev_algo
=
*
cached
;
use_cached
=
1
;
}
}
// Guess 4Mb if the info is not available
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
if
(
reuse_algo
||
use_cached
)
{
algo
=
(
cudnnConvolutionBwdFilterAlgo_t
)
prev_algo
.
algo
;
worksize
=
prev_algo
.
wsSize
;
mathtype
=
prev_algo
.
mathType
;
}
else
{
if
(
params
->
choose_time
)
{
int
count
;
cudnnConvolutionBwdFilterAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
// set the 'tensor math ok' flag
c_set_math_type_for_conv
(
desc
,
CUDNN_TENSOR_OP_MATH
);
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
maxfree
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
cuda_exit
(
c
->
ctx
);
return
-
1
;
}
...
...
@@ -172,7 +191,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
params
->
handle
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
*
kerns
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
max
free
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
...
...
@@ -183,25 +202,32 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return
1
;
}
algo
=
choice
.
algo
;
#ifdef DEBUG
if
(
count
==
0
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv gradweight algorithm found"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting best-timed gradweight algo: %s"
,
cudnnGetErrorString
(
choice
.
status
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
// Else, count is necessarly 1 for current implementation.
#endif
algo
=
choice
.
algo
;
prev_algo
.
algo
=
(
int
)
algo
;
prev_algo
.
wsSize
=
worksize
=
choice
.
memory
;
#if CUDNN_MAJOR >= 7
prev_algo
.
mathType
=
mathtype
=
choice
.
mathType
;
#endif
}
else
{
err
=
cudnnGetConvolutionBackwardFilterAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
max
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
...
...
@@ -209,79 +235,84 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit
(
c
->
ctx
);
return
1
;
}
prev_algo
.
algo
=
algo
;
// no tensor_op returned from Get()
prev_algo
.
mathType
=
mathtype
=
CUDNN_DEFAULT_MATH
;
}
prev_algo
=
algo
;
}
else
{
algo
=
prev_algo
;
}
}
/* choose_algo */
if
(
c_set_math_type_for_conv
(
desc
,
mathtype
)
==
-
1
||
dnn_conv_gw_fallback
(
&
algo
,
input
,
*
kerns
,
desc
)
!=
0
)
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t
(
algo
,
algorithm_name
))
// if FindEx was used (choose_time), workspace size is set.
if
(
!
(
reuse_algo
||
use_cached
||
params
->
choose_time
))
{
err
=
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
algo
,
&
worksize
);
if
(
err
==
CUDNN_STATUS_NOT_SUPPORTED
)
{
// Fallback to none algo if not supported
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
// NB: This is printed only when algorithm is chosen at runtime.
if
(
reuse_algo
)
fprintf
(
stderr
,
"(reused %s)
\n
"
,
algorithm_name
);
else
fprintf
(
stderr
,
"(using %s)
\n
"
,
algorithm_name
);
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
else
{
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
prev_img_dims
[
i
]
=
PyGpuArray_DIM
(
input
,
i
);
prev_top_dims
[
i
]
=
PyGpuArray_DIM
(
output
,
i
);
}
fprintf
(
stderr
,
"(error getting worksize for %s: falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0)
\n
"
,
algorithm_name
);
#endif
algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
;
err
=
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
algo
,
&
worksize
);
}
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if
(
algo
==
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT
&&
PyGpuArray_NDIM
(
input
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
upscale
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
upscale
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting convolution properties: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
*
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
*
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
;
}
}
size_t
worksize
;
gpudata
*
workspace
;
if
(
params
->
choose_algo
&&
(
!
params
->
choose_once
||
!
reuse_algo
))
{
// algo may have changed due to fallback, we must update it.
prev_algo
.
algo
=
algo
;
// save worksize for next time/cache
prev_algo
.
wsSize
=
worksize
;
err
=
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
algo
,
&
worksize
);
// Add to the cache
dnn_conv_update_cache
(
hashkey
,
prev_algo
);
}
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
cudnnGetErrorString
(
err
));
#ifdef DEBUG
if
(
params
->
choose_algo
)
{
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
// NB: This is printed only when algorithm is chosen at runtime.
fprintf
(
stderr
,
"(using %s %s%s%s%s, ws:%ld, hash:%s)
\n
"
,
algorithm_name
,
params
->
choose_time
?
"(timed)"
:
""
,
reuse_algo
?
"(reused)"
:
""
,
use_cached
?
"(cache)"
:
""
,
mathtype
==
CUDNN_TENSOR_OP_MATH
?
"(tensor op)"
:
""
,
worksize
,
hashkey
.
c_str
()
);
}
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
gpudata
*
workspace
=
0
;
if
(
worksize
!=
0
)
{
workspace
=
gpudata_alloc
(
c
->
ctx
,
worksize
,
NULL
,
0
,
NULL
);
if
(
workspace
==
NULL
)
{
...
...
@@ -295,9 +326,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait
(
output
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_wait
((
*
kerns
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
for
(
int
g
=
0
;
g
<
params
->
num_groups
;
g
++
)
{
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
err
=
cudnnConvolutionBackwardFilter
(
params
->
handle
,
alpha_p
,
...
...
@@ -318,7 +347,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit
(
c
->
ctx
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error doing operation: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error doing
cuDNN conv gradweight
operation: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
...
...
theano/gpuarray/dnn.py
浏览文件 @
b998dc61
...
...
@@ -399,7 +399,7 @@ class DnnBase(COp):
return
[]
def
c_code_cache_version
(
self
):
return
(
super
(
DnnBase
,
self
)
.
c_code_cache_version
(),
version
(),
1
)
return
(
super
(
DnnBase
,
self
)
.
c_code_cache_version
(),
version
(),
4
)
class
GpuDnnConvDesc
(
COp
):
...
...
@@ -412,7 +412,8 @@ class GpuDnnConvDesc(COp):
"""
__props__
=
(
'border_mode'
,
'subsample'
,
'dilation'
,
'conv_mode'
,
'precision'
)
__props__
=
(
'border_mode'
,
'subsample'
,
'dilation'
,
'conv_mode'
,
'precision'
,
'num_groups'
)
params_type
=
ParamsType
(
pad0
=
int_t
,
pad1
=
int_t
,
pad2
=
int_t
,
sub0
=
int_t
,
sub1
=
int_t
,
sub2
=
int_t
,
dil0
=
int_t
,
dil1
=
int_t
,
dil2
=
int_t
,
...
...
@@ -421,7 +422,8 @@ class GpuDnnConvDesc(COp):
(
'BORDER_MODE_VALID'
,
'valid'
),
(
'BORDER_MODE_HALF'
,
'half'
)),
conv_mode
=
cudnn
.
cudnnConvolutionMode_t
,
precision
=
cudnn
.
cudnnDataType_t
)
precision
=
cudnn
.
cudnnDataType_t
,
num_groups
=
int_t
)
def
c_headers
(
self
):
return
[
'cudnn.h'
,
'cudnn_helper.h'
]
...
...
@@ -448,7 +450,7 @@ class GpuDnnConvDesc(COp):
return
False
def
__init__
(
self
,
border_mode
,
subsample
=
(
1
,
1
),
dilation
=
(
1
,
1
),
conv_mode
=
'conv'
,
precision
=
"float32"
):
precision
=
"float32"
,
num_groups
=
1
):
COp
.
__init__
(
self
,
[
"c_code/conv_desc.c"
],
"APPLY_SPECIFIC(conv_desc)"
)
if
version
()
<
6000
and
any
([
d
!=
1
for
d
in
dilation
]):
...
...
@@ -470,6 +472,7 @@ class GpuDnnConvDesc(COp):
self
.
subsample
=
subsample
assert
cudnn
.
cudnnConvolutionMode_t
.
has_alias
(
conv_mode
)
self
.
conv_mode
=
conv_mode
self
.
num_groups
=
num_groups
assert
len
(
dilation
)
==
len
(
subsample
)
self
.
dilation
=
dilation
...
...
@@ -514,6 +517,8 @@ class GpuDnnConvDesc(COp):
self
.
__dict__
.
update
(
d
)
if
not
hasattr
(
self
,
"dilation"
):
self
.
dilation
=
(
1
,)
*
len
(
self
.
subsample
)
if
not
hasattr
(
self
,
"num_groups"
):
self
.
num_groups
=
1
# scalar constants
...
...
@@ -622,8 +627,6 @@ class GpuDnnConv(DnnBase):
SUPPORTED_DNN_CONV_ALGO_RUNTIME
):
raise
ValueError
(
"convolution algo
%
s can't be used for "
"3d convolutions"
,
(
self
.
algo
,))
if
img
.
type
.
ndim
==
5
and
self
.
num_groups
!=
1
:
raise
ValueError
(
"Grouped convolutions not implemented for 3D convolutions"
)
if
(
not
isinstance
(
desc
.
type
,
CDataType
)
or
desc
.
type
.
ctype
!=
'cudnnConvolutionDescriptor_t'
):
...
...
@@ -854,7 +857,6 @@ class GpuDnnConvGradI(DnnBase):
if
algo
is
None
:
algo
=
config
.
dnn
.
conv
.
algo_bwd_data
self
.
algo
=
algo
assert
cudnn
.
cudnnConvolutionBwdDataAlgo_t
.
has_alias
(
self
.
algo
)
or
self
.
algo
in
SUPPORTED_DNN_CONV_ALGO_RUNTIME
self
.
conv_algo
=
cudnn
.
cudnnConvolutionBwdDataAlgo_t
.
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
...
...
@@ -1039,7 +1041,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
img
=
gpu_contiguous
(
img
)
kerns
=
gpu_contiguous
(
kerns
)
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
conv_mode
=
conv_mode
,
precision
=
precision
,
num_groups
=
num_groups
)(
kerns
.
shape
)
desc_op
=
desc
.
owner
.
op
# We can use Shape_i and bypass the infer_shape here as this is on
# the input of node and it will always be present.
...
...
@@ -1056,7 +1059,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
def
dnn_conv3d
(
img
,
kerns
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
direction_hint
=
None
,
algo
=
None
,
precision
=
None
):
algo
=
None
,
precision
=
None
,
num_groups
=
1
):
"""
GPU convolution using cuDNN from NVIDIA.
...
...
@@ -1099,6 +1102,9 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
should be done. Possible values are 'as_input', 'float16', 'float32'
and 'float64'. Default is the value of
:attr:`config.dnn.conv.precision`.
num_groups :
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
.. warning:: The cuDNN library only works with GPUs that have a compute
...
...
@@ -1113,7 +1119,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
fgraph
=
getattr
(
img
,
'fgraph'
,
None
)
or
getattr
(
kerns
,
'fgraph'
,
None
)
ctx_name
=
infer_context_name
(
img
,
kerns
)
if
(
border_mode
==
'valid'
and
subsample
==
(
1
,
1
,
1
)
and
dilation
==
(
1
,
1
,
1
)
and
direction_hint
==
'bprop weights'
):
direction_hint
==
'bprop weights'
and
num_groups
==
1
):
# Special case: We are asked to use GpuDnnConvGradW. We need to set
# up a suitable 'fake' convolution to compute the gradient for.
img
=
gpu_contiguous
(
img
.
dimshuffle
(
1
,
0
,
2
,
3
,
4
))
...
...
@@ -1135,7 +1141,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
return
as_gpuarray_variable
(
conv
.
dimshuffle
(
1
,
0
,
2
,
3
,
4
),
ctx_name
)
elif
(
border_mode
==
'full'
and
subsample
==
(
1
,
1
,
1
)
and
direction_hint
!=
'forward!'
):
direction_hint
!=
'forward!'
and
num_groups
==
1
):
# Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution.
# We just need to set up a suitable 'fake' valid convolution.
...
...
@@ -1159,7 +1165,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
img
=
gpu_contiguous
(
img
)
kerns
=
gpu_contiguous
(
kerns
)
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
conv_mode
=
conv_mode
,
precision
=
precision
,
num_groups
=
num_groups
)(
kerns
.
shape
)
desc_op
=
desc
.
owner
.
op
# We can use Shape_i and bypass the infer_shape here as this is on
# the input of node and it will always be present.
...
...
@@ -1171,7 +1178,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
filter_dilation
=
dilation
)
out_shp
=
assert_conv_shape
(
out_shp
)
out
=
GpuAllocEmpty
(
dtype
=
img
.
dtype
,
context_name
=
ctx_name
)(
*
out_shp
)
return
GpuDnnConv
(
algo
=
algo
)(
img
,
kerns
,
out
,
desc
)
return
GpuDnnConv
(
algo
=
algo
,
num_groups
=
num_groups
)(
img
,
kerns
,
out
,
desc
)
def
dnn_gradweight
(
img
,
topgrad
,
kerns_shp
,
border_mode
=
'valid'
,
...
...
@@ -1189,18 +1196,21 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
precision
=
get_precision
(
precision
,
[
img
,
topgrad
])
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns_shp
)
conv_mode
=
conv_mode
,
precision
=
precision
,
num_groups
=
num_groups
)(
kerns_shp
)
out
=
GpuAllocEmpty
(
dtype
=
img
.
dtype
,
context_name
=
ctx_name
)(
*
kerns_shp
)
return
GpuDnnConvGradW
(
algo
=
algo
,
num_groups
=
num_groups
)(
img
,
topgrad
,
out
,
desc
)
def
dnn_gradweight3d
(
img
,
topgrad
,
kerns_shp
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
precision
=
None
):
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
1
):
"""
3d version of dnn_gradweight
"""
return
dnn_gradweight
(
img
,
topgrad
,
kerns_shp
,
border_mode
,
subsample
,
dilation
,
conv_mode
,
precision
)
subsample
,
dilation
,
conv_mode
,
precision
,
algo
,
num_groups
)
def
dnn_gradinput
(
kerns
,
topgrad
,
img_shp
,
border_mode
=
'valid'
,
...
...
@@ -1218,18 +1228,21 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
precision
=
get_precision
(
precision
,
[
kerns
,
topgrad
])
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
conv_mode
=
conv_mode
,
precision
=
precision
,
num_groups
=
num_groups
)(
kerns
.
shape
)
out
=
GpuAllocEmpty
(
dtype
=
kerns
.
dtype
,
context_name
=
ctx_name
)(
*
img_shp
)
return
GpuDnnConvGradI
(
algo
=
algo
,
num_groups
=
num_groups
)(
kerns
,
topgrad
,
out
,
desc
)
def
dnn_gradinput3d
(
kerns
,
topgrad
,
img_shp
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
precision
=
None
):
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
1
):
"""
3d version of `dnn_gradinput`.
"""
return
dnn_gradinput
(
kerns
,
topgrad
,
img_shp
,
border_mode
,
subsample
,
dilation
,
conv_mode
,
precision
)
dilation
,
conv_mode
,
precision
,
algo
,
num_groups
)
class
GpuDnnPoolDesc
(
Op
):
...
...
@@ -3020,8 +3033,6 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if
version
(
raises
=
False
)
<
6000
and
op
.
filter_dilation
!=
(
1
,
1
):
return
None
if
op
.
num_groups
>
1
:
return
None
inp1
=
inputs
[
0
]
inp2
=
inputs
[
1
]
...
...
@@ -3071,8 +3082,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
if
version
(
raises
=
False
)
<
6000
and
op
.
filter_dilation
!=
(
1
,
1
,
1
):
return
None
if
op
.
num_groups
>
1
:
return
None
inp1
=
inputs
[
0
]
inp2
=
inputs
[
1
]
...
...
@@ -3091,7 +3100,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
subsample
=
op
.
subsample
,
dilation
=
op
.
filter_dilation
,
direction_hint
=
'forward!'
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
,
num_groups
=
op
.
num_groups
)
elif
isinstance
(
op
,
AbstractConv3d_gradWeights
):
shape
=
(
inp2
.
shape
[
1
],
inp1
.
shape
[
1
],
inputs
[
2
][
0
],
inputs
[
2
][
1
],
inputs
[
2
][
2
])
...
...
@@ -3099,7 +3109,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
border_mode
=
op
.
border_mode
,
subsample
=
op
.
subsample
,
dilation
=
op
.
filter_dilation
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
,
num_groups
=
op
.
num_groups
)
elif
isinstance
(
op
,
AbstractConv3d_gradInputs
):
shape
=
(
inp2
.
shape
[
0
],
inp1
.
shape
[
1
],
inputs
[
2
][
0
],
inputs
[
2
][
1
],
inputs
[
2
][
2
])
...
...
@@ -3107,7 +3118,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
border_mode
=
op
.
border_mode
,
subsample
=
op
.
subsample
,
dilation
=
op
.
filter_dilation
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
,
num_groups
=
op
.
num_groups
)
return
[
rval
]
...
...
theano/gpuarray/tests/test_dnn.py
浏览文件 @
b998dc61
...
...
@@ -26,6 +26,7 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer
from
theano.configdefaults
import
SUPPORTED_DNN_CONV_ALGO_FWD
from
theano.tensor.nnet.tests.test_abstract_conv
import
Grouped_conv_noOptim
from
theano.tensor.nnet.tests.test_abstract_conv
import
Grouped_conv3d_noOptim
try
:
import
pygpu
...
...
@@ -2264,7 +2265,7 @@ def test_dnn_rnn_lstm_grad_c():
utt
.
assert_allclose
(
ref_grads_layer
[
j
],
g
)
def
dconv
2
d
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconv
fw
d
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconv
(
img
,
kern
):
return
dnn
.
dnn_conv
(
img
,
kern
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
filter_dilation
,
conv_mode
=
'conv'
,
direction_hint
=
'forward'
,
workmem
=
None
,
...
...
@@ -2272,14 +2273,14 @@ def dconv2d(border_mode, subsample, filter_dilation, num_groups):
return
dconv
def
dconv
2d
w
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconv
g
w
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconvw
(
img
,
topgrad
,
kshp
):
return
dnn
.
dnn_gradweight
(
img
,
topgrad
,
kshp
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
filter_dilation
,
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
num_groups
)
return
dconvw
def
dconv
2d
i
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconv
g
i
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconvi
(
kern
,
topgrad
,
imshp
):
return
dnn
.
dnn_gradinput
(
kern
,
topgrad
,
imshp
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
filter_dilation
,
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
num_groups
)
...
...
@@ -2288,9 +2289,21 @@ def dconv2di(border_mode, subsample, filter_dilation, num_groups):
class
Cudnn_grouped_conv
(
Grouped_conv_noOptim
):
mode
=
mode_with_gpu
conv
=
staticmethod
(
dconv2d
)
conv_gradw
=
staticmethod
(
dconv2dw
)
conv_gradi
=
staticmethod
(
dconv2di
)
conv
=
staticmethod
(
dconvfwd
)
conv_gradw
=
staticmethod
(
dconvgw
)
conv_gradi
=
staticmethod
(
dconvgi
)
conv_op
=
dnn
.
GpuDnnConv
conv_gradw_op
=
dnn
.
GpuDnnConvGradW
conv_gradi_op
=
dnn
.
GpuDnnConvGradI
flip_filter
=
False
is_dnn
=
True
class
Cudnn_grouped_conv3d
(
Grouped_conv3d_noOptim
):
mode
=
mode_with_gpu
conv
=
staticmethod
(
dconvfwd
)
conv_gradw
=
staticmethod
(
dconvgw
)
conv_gradi
=
staticmethod
(
dconvgi
)
conv_op
=
dnn
.
GpuDnnConv
conv_gradw_op
=
dnn
.
GpuDnnConvGradW
conv_gradi_op
=
dnn
.
GpuDnnConvGradI
...
...
@@ -2519,3 +2532,151 @@ def test_dnn_spatialtf_grad():
utt
.
verify_grad
(
grad_functor
,
[
inputs_val
,
theta_val
],
mode
=
mode_with_gpu
,
abs_tol
=
atol
,
rel_tol
=
rtol
)
class
TestDnnConv2DRuntimeAlgorithms
(
object
):
ndim
=
2
cpu_conv_class
=
theano
.
tensor
.
nnet
.
corr
.
CorrMM
runtime_shapes
=
[
(
3
,
[(
2
,
3
,
10
,
9
),
(
5
,
3
,
7
,
7
)]),
(
1
,
[(
1
,
1
,
100
,
200
),
(
1
,
1
,
50
,
200
)]),
(
1
,
[(
4
,
2
,
20
,
20
),
(
2
,
2
,
20
,
19
)]),
(
3
,
[(
2
,
3
,
10
,
9
),
(
5
,
3
,
7
,
7
)]),
# cache should be used
(
1
,
[(
2
,
2
,
50
,
50
),
(
5
,
2
,
25
,
31
)]),
(
1
,
[(
1
,
1
,
100
,
200
),
(
1
,
1
,
50
,
200
)]),
# cache should be used
(
1
,
[(
4
,
2
,
20
,
20
),
(
2
,
2
,
20
,
19
)]),
# cache should be used
(
1
,
[(
1
,
2
,
3
,
4
),
(
6
,
2
,
2
,
1
)])
]
def
__init__
(
self
):
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
utt
.
seed_rng
()
self
.
runtime_algorithms
=
(
'time_once'
,
'guess_once'
,
'time_on_shape_change'
,
'guess_on_shape_change'
)
def
test_fwd_runtime_algorithms
(
self
):
dtype
=
'float32'
unit_shape
=
(
1
,)
*
self
.
ndim
_broadcastable
=
[
False
]
*
(
2
+
self
.
ndim
)
def
run_fwd_runtime_algorithm
(
algo
):
inputs
=
theano
.
tensor
.
TensorType
(
dtype
,
_broadcastable
)()
filters
=
theano
.
tensor
.
TensorType
(
dtype
,
_broadcastable
)()
# Scale down the input values to prevent very large absolute errors
# due to float rounding
lower_inputs
=
inputs
/
10
lower_filters
=
filters
/
10
conv
=
dnn
.
dnn_conv
(
img
=
lower_inputs
,
kerns
=
lower_filters
,
algo
=
algo
,
precision
=
dtype
,
subsample
=
unit_shape
,
dilation
=
unit_shape
)
f
=
theano
.
function
([
inputs
,
filters
],
conv
,
mode
=
mode_with_gpu
)
if
self
.
ndim
==
3
:
flipped_filters
=
lower_filters
[:,
:,
::
-
1
,
::
-
1
,
::
-
1
]
else
:
flipped_filters
=
lower_filters
[:,
:,
::
-
1
,
::
-
1
]
conv_ref
=
self
.
cpu_conv_class
(
subsample
=
unit_shape
)(
ref_cast
(
lower_inputs
),
flipped_filters
)
f_ref
=
theano
.
function
([
inputs
,
filters
],
conv_ref
,
mode
=
'FAST_RUN'
)
runtime_shapes
=
self
.
runtime_shapes
if
algo
in
(
'time_once'
,
'guess_once'
):
runtime_shapes
=
[
list
(
runtime_shapes
[
0
])]
runtime_shapes
[
0
][
0
]
=
5
for
ntimes
,
(
inputs_shape
,
filters_shape
)
in
runtime_shapes
:
for
i
in
range
(
ntimes
):
inputs_val
=
np
.
random
.
random
(
inputs_shape
)
.
astype
(
dtype
)
filters_val
=
np
.
random
.
random
(
filters_shape
)
.
astype
(
dtype
)
gpu_res
=
f
(
inputs_val
,
filters_val
)
cpu_res
=
f_ref
(
inputs_val
,
filters_val
)
utt
.
assert_allclose
(
cpu_res
,
np
.
asarray
(
gpu_res
))
for
algo
in
self
.
runtime_algorithms
:
yield
(
run_fwd_runtime_algorithm
,
algo
)
def
test_gradinput_runtime_algorithms
(
self
):
dtype
=
'float32'
unit_shape
=
(
1
,)
*
self
.
ndim
_broadcastable
=
[
False
]
*
(
2
+
self
.
ndim
)
def
run_gradinput_runtime_algorithm
(
algo
):
theano
.
config
.
dnn
.
conv
.
algo_bwd_data
=
algo
inputs
=
theano
.
tensor
.
TensorType
(
dtype
,
_broadcastable
)()
filters
=
theano
.
tensor
.
TensorType
(
dtype
,
_broadcastable
)()
conv
=
dnn
.
dnn_conv
(
img
=
inputs
,
kerns
=
filters
,
algo
=
algo
,
precision
=
dtype
,
subsample
=
unit_shape
,
dilation
=
unit_shape
)
grad_i
=
theano
.
tensor
.
grad
(
conv
.
sum
(),
[
inputs
])
f
=
theano
.
function
([
inputs
,
filters
],
grad_i
,
mode
=
mode_with_gpu
)
assert
1
==
len
([
node
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
if
isinstance
(
node
.
op
,
dnn
.
GpuDnnConvGradI
)])
assert
not
any
(
isinstance
(
node
.
op
,
dnn
.
GpuDnnConv
)
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
)
assert
not
any
(
isinstance
(
node
.
op
,
dnn
.
GpuDnnConvGradW
)
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
)
if
self
.
ndim
==
3
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
,
::
-
1
]
else
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
]
conv_ref
=
self
.
cpu_conv_class
(
subsample
=
unit_shape
)(
ref_cast
(
inputs
),
flipped_filters
)
grad_i_ref
=
theano
.
tensor
.
grad
(
conv_ref
.
sum
(),
[
inputs
])
f_ref
=
theano
.
function
([
inputs
,
filters
],
grad_i_ref
,
mode
=
'FAST_RUN'
)
runtime_shapes
=
self
.
runtime_shapes
if
algo
in
(
'time_once'
,
'guess_once'
):
runtime_shapes
=
[
list
(
runtime_shapes
[
0
])]
runtime_shapes
[
0
][
0
]
=
5
for
ntimes
,
(
inputs_shape
,
filters_shape
)
in
runtime_shapes
:
for
i
in
range
(
ntimes
):
inputs_val
=
np
.
random
.
random
(
inputs_shape
)
.
astype
(
dtype
)
filters_val
=
np
.
random
.
random
(
filters_shape
)
.
astype
(
dtype
)
gpu_res
=
f
(
inputs_val
,
filters_val
)
cpu_res
=
f_ref
(
inputs_val
,
filters_val
)
utt
.
assert_allclose
(
cpu_res
,
np
.
asarray
(
gpu_res
))
for
algo
in
self
.
runtime_algorithms
:
yield
(
run_gradinput_runtime_algorithm
,
algo
)
def
test_gradweight_runtime_algorithms
(
self
):
dtype
=
'float32'
unit_shape
=
(
1
,)
*
self
.
ndim
_broadcastable
=
[
False
]
*
(
2
+
self
.
ndim
)
def
run_gradweight_runtime_algorithm
(
algo
):
theano
.
config
.
dnn
.
conv
.
algo_bwd_filter
=
algo
inputs
=
theano
.
tensor
.
TensorType
(
dtype
,
_broadcastable
)()
filters
=
theano
.
tensor
.
TensorType
(
dtype
,
_broadcastable
)()
conv
=
dnn
.
dnn_conv
(
img
=
inputs
,
kerns
=
filters
,
algo
=
algo
,
precision
=
dtype
,
subsample
=
unit_shape
,
dilation
=
unit_shape
)
grad_w
=
theano
.
tensor
.
grad
(
conv
.
sum
(),
[
filters
])
f
=
theano
.
function
([
inputs
,
filters
],
grad_w
,
mode
=
mode_with_gpu
)
assert
1
==
len
([
node
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
if
isinstance
(
node
.
op
,
dnn
.
GpuDnnConvGradW
)])
assert
not
any
(
isinstance
(
node
.
op
,
dnn
.
GpuDnnConv
)
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
)
assert
not
any
(
isinstance
(
node
.
op
,
dnn
.
GpuDnnConvGradI
)
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
)
if
self
.
ndim
==
3
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
,
::
-
1
]
else
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
]
conv_ref
=
self
.
cpu_conv_class
(
subsample
=
unit_shape
)(
ref_cast
(
inputs
),
flipped_filters
)
grad_w_ref
=
theano
.
tensor
.
grad
(
conv_ref
.
sum
(),
[
filters
])
f_ref
=
theano
.
function
([
inputs
,
filters
],
grad_w_ref
,
mode
=
'FAST_RUN'
)
runtime_shapes
=
self
.
runtime_shapes
if
algo
in
(
'time_once'
,
'guess_once'
):
runtime_shapes
=
[
list
(
runtime_shapes
[
0
])]
runtime_shapes
[
0
][
0
]
=
5
for
ntimes
,
(
inputs_shape
,
filters_shape
)
in
runtime_shapes
:
for
i
in
range
(
ntimes
):
inputs_val
=
np
.
random
.
random
(
inputs_shape
)
.
astype
(
dtype
)
filters_val
=
np
.
random
.
random
(
filters_shape
)
.
astype
(
dtype
)
gpu_res
=
f
(
inputs_val
,
filters_val
)
cpu_res
=
f_ref
(
inputs_val
,
filters_val
)
utt
.
assert_allclose
(
cpu_res
,
np
.
asarray
(
gpu_res
))
for
algo
in
self
.
runtime_algorithms
:
yield
(
run_gradweight_runtime_algorithm
,
algo
)
class
TestDnnConv3DRuntimeAlgorithms
(
TestDnnConv2DRuntimeAlgorithms
):
ndim
=
3
cpu_conv_class
=
theano
.
tensor
.
nnet
.
corr3d
.
Corr3dMM
runtime_shapes
=
[
(
3
,
[(
2
,
3
,
5
,
10
,
9
),
(
5
,
3
,
4
,
7
,
7
)]),
(
1
,
[(
1
,
1
,
5
,
100
,
200
),
(
1
,
1
,
4
,
50
,
200
)]),
(
1
,
[(
4
,
2
,
20
,
20
,
20
),
(
2
,
2
,
20
,
19
,
18
)]),
(
3
,
[(
2
,
3
,
5
,
10
,
9
),
(
5
,
3
,
4
,
7
,
7
)]),
# cache should be used
(
1
,
[(
2
,
2
,
50
,
50
,
5
),
(
5
,
2
,
25
,
31
,
4
)]),
(
1
,
[(
1
,
1
,
5
,
100
,
200
),
(
1
,
1
,
4
,
50
,
200
)]),
# cache should be used
(
1
,
[(
4
,
2
,
20
,
20
,
20
),
(
2
,
2
,
20
,
19
,
18
)]),
# cache should be used
(
1
,
[(
1
,
2
,
3
,
4
,
5
),
(
6
,
2
,
3
,
2
,
1
)])
]
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论