Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
b998dc61
提交
b998dc61
authored
8月 21, 2017
作者:
Frédéric Bastien
提交者:
GitHub
8月 21, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #6302 from borisfom/tensor_op
Tensor op, cache
上级
c470bd38
e51b6a22
隐藏空白字符变更
内嵌
并排
正在显示
8 个修改的文件
包含
1103 行增加
和
541 行删除
+1103
-541
conv_desc.c
theano/gpuarray/c_code/conv_desc.c
+15
-0
cudnn_helper.h
theano/gpuarray/c_code/cudnn_helper.h
+9
-1
dnn_conv_base.c
theano/gpuarray/c_code/dnn_conv_base.c
+255
-1
dnn_fwd.c
theano/gpuarray/c_code/dnn_fwd.c
+225
-168
dnn_gi.c
theano/gpuarray/c_code/dnn_gi.c
+225
-199
dnn_gw.c
theano/gpuarray/c_code/dnn_gw.c
+169
-140
dnn.py
theano/gpuarray/dnn.py
+38
-26
test_dnn.py
theano/gpuarray/tests/test_dnn.py
+167
-6
没有找到文件。
theano/gpuarray/c_code/conv_desc.c
浏览文件 @
b998dc61
#section support_code_apply
#section support_code_apply
static
int
c_set_groups_for_conv
(
cudnnConvolutionDescriptor_t
desc
,
int
groups
)
{
#if CUDNN_MAJOR >= 7
cudnnStatus_t
err
=
cudnnSetConvolutionGroupCount
(
desc
,
groups
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error setting groups for convolution : %s"
,
cudnnGetErrorString
(
err
));
return
-
1
;
}
#endif
return
0
;
}
int
APPLY_SPECIFIC
(
conv_desc
)(
PyArrayObject
*
filt_shp
,
int
APPLY_SPECIFIC
(
conv_desc
)(
PyArrayObject
*
filt_shp
,
cudnnConvolutionDescriptor_t
*
desc
,
cudnnConvolutionDescriptor_t
*
desc
,
PARAMS_TYPE
*
params
)
{
PARAMS_TYPE
*
params
)
{
...
@@ -43,5 +56,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
...
@@ -43,5 +56,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
"descriptor: %s"
,
cudnnGetErrorString
(
err
));
"descriptor: %s"
,
cudnnGetErrorString
(
err
));
return
-
1
;
return
-
1
;
}
}
if
(
c_set_groups_for_conv
(
*
desc
,
params
->
num_groups
)
==
-
1
)
return
-
1
;
return
0
;
return
0
;
}
}
theano/gpuarray/c_code/cudnn_helper.h
浏览文件 @
b998dc61
...
@@ -11,6 +11,14 @@ static inline int cudnnGetVersion() {
...
@@ -11,6 +11,14 @@ static inline int cudnnGetVersion() {
}
}
#endif
#endif
#if CUDNN_MAJOR < 7
enum
cudnnMathType_t
{
CUDNN_DEFAULT_MATH
=
0
,
CUDNN_TENSOR_OP_MATH
=
1
};
#endif
/* a common struct for all 3 CUDNN enums */
struct
AlgoRec
{
int
algo
;
size_t
wsSize
;
cudnnMathType_t
mathType
;
};
#endif
#endif
theano/gpuarray/c_code/dnn_conv_base.c
浏览文件 @
b998dc61
...
@@ -3,6 +3,43 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
...
@@ -3,6 +3,43 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t
APPLY_SPECIFIC
(
output
);
cudnnTensorDescriptor_t
APPLY_SPECIFIC
(
output
);
cudnnFilterDescriptor_t
APPLY_SPECIFIC
(
kerns
);
cudnnFilterDescriptor_t
APPLY_SPECIFIC
(
kerns
);
static
int
c_get_groups_for_conv
(
cudnnConvolutionDescriptor_t
desc
,
int
groups
)
{
#if CUDNN_MAJOR >= 7
int
desc_groups
;
if
(
groups
>
1
)
{
cudnnStatus_t
err
=
cudnnGetConvolutionGroupCount
(
desc
,
&
desc_groups
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting groups for convolution : %s"
,
cudnnGetErrorString
(
err
));
return
-
1
;
}
if
(
groups
!=
desc_groups
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"groups specified different from convolution descriptor"
);
return
-
1
;
}
}
return
1
;
#else
return
groups
;
#endif
}
static
int
c_set_math_type_for_conv
(
cudnnConvolutionDescriptor_t
desc
,
cudnnMathType_t
mathtype
)
{
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
cudnnStatus_t
err
=
cudnnSetConvolutionMathType
(
desc
,
mathtype
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error setting math type for convolution : %s"
,
cudnnGetErrorString
(
err
));
return
-
1
;
}
#endif
return
0
;
}
#section init_code_struct
#section init_code_struct
cudnnStatus_t
APPLY_SPECIFIC
(
err
);
cudnnStatus_t
APPLY_SPECIFIC
(
err
);
...
@@ -20,7 +57,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
...
@@ -20,7 +57,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
FAIL
;
FAIL
;
}
}
if
((
APPLY_SPECIFIC
(
err
)
=
cudnnCreateFilterDescriptor
(
&
APPLY_SPECIFIC
(
kerns
)))
!=
CUDNN_STATUS_SUCCESS
)
{
if
((
APPLY_SPECIFIC
(
err
)
=
cudnnCreateFilterDescriptor
(
&
APPLY_SPECIFIC
(
kerns
)))
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_MemoryError
,
"could not allocate filter descriptor: %s"
,
PyErr_Format
(
PyExc_MemoryError
,
"could not allocate filter descriptor: %s"
,
cudnnGetErrorString
(
APPLY_SPECIFIC
(
err
)));
cudnnGetErrorString
(
APPLY_SPECIFIC
(
err
)));
FAIL
;
FAIL
;
}
}
...
@@ -33,3 +70,220 @@ if (APPLY_SPECIFIC(output) != NULL)
...
@@ -33,3 +70,220 @@ if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor
(
APPLY_SPECIFIC
(
output
));
cudnnDestroyTensorDescriptor
(
APPLY_SPECIFIC
(
output
));
if
(
APPLY_SPECIFIC
(
kerns
)
!=
NULL
)
if
(
APPLY_SPECIFIC
(
kerns
)
!=
NULL
)
cudnnDestroyFilterDescriptor
(
APPLY_SPECIFIC
(
kerns
));
cudnnDestroyFilterDescriptor
(
APPLY_SPECIFIC
(
kerns
));
#section support_code
#include <sstream>
#include <string>
#if __cplusplus < 201103L
#include <tr1/unordered_map>
typedef
std
::
tr1
::
unordered_map
<
std
::
string
,
AlgoRec
>
AlgoCache
;
#else
#include <unordered_map>
typedef
std
::
unordered_map
<
std
::
string
,
AlgoRec
>
AlgoCache
;
#endif
#include "pthread.h"
#line 87 "dnn_conv_base.c"
pthread_mutex_t
algoMutex
;
AlgoCache
algoCache
;
static
cudnnStatus_t
checkCudnnStatus
(
cudnnStatus_t
err
,
const
char
*
msg
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"CUDNN Error: %s: %s"
,
msg
,
cudnnGetErrorString
(
err
));
}
return
err
;
}
static
size_t
c_get_largest_free_block_size
(
PyGpuContextObject
*
c
)
{
size_t
maxfree
=
0
;
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
maxfree
);
if
(
err2
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
"memory information on the GPU"
);
}
// Guess 4Mb if the info is not available
if
(
maxfree
==
0
)
maxfree
=
4
*
1024
*
1024
;
return
maxfree
;
}
/** Check if convolution output tensor has expected dimensions
depending on given inputs and number of groups.
return 0 if everything is ok, non-0 on error.
**/
static
int
dnn_check_convolution_output
(
cudnnConvolutionDescriptor_t
convDesc
,
cudnnTensorDescriptor_t
inputDesc
,
cudnnFilterDescriptor_t
filterDesc
,
size_t
tensorNdim
,
PyGpuArrayObject
*
output
,
int
groups
)
{
int
expected_output_dims
[
5
]
=
{
0
};
cudnnStatus_t
err
=
cudnnGetConvolutionNdForwardOutputDim
(
convDesc
,
inputDesc
,
filterDesc
,
tensorNdim
,
expected_output_dims
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error computing convolution output dim: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
if
(
tensorNdim
==
4
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
/
groups
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %dx%dx%dx%d"
" but received %ldx%ldx%ldx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
]
*
groups
,
expected_output_dims
[
2
],
expected_output_dims
[
3
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
]);
return
1
;
}
}
else
if
(
tensorNdim
==
5
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
/
groups
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
])
||
(
PyGpuArray_DIMS
(
output
)[
4
]
!=
expected_output_dims
[
4
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %dx%dx%dx%dx%d"
" but received %ldx%ldx%ldx%ldx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
]
*
groups
,
expected_output_dims
[
2
],
expected_output_dims
[
3
],
expected_output_dims
[
4
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
],
PyGpuArray_DIMS
(
output
)[
4
]);
return
1
;
}
}
return
0
;
}
static
std
::
string
shape
(
int
*
res
,
int
size
)
{
std
::
ostringstream
s
;
if
(
size
>
0
)
{
s
<<
res
[
0
];
for
(
int
i
=
1
;
i
<
size
;
++
i
)
s
<<
','
<<
res
[
i
];
}
return
s
.
str
();
}
static
std
::
string
shape
(
cudnnTensorDescriptor_t
t
)
{
// cuDNN can handle up to CUDNN_DIM_MAX dimensions.
int
res
[
CUDNN_DIM_MAX
];
int
stride
[
CUDNN_DIM_MAX
];
int
nbDims
;
cudnnDataType_t
type
;
checkCudnnStatus
(
cudnnGetTensorNdDescriptor
(
t
,
CUDNN_DIM_MAX
,
&
type
,
&
nbDims
,
res
,
stride
),
"error getting tensor description"
);
if
(
PyErr_Occurred
())
return
""
;
return
shape
(
res
,
nbDims
)
+
","
+
shape
(
stride
,
nbDims
);
};
static
std
::
string
shape
(
cudnnFilterDescriptor_t
t
,
cudnnDataType_t
*
type
)
{
cudnnTensorFormat_t
format
;
int
res
[
CUDNN_DIM_MAX
];
int
outDims
;
checkCudnnStatus
(
cudnnGetFilterNdDescriptor
(
t
,
CUDNN_DIM_MAX
,
type
,
&
format
,
&
outDims
,
res
),
"error getting filter description"
);
if
(
PyErr_Occurred
())
return
""
;
return
shape
(
res
,
outDims
);
};
static
std
::
string
shape
(
cudnnConvolutionDescriptor_t
convDesc
)
{
int
nDim
;
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
computeType
;
int
padA
[
5
];
int
strideA
[
5
];
int
dilationA
[
5
];
checkCudnnStatus
(
cudnnGetConvolutionNdDescriptor
(
convDesc
,
5
,
&
nDim
,
&
padA
[
0
],
&
strideA
[
0
],
&
dilationA
[
0
],
&
mode
,
&
computeType
),
"error getting convolution description"
);
if
(
PyErr_Occurred
())
return
""
;
return
(
std
::
string
(
"-mode "
)
+
((
mode
==
CUDNN_CONVOLUTION
)
?
"conv"
:
"cross"
)
+
" -pad "
+
shape
(
padA
,
nDim
)
+
" -subsample "
+
shape
(
strideA
,
nDim
)
+
" -dilation "
+
shape
(
dilationA
,
nDim
));
}
static
bool
all_aligned
(
cudnnDataType_t
type
,
void
*
in
,
void
*
out
,
void
*
filter
)
{
size_t
alignMask
=
(
type
==
CUDNN_DATA_HALF
)
?
0x7F
:
0xFF
;
// there have to be entries for both aligned and not
if
(((
size_t
)
in
|
(
size_t
)
out
|
(
size_t
)
filter
)
&
alignMask
)
{
return
false
;
}
return
true
;
}
static
std
::
string
dnn_conv_shape
(
cudnnTensorDescriptor_t
inputDesc
,
PyGpuArrayObject
*
input
,
cudnnFilterDescriptor_t
filterDesc
,
PyGpuArrayObject
*
filter
,
cudnnConvolutionDescriptor_t
convDesc
,
PyGpuArrayObject
*
output
,
int
groups
)
{
cudnnDataType_t
dType
;
std
::
ostringstream
s
;
int
expected_output_dims
[
5
]
=
{
0
};
if
(
dnn_check_convolution_output
(
convDesc
,
inputDesc
,
filterDesc
,
PyGpuArray_NDIM
(
filter
),
output
,
groups
)
!=
0
)
return
""
;
std
::
string
shapeInput
=
shape
(
inputDesc
);
std
::
string
shapeFilter
=
shape
(
filterDesc
,
&
dType
);
std
::
string
shapeConvDesc
=
shape
(
convDesc
);
if
(
shapeInput
.
empty
()
||
shapeFilter
.
empty
()
||
shapeConvDesc
.
empty
())
return
""
;
s
<<
"-g "
<<
groups
<<
" -dim "
<<
shapeInput
<<
" -filt "
<<
shapeFilter
<<
" "
<<
shapeConvDesc
;
// there have to be entries for both aligned and not.
if
(
!
all_aligned
(
dType
,
PyGpuArray_DEV_DATA
(
input
),
PyGpuArray_DEV_DATA
(
output
),
PyGpuArray_DEV_DATA
(
filter
)))
{
s
<<
" [unaligned]"
;
}
return
s
.
str
();
}
static
void
dnn_conv_update_cache
(
const
std
::
string
&
hash
,
const
AlgoRec
&
rec
)
{
pthread_mutex_lock
(
&
algoMutex
);
algoCache
[
hash
]
=
rec
;
pthread_mutex_unlock
(
&
algoMutex
);
}
static
const
AlgoRec
*
dnn_conv_check_cache
(
const
std
::
string
&
hash
)
{
pthread_mutex_lock
(
&
algoMutex
);
const
AlgoRec
*
ret
=
0
;
AlgoCache
::
iterator
hit
=
algoCache
.
find
(
hash
);
if
(
hit
!=
algoCache
.
end
())
ret
=
&
hit
->
second
;
pthread_mutex_unlock
(
&
algoMutex
);
return
ret
;
}
theano/gpuarray/c_code/dnn_fwd.c
浏览文件 @
b998dc61
#section init_code_struct
#section init_code_struct
prev_algo
.
algo
=
PARAMS
->
conv_algo
;
prev_algo
.
mathType
=
CUDNN_DEFAULT_MATH
;
reuse_algo
=
0
;
reuse_algo
=
0
;
prev_algo
=
PARAMS
->
conv_algo
;
hash_prefix
=
std
::
string
(
"FWD|GPU#"
);
memset
(
prev_img_dims
,
0
,
sizeof
(
prev_img_dims
));
memset
(
prev_kern_dims
,
0
,
sizeof
(
prev_kern_dims
));
#section support_code_struct
#section support_code_struct
#line 9 "dnn_fwd.c"
int
reuse_algo
;
AlgoRec
prev_algo
;
std
::
string
hash_prefix
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
/** Check given algorithm against inputs and convolution descriptor,
change algorithm inplace to a fallback algorithm if checkings fail.
Return 0 on success, non-0 on error. **/
int
dnn_conv_fwd_fallback
(
cudnnConvolutionFwdAlgo_t
*
_algo
,
const
PyGpuArrayObject
*
input
,
const
PyGpuArrayObject
*
kerns
,
cudnnConvolutionDescriptor_t
desc
)
{
cudnnConvolutionFwdAlgo_t
algo
=
*
_algo
;
int
reuse_algo
;
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
cudnnConvolutionFwdAlgo_t
prev_algo
;
if
(
PyGpuArray_NDIM
(
input
)
==
5
&&
size_t
prev_img_dims
[
5
];
!
(
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
||
size_t
prev_kern_dims
[
5
];
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
||
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
))
{
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
return
1
;
fprintf
(
stderr
,
"(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
,
algorithm_name
);
#endif
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
// Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
// Issue should be resolved for cuDNN > V6.0.
// NB: In cuDNN V7, issue is resolved for 2D convolutionss only.
if
((
cudnnGetVersion
()
<
6100
||
PyGpuArray_NDIM
(
input
)
==
5
)
&&
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
&&
PyGpuArray_DIM
(
input
,
0
)
>
65536
)
{
#ifdef DEBUG
fprintf
(
stderr
,
"(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
"will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
);
#endif
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
/* NB:
TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
New conditions apply and may depend on number of dimensions (2D or 3D)
e.g. for FFT_TILING.
TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
*/
if
((
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT
||
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
)
&&
PyGpuArray_NDIM
(
input
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
dilation
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
cudnnStatus_t
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
dilation
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting convolution properties: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
if
(
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT
)
{
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
#ifdef DEBUG
fprintf
(
stderr
,
"(replacing fwd algo fft with none)
\n
"
);
#endif
}
}
else
{
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
)
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
#ifdef DEBUG
fprintf
(
stderr
,
"(replacing fwd algo fft_tiling with none)
\n
"
);
#endif
}
}
}
*
_algo
=
algo
;
return
0
;
}
int
int
APPLY_SPECIFIC
(
conv_fwd
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
kerns
,
APPLY_SPECIFIC
(
conv_fwd
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
kerns
,
...
@@ -24,6 +117,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -24,6 +117,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
void
*
beta_p
;
void
*
beta_p
;
float
af
=
alpha
,
bf
=
beta
;
float
af
=
alpha
,
bf
=
beta
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
bool
use_cached
=
0
;
if
(
PyGpuArray_DIMS
(
input
)[
1
]
!=
PyGpuArray_DIMS
(
kerns
)[
1
]
*
params
->
num_groups
)
{
if
(
PyGpuArray_DIMS
(
input
)[
1
]
!=
PyGpuArray_DIMS
(
kerns
)[
1
]
*
params
->
num_groups
)
{
PyErr_SetString
(
PyExc_ValueError
,
PyErr_SetString
(
PyExc_ValueError
,
...
@@ -73,65 +167,76 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -73,65 +167,76 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return
0
;
return
0
;
}
}
if
(
c_set_tensor_for_conv
(
input
,
APPLY_SPECIFIC
(
input
),
params
->
num_groups
)
==
-
1
)
int
groups
=
c_get_groups_for_conv
(
desc
,
params
->
num_groups
);
if
(
groups
==
-
1
)
return
1
;
if
(
c_set_tensor_for_conv
(
input
,
APPLY_SPECIFIC
(
input
),
groups
)
==
-
1
)
return
1
;
return
1
;
if
(
c_set_filter
(
kerns
,
APPLY_SPECIFIC
(
kerns
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_filter
(
kerns
,
APPLY_SPECIFIC
(
kerns
),
groups
)
==
-
1
)
return
1
;
return
1
;
if
(
c_set_tensor_for_conv
(
*
output
,
APPLY_SPECIFIC
(
output
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_tensor_for_conv
(
*
output
,
APPLY_SPECIFIC
(
output
),
groups
)
==
-
1
)
return
1
;
return
1
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
input
,
0
)
/
params
->
num_
groups
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
input
,
0
)
/
groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
kerns
,
0
)
*
PyGpuArray_DIM
(
kerns
,
0
)
/
params
->
num_
groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
kerns
,
0
)
*
PyGpuArray_DIM
(
kerns
,
0
)
/
groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
*
output
,
0
)
/
params
->
num_
groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
*
output
,
0
)
/
groups
;
cudnnConvolutionFwdAlgo_t
algo
=
params
->
conv_algo
;
cudnnConvolutionFwdAlgo_t
algo
=
params
->
conv_algo
;
#ifdef DEBUG
size_t
worksize
=
0
;
char
algorithm_name
[
128
];
cudnnMathType_t
mathtype
=
CUDNN_DEFAULT_MATH
;
#endif
std
::
string
hashkey
;
cuda_enter
(
c
->
ctx
);
cuda_enter
(
c
->
ctx
);
size_t
maxfree
=
c_get_largest_free_block_size
(
c
);
if
(
PyErr_Occurred
())
return
1
;
if
(
params
->
choose_algo
)
{
if
(
params
->
choose_algo
)
{
if
(
!
params
->
choose_once
)
{
reuse_algo
=
1
;
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
input
,
i
)
==
prev_img_dims
[
i
]);
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
kerns
,
i
)
==
prev_kern_dims
[
i
]);
}
}
if
(
!
reuse_algo
)
{
if
(
!
reuse_algo
)
{
size_t
free
;
char
pci_id
[
16
];
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_PCIBUSID
,
pci_id
);
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
);
// check out cache
if
(
err2
!=
GA_NO_ERROR
)
{
hashkey
=
dnn_conv_shape
(
APPLY_SPECIFIC
(
input
),
input
,
APPLY_SPECIFIC
(
kerns
),
kerns
,
desc
,
*
output
,
groups
);
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
if
(
hashkey
.
empty
())
{
"memory information on the GPU"
);
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
}
hashkey
=
hash_prefix
+
pci_id
+
(
params
->
choose_time
?
" -t "
:
" "
)
+
hashkey
;
const
AlgoRec
*
cached
=
dnn_conv_check_cache
(
hashkey
);
if
(
cached
)
{
prev_algo
=
*
cached
;
use_cached
=
1
;
}
}
// Guess 4Mb if the info is not available
if
(
reuse_algo
||
use_cached
)
{
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
algo
=
(
cudnnConvolutionFwdAlgo_t
)
prev_algo
.
algo
;
worksize
=
prev_algo
.
wsSize
;
mathtype
=
prev_algo
.
mathType
;
}
else
{
if
(
params
->
choose_time
)
{
if
(
params
->
choose_time
)
{
int
count
;
int
count
;
cudnnConvolutionFwdAlgoPerf_t
choice
;
cudnnConvolutionFwdAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
gpudata
*
tmpmem
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
max
free
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate GPU memory for FindEx"
);
cuda_exit
(
c
->
ctx
);
return
-
1
;
return
-
1
;
}
}
// set the 'tensor math ok' flag
c_set_math_type_for_conv
(
desc
,
CUDNN_TENSOR_OP_MATH
);
// We don't sync the buffer as we don't care about the values.
// We don't sync the buffer as we don't care about the values.
err
=
cudnnFindConvolutionForwardAlgorithmEx
(
err
=
cudnnFindConvolutionForwardAlgorithmEx
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
params
->
handle
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
desc
,
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
*
output
),
desc
,
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
*
output
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
max
free
);
gpudata_release
(
tmpmem
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
...
@@ -141,138 +246,56 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -141,138 +246,56 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
}
algo
=
choice
.
algo
;
#ifdef DEBUG
#ifdef DEBUG
if
(
count
==
0
)
{
if
(
count
==
0
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv fwd algorithm found"
);
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv fwd algorithm found"
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"error getting best-timed FWD algo: %s"
,
"error getting best-timed FWD algo: %s"
,
cudnnGetErrorString
(
choice
.
status
));
cudnnGetErrorString
(
choice
.
status
));
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
// Else, count is necessarly 1 for current implementation.
}
// Else, count is necessarly 1 for current implementation.
#endif
#endif
algo
=
choice
.
algo
;
prev_algo
.
algo
=
(
int
)
algo
;
prev_algo
.
wsSize
=
worksize
=
choice
.
memory
;
#if CUDNN_MAJOR >= 7
prev_algo
.
mathType
=
mathtype
=
choice
.
mathType
;
#endif
}
else
{
}
else
{
err
=
cudnnGetConvolutionForwardAlgorithm
(
err
=
cudnnGetConvolutionForwardAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
desc
,
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
output
),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
maxfree
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
}
}
prev_algo
.
algo
=
algo
;
prev_algo
=
algo
;
// no tensor_op returned from Get()
}
else
{
prev_algo
.
mathType
=
mathtype
=
CUDNN_DEFAULT_MATH
;
algo
=
prev_algo
;
}
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
return
1
;
// NB: This is printed only when algorithm is chosen at runtime.
if
(
reuse_algo
)
fprintf
(
stderr
,
"(reused %s)
\n
"
,
algorithm_name
);
else
fprintf
(
stderr
,
"(using %s)
\n
"
,
algorithm_name
);
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
else
{
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
prev_img_dims
[
i
]
=
PyGpuArray_DIM
(
input
,
i
);
prev_kern_dims
[
i
]
=
PyGpuArray_DIM
(
kerns
,
i
);
}
}
}
}
}
}
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
if
(
c_set_math_type_for_conv
(
desc
,
mathtype
)
==
-
1
||
if
(
PyGpuArray_NDIM
(
input
)
==
5
&&
dnn_conv_fwd_fallback
(
&
algo
,
input
,
kerns
,
desc
)
!=
0
)
{
!
(
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
||
cuda_exit
(
c
->
ctx
);
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
||
return
1
;
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
))
{
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
return
1
;
fprintf
(
stderr
,
"(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
,
algorithm_name
);
#endif
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
// Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
// Issue should be resolved for cuDNN > V6.0.
// NB: In cuDNN V7, issue is resolved for 2D convolutionss only.
if
((
cudnnGetVersion
()
<
6100
||
PyGpuArray_NDIM
(
input
)
==
5
)
&&
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
&&
PyGpuArray_DIM
(
input
,
0
)
>
65536
)
{
#ifdef DEBUG
fprintf
(
stderr
,
"(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
"will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
);
#endif
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
/* NB:
TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
New conditions apply and may depend on number of dimensions (2D or 3D)
e.g. for FFT_TILING.
TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
*/
if
((
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT
||
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
)
&&
PyGpuArray_NDIM
(
input
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
dilation
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
dilation
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting convolution properties: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
if
(
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT
)
{
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
}
else
{
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
)
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
}
}
}
// if FindEx was used (choose_time), workspace size is set.
if
(
!
(
reuse_algo
||
use_cached
||
params
->
choose_time
))
{
{
size_t
worksize
;
gpudata
*
workspace
;
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
params
->
handle
,
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
kerns
),
...
@@ -280,19 +303,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -280,19 +303,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
APPLY_SPECIFIC
(
output
),
APPLY_SPECIFIC
(
output
),
algo
,
algo
,
&
worksize
);
&
worksize
);
if
(
err
==
CUDNN_STATUS_NOT_SUPPORTED
)
{
if
(
err
==
CUDNN_STATUS_NOT_SUPPORTED
)
{
// Fallback to none algo if not supported
// Fallback to none algo if not supported
#ifdef DEBUG
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
fprintf
(
stderr
,
"(%s error getting worksize: "
}
"fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
,
algorithm_name
);
fprintf
(
stderr
,
"(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
,
algorithm_name
);
#endif
#endif
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
params
->
handle
,
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
kerns
),
...
@@ -303,13 +324,47 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -303,13 +324,47 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
}
}
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
"error getting worksize: %s"
,
cudnnGetErrorString
(
err
));
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
}
}
if
(
params
->
choose_algo
&&
(
!
params
->
choose_once
||
!
reuse_algo
))
{
// algo may have changed due to fallback, we must update it.
prev_algo
.
algo
=
algo
;
// save worksize for next time/cache
prev_algo
.
wsSize
=
worksize
;
// Add to the cache if we choose on shape change, or first time if we choose once.
dnn_conv_update_cache
(
hashkey
,
prev_algo
);
}
#ifdef DEBUG
if
(
params
->
choose_algo
)
{
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
fprintf
(
stderr
,
"(using %s%s %s%s%s, ws:%ld, hash:%s)
\n
"
,
algorithm_name
,
mathtype
==
CUDNN_TENSOR_OP_MATH
?
"[T]"
:
""
,
params
->
choose_time
?
"(timed)"
:
""
,
reuse_algo
?
"(reused)"
:
""
,
use_cached
?
"(cache)"
:
""
,
worksize
,
hashkey
.
c_str
()
);
}
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
{
gpudata
*
workspace
=
0
;
/*
/*
* This is less than ideal since we need to free it after (which
* This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module
* introduces a synchronization point. But we don't have a module
...
@@ -318,8 +373,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -318,8 +373,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if
(
worksize
!=
0
)
{
if
(
worksize
!=
0
)
{
workspace
=
gpudata_alloc
(
c
->
ctx
,
worksize
,
NULL
,
0
,
NULL
);
workspace
=
gpudata_alloc
(
c
->
ctx
,
worksize
,
NULL
,
0
,
NULL
);
if
(
workspace
==
NULL
)
{
if
(
workspace
==
NULL
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
PyErr_SetString
(
PyExc_RuntimeError
,
"Could not allocate working memory"
);
"Could not allocate working memory"
);
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
}
...
@@ -329,16 +383,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -329,16 +383,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait
(
kerns
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_wait
(
kerns
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_wait
((
*
output
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
cuda_wait
((
*
output
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
for
(
int
g
=
0
;
g
<
params
->
num_
groups
;
g
++
)
{
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
err
=
cudnnConvolutionForward
(
err
=
cudnnConvolutionForward
(
params
->
handle
,
params
->
handle
,
alpha_p
,
alpha_p
,
APPLY_SPECIFIC
(
input
),
((
char
*
)
PyGpuArray_DEV_DATA
(
input
))
+
input_offset
*
g
,
APPLY_SPECIFIC
(
input
),
((
char
*
)
PyGpuArray_DEV_DATA
(
input
))
+
input_offset
*
g
,
APPLY_SPECIFIC
(
kerns
),
((
char
*
)
PyGpuArray_DEV_DATA
(
kerns
))
+
kern_offset
*
g
,
APPLY_SPECIFIC
(
kerns
),
((
char
*
)
PyGpuArray_DEV_DATA
(
kerns
))
+
kern_offset
*
g
,
desc
,
algo
,
desc
,
algo
,
worksize
==
0
?
NULL
:
*
(
void
**
)
workspace
,
worksize
,
worksize
==
0
?
NULL
:
*
(
void
**
)
workspace
,
worksize
,
beta_p
,
beta_p
,
APPLY_SPECIFIC
(
output
),
((
char
*
)
PyGpuArray_DEV_DATA
(
*
output
))
+
output_offset
*
g
);
APPLY_SPECIFIC
(
output
),
((
char
*
)
PyGpuArray_DEV_DATA
(
*
output
))
+
output_offset
*
g
);
}
}
if
(
worksize
!=
0
)
if
(
worksize
!=
0
)
...
@@ -348,12 +402,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -348,12 +402,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_record
(
kerns
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_record
(
kerns
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_record
((
*
output
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
cuda_record
((
*
output
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
}
}
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error doing operation: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error doing
cuDNN conv FWD
operation: %s"
,
cudnnGetErrorString
(
err
));
cudnnGetErrorString
(
err
));
return
1
;
return
1
;
}
}
return
0
;
return
0
;
}
}
theano/gpuarray/c_code/dnn_gi.c
浏览文件 @
b998dc61
#section init_code_struct
#section init_code_struct
prev_algo
.
algo
=
PARAMS
->
conv_algo
;
prev_algo
.
mathType
=
CUDNN_DEFAULT_MATH
;
reuse_algo
=
0
;
reuse_algo
=
0
;
prev_algo
=
PARAMS
->
conv_algo
;
hash_prefix
=
std
::
string
(
"GI|GPU#"
);
memset
(
prev_kern_dims
,
0
,
sizeof
(
prev_kern_dims
));
memset
(
prev_top_dims
,
0
,
sizeof
(
prev_top_dims
));
#section support_code_struct
#section support_code_struct
#line 9 "dnn_gi.c"
int
reuse_algo
;
AlgoRec
prev_algo
;
std
::
string
hash_prefix
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
/** Check given algorithm against inputs and convolution descriptor,
change algorithm inplace to a fallback algorithm if checkings fail.
Return 0 on success, non-0 on error. **/
int
dnn_conv_gi_fallback
(
cudnnConvolutionBwdDataAlgo_t
*
_algo
,
const
PyGpuArrayObject
*
input
,
const
PyGpuArrayObject
*
kerns
,
cudnnConvolutionDescriptor_t
desc
)
{
cudnnConvolutionBwdDataAlgo_t
algo
=
*
_algo
;
int
reuse_algo
;
// The FFT implementation does not support strides, 1x1 filters or inputs
cudnnConvolutionBwdDataAlgo_t
prev_algo
;
// with a spatial dimension larger than 1024. The tiled-FFT implementation
size_t
prev_kern_dims
[
5
];
// does not support strides.
size_t
prev_top_dims
[
5
];
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if
((
algo
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
||
algo
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT
)
&&
PyGpuArray_NDIM
(
kerns
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
upscale
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
cudnnStatus_t
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
upscale
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting convolution properties: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
if
(
algo
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT
)
{
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
;
#ifdef DEBUG
fprintf
(
stderr
,
"(replacing gradinput algo fft with none)
\n
"
);
#endif
}
}
else
{
// algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
)
{
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
;
#ifdef DEBUG
fprintf
(
stderr
,
"(replacing gradinput algo fft_tiling with none)
\n
"
);
#endif
}
}
}
*
_algo
=
algo
;
return
0
;
}
int
int
APPLY_SPECIFIC
(
conv_gi
)(
PyGpuArrayObject
*
kerns
,
PyGpuArrayObject
*
output
,
APPLY_SPECIFIC
(
conv_gi
)(
PyGpuArrayObject
*
kerns
,
PyGpuArrayObject
*
output
,
...
@@ -23,6 +82,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
...
@@ -23,6 +82,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
void
*
beta_p
;
void
*
beta_p
;
float
af
=
alpha
,
bf
=
beta
;
float
af
=
alpha
,
bf
=
beta
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
bool
use_cached
=
0
;
if
(
PyGpuArray_DIMS
(
im
)[
1
]
!=
PyGpuArray_DIMS
(
kerns
)[
1
]
*
params
->
num_groups
)
{
if
(
PyGpuArray_DIMS
(
im
)[
1
]
!=
PyGpuArray_DIMS
(
kerns
)[
1
]
*
params
->
num_groups
)
{
PyErr_SetString
(
PyExc_ValueError
,
"images and kernel must have the same "
PyErr_SetString
(
PyExc_ValueError
,
"images and kernel must have the same "
...
@@ -72,233 +132,200 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
...
@@ -72,233 +132,200 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return
0
;
return
0
;
}
}
if
(
c_set_tensor_for_conv
(
output
,
APPLY_SPECIFIC
(
output
),
params
->
num_groups
)
==
-
1
)
int
groups
=
c_get_groups_for_conv
(
desc
,
params
->
num_groups
);
if
(
groups
==
-
1
)
return
1
;
return
1
;
if
(
c_set_
filter
(
kerns
,
APPLY_SPECIFIC
(
kerns
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_
tensor_for_conv
(
output
,
APPLY_SPECIFIC
(
output
),
groups
)
==
-
1
)
return
1
;
return
1
;
if
(
c_set_
tensor_for_conv
(
*
input
,
APPLY_SPECIFIC
(
input
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_
filter
(
kerns
,
APPLY_SPECIFIC
(
kerns
),
groups
)
==
-
1
)
return
1
;
return
1
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
*
input
,
0
)
/
params
->
num_groups
;
if
(
c_set_tensor_for_conv
(
*
input
,
APPLY_SPECIFIC
(
input
),
groups
)
==
-
1
)
size_t
kern_offset
=
PyGpuArray_STRIDE
(
kerns
,
0
)
*
PyGpuArray_DIM
(
kerns
,
0
)
/
params
->
num_groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
output
,
0
)
/
params
->
num_groups
;
cudnnConvolutionBwdDataAlgo_t
algo
=
params
->
conv_algo
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
cuda_enter
(
c
->
ctx
);
int
expected_output_dims
[
5
]
=
{
0
};
err
=
cudnnGetConvolutionNdForwardOutputDim
(
desc
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_NDIM
(
im
),
expected_output_dims
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error computing convolution output dim: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
if
(
PyGpuArray_NDIM
(
im
)
==
4
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
/
params
->
num_groups
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
],
expected_output_dims
[
2
],
expected_output_dims
[
3
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
]);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
else
if
(
PyGpuArray_NDIM
(
im
)
==
5
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
])
||
(
PyGpuArray_DIMS
(
output
)[
4
]
!=
expected_output_dims
[
4
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
],
expected_output_dims
[
2
],
expected_output_dims
[
3
],
expected_output_dims
[
4
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
],
PyGpuArray_DIMS
(
output
)[
4
]);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
if
(
params
->
choose_algo
)
{
if
(
0
!=
dnn_check_convolution_output
(
desc
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
if
(
!
params
->
choose_once
)
{
PyGpuArray_NDIM
(
kerns
),
output
,
groups
))
reuse_algo
=
1
;
return
1
;
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
kerns
);
i
++
)
{
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
kerns
,
i
)
==
prev_kern_dims
[
i
]);
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
output
,
i
)
==
prev_top_dims
[
i
]);
}
}
if
(
!
reuse_algo
)
{
size_t
input_offset
=
PyGpuArray_STRIDE
(
*
input
,
0
)
/
groups
;
size_t
free
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
kerns
,
0
)
*
PyGpuArray_DIM
(
kerns
,
0
)
/
groups
;
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
)
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
output
,
0
)
/
groups
;
if
(
err2
!=
GA_NO_ERROR
)
{
cudnnConvolutionBwdDataAlgo_t
algo
=
params
->
conv_algo
;
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
size_t
worksize
=
0
;
"memory information on the GPU"
);
cudnnMathType_t
mathtype
=
CUDNN_DEFAULT_MATH
;
cuda_exit
(
c
->
ctx
);
return
1
;
}
// Guess 4Mb if the info is not available
std
::
string
hashkey
;
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
if
(
params
->
choose_time
)
{
size_t
maxfree
=
c_get_largest_free_block_size
(
c
);
int
count
;
if
(
PyErr_Occurred
())
return
1
;
cudnnConvolutionBwdDataAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
cuda_enter
(
c
->
ctx
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
return
-
1
;
}
err
=
cudnnFindConvolutionBackwardDataAlgorithmEx
(
if
(
params
->
choose_algo
)
{
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
*
input
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
!
reuse_algo
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
char
pci_id
[
16
];
cudnnGetErrorString
(
err
));
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_PCIBUSID
,
pci_id
);
// check out cache
hashkey
=
dnn_conv_shape
(
APPLY_SPECIFIC
(
input
),
*
input
,
APPLY_SPECIFIC
(
kerns
),
kerns
,
desc
,
output
,
groups
);
if
(
hashkey
.
empty
())
{
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
}
hashkey
=
hash_prefix
+
pci_id
+
(
params
->
choose_time
?
" -t "
:
" "
)
+
hashkey
;
const
AlgoRec
*
cached
=
dnn_conv_check_cache
(
hashkey
);
if
(
cached
)
{
prev_algo
=
*
cached
;
use_cached
=
1
;
}
}
algo
=
choice
.
algo
;
if
(
reuse_algo
||
use_cached
)
{
algo
=
(
cudnnConvolutionBwdDataAlgo_t
)
prev_algo
.
algo
;
#ifdef DEBUG
worksize
=
prev_algo
.
wsSize
;
if
(
count
==
0
)
{
mathtype
=
prev_algo
.
mathType
;
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv gradinput algorithm found"
);
}
else
{
if
(
params
->
choose_time
)
{
int
count
;
cudnnConvolutionBwdDataAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
// set the 'tensor math ok' flag
c_set_math_type_for_conv
(
desc
,
CUDNN_TENSOR_OP_MATH
);
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
maxfree
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
cuda_exit
(
c
->
ctx
);
return
-
1
;
}
err
=
cudnnFindConvolutionBackwardDataAlgorithmEx
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
*
input
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
maxfree
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
}
PyErr_Format
(
PyExc_RuntimeError
,
"error getting best-timed gradinput algo: %s"
,
#ifdef DEBUG
cudnnGetErrorString
(
choice
.
status
));
if
(
count
==
0
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv gradinput algorithm found"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting best-timed gradinput algo: %s"
,
cudnnGetErrorString
(
choice
.
status
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
// Else, count is necessarly 1 for current implementation.
#endif
algo
=
choice
.
algo
;
prev_algo
.
algo
=
(
int
)
algo
;
prev_algo
.
wsSize
=
worksize
=
choice
.
memory
;
#if CUDNN_MAJOR >= 7
prev_algo
.
mathType
=
mathtype
=
choice
.
mathType
;
#endif
}
else
{
err
=
cudnnGetConvolutionBackwardDataAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
maxfree
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
// Else, count is necessarly 1 for current implementation.
}
#endif
prev_algo
.
algo
=
algo
;
// no tensor_op returned from Get()
}
else
{
prev_algo
.
mathType
=
mathtype
=
CUDNN_DEFAULT_MATH
;
err
=
cudnnGetConvolutionBackwardDataAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
}
}
prev_algo
=
algo
;
}
}
else
{
algo
=
prev_algo
;
if
(
c_set_math_type_for_conv
(
desc
,
mathtype
)
==
-
1
||
}
dnn_conv_gi_fallback
(
&
algo
,
*
input
,
kerns
,
desc
)
!=
0
)
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
#ifdef DEBUG
// if FindEx was used (choose_time), workspace size is set.
char
algorithm_name
[
128
];
if
(
!
(
reuse_algo
||
use_cached
||
params
->
choose_time
))
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t
(
algo
,
algorithm_name
))
{
err
=
cudnnGetConvolutionBackwardDataWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
algo
,
&
worksize
);
if
(
err
==
CUDNN_STATUS_NOT_SUPPORTED
)
{
// Fallback to none algo if not supported
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
// NB: This is printed only when algorithm is chosen at runtime.
if
(
reuse_algo
)
fprintf
(
stderr
,
"(reused %s)
\n
"
,
algorithm_name
);
else
fprintf
(
stderr
,
"(using %s)
\n
"
,
algorithm_name
);
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
else
{
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
kerns
);
i
++
)
{
prev_kern_dims
[
i
]
=
PyGpuArray_DIM
(
kerns
,
i
);
prev_top_dims
[
i
]
=
PyGpuArray_DIM
(
output
,
i
);
}
}
fprintf
(
stderr
,
"(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_BWD_DATA_ALGO_0)
\n
"
,
algorithm_name
);
#endif
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
;
err
=
cudnnGetConvolutionBackwardDataWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
algo
,
&
worksize
);
}
}
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if
((
algo
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
||
algo
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT
)
&&
PyGpuArray_NDIM
(
kerns
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
upscale
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
upscale
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
"error getting convolution properties: %s"
,
cudnnGetErrorString
(
err
));
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
}
}
// !(reuse_algo || use_cached || params->choose_time)
if
(
algo
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT
)
if
(
params
->
choose_algo
&&
(
!
params
->
choose_once
||
!
reuse_algo
))
{
{
// algo may have changed due to fallback, we must update it.
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
prev_algo
.
algo
=
algo
;
PyGpuArray_DIM
(
*
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
*
input
,
3
)
>
1024
||
// save worksize for next time/cache
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
prev_algo
.
wsSize
=
worksize
;
{
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
;
}
}
else
{
// algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
)
{
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
;
}
}
}
size_t
worksize
;
// Add to the cache
gpudata
*
workspace
;
dnn_conv_update_cache
(
hashkey
,
prev_algo
);
}
err
=
cudnnGetConvolutionBackwardDataWorkspaceSize
(
#ifdef DEBUG
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
if
(
params
->
choose_algo
)
{
APPLY_SPECIFIC
(
input
),
algo
,
&
worksize
);
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
// NB: This is printed only when algorithm is chosen at runtime.
fprintf
(
stderr
,
"(using %s %s%s%s%s, ws:%ld, hash:%s)
\n
"
,
algorithm_name
,
params
->
choose_time
?
"(timed)"
:
""
,
reuse_algo
?
"(reused)"
:
""
,
use_cached
?
"(cache)"
:
""
,
mathtype
==
CUDNN_TENSOR_OP_MATH
?
"(tensor op)"
:
""
,
worksize
,
hashkey
.
c_str
()
);
}
#endif
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
params
->
choose_once
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
reuse_algo
=
1
;
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
gpudata
*
workspace
=
0
;
if
(
worksize
!=
0
)
{
if
(
worksize
!=
0
)
{
workspace
=
gpudata_alloc
(
c
->
ctx
,
worksize
,
NULL
,
0
,
NULL
);
workspace
=
gpudata_alloc
(
c
->
ctx
,
worksize
,
NULL
,
0
,
NULL
);
if
(
workspace
==
NULL
)
{
if
(
workspace
==
NULL
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
PyErr_SetString
(
PyExc_RuntimeError
,
"Could not allocate working memory"
);
"Could not allocate working memory"
);
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
}
...
@@ -308,8 +335,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
...
@@ -308,8 +335,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait
(
output
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_wait
(
output
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_wait
((
*
input
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
cuda_wait
((
*
input
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
for
(
int
g
=
0
;
g
<
params
->
num_groups
;
g
++
)
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
{
err
=
cudnnConvolutionBackwardData
(
err
=
cudnnConvolutionBackwardData
(
params
->
handle
,
params
->
handle
,
alpha_p
,
alpha_p
,
...
@@ -330,7 +356,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
...
@@ -330,7 +356,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error doing operation: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error doing
cuDNN conv gradinput
operation: %s"
,
cudnnGetErrorString
(
err
));
cudnnGetErrorString
(
err
));
return
1
;
return
1
;
}
}
...
...
theano/gpuarray/c_code/dnn_gw.c
浏览文件 @
b998dc61
#section init_code_struct
#section init_code_struct
prev_algo
.
algo
=
PARAMS
->
conv_algo
;
prev_algo
.
mathType
=
CUDNN_DEFAULT_MATH
;
reuse_algo
=
0
;
reuse_algo
=
0
;
prev_algo
=
PARAMS
->
conv_algo
;
hash_prefix
=
std
::
string
(
"GW|GPU#"
);
memset
(
prev_img_dims
,
0
,
sizeof
(
prev_img_dims
));
memset
(
prev_top_dims
,
0
,
sizeof
(
prev_top_dims
));
#section support_code_struct
#section support_code_struct
#line 9 "dnn_gw.c"
int
reuse_algo
;
AlgoRec
prev_algo
;
std
::
string
hash_prefix
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
/** Check given algorithm against inputs and convolution descriptor,
change algorithm inplace to a fallback algorithm if checkings fail.
Return 0 on success, non-0 on error. **/
int
dnn_conv_gw_fallback
(
cudnnConvolutionBwdFilterAlgo_t
*
_algo
,
const
PyGpuArrayObject
*
input
,
const
PyGpuArrayObject
*
kerns
,
cudnnConvolutionDescriptor_t
desc
)
{
cudnnConvolutionBwdFilterAlgo_t
algo
=
*
_algo
;
int
reuse_algo
;
// The FFT implementation does not support strides, 1x1 filters or inputs
cudnnConvolutionBwdFilterAlgo_t
prev_algo
;
// with a spatial dimension larger than 1024.
size_t
prev_img_dims
[
5
];
// If the chosen implementation is FFT, validate that it can
size_t
prev_top_dims
[
5
];
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if
(
algo
==
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT
&&
PyGpuArray_NDIM
(
input
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
upscale
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
cudnnStatus_t
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
upscale
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting convolution properties: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
;
#ifdef DEBUG
fprintf
(
stderr
,
"(replacing gradweight algo fft with none)
\n
"
);
#endif
}
}
*
_algo
=
algo
;
return
0
;
}
int
int
APPLY_SPECIFIC
(
conv_gw
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
output
,
APPLY_SPECIFIC
(
conv_gw
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
output
,
...
@@ -23,6 +69,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...
@@ -23,6 +69,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
void
*
beta_p
;
void
*
beta_p
;
float
af
=
alpha
,
bf
=
beta
;
float
af
=
alpha
,
bf
=
beta
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
bool
use_cached
=
0
;
if
(
PyGpuArray_DIMS
(
input
)[
1
]
!=
PyGpuArray_DIMS
(
km
)[
1
]
*
params
->
num_groups
)
{
if
(
PyGpuArray_DIMS
(
input
)[
1
]
!=
PyGpuArray_DIMS
(
km
)[
1
]
*
params
->
num_groups
)
{
PyErr_SetString
(
PyExc_ValueError
,
PyErr_SetString
(
PyExc_ValueError
,
...
@@ -72,99 +119,71 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...
@@ -72,99 +119,71 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return
0
;
return
0
;
}
}
if
(
c_set_tensor_for_conv
(
input
,
APPLY_SPECIFIC
(
input
),
params
->
num_groups
)
==
-
1
)
int
groups
=
c_get_groups_for_conv
(
desc
,
params
->
num_groups
);
if
(
groups
==
-
1
)
return
1
;
return
1
;
if
(
c_set_tensor_for_conv
(
output
,
APPLY_SPECIFIC
(
output
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_tensor_for_conv
(
input
,
APPLY_SPECIFIC
(
input
),
groups
)
==
-
1
)
return
1
;
return
1
;
if
(
c_set_filter
(
*
kerns
,
APPLY_SPECIFIC
(
kerns
),
params
->
num_groups
)
==
-
1
)
if
(
c_set_tensor_for_conv
(
output
,
APPLY_SPECIFIC
(
output
),
groups
)
==
-
1
)
return
1
;
if
(
c_set_filter
(
*
kerns
,
APPLY_SPECIFIC
(
kerns
),
groups
)
==
-
1
)
return
1
;
return
1
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
input
,
0
)
/
params
->
num_groups
;
if
(
0
!=
dnn_check_convolution_output
(
desc
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
size_t
kern_offset
=
PyGpuArray_STRIDE
(
*
kerns
,
0
)
*
PyGpuArray_DIM
(
*
kerns
,
0
)
/
params
->
num_groups
;
PyGpuArray_NDIM
(
*
kerns
),
output
,
groups
))
size_t
output_offset
=
PyGpuArray_STRIDE
(
output
,
0
)
/
params
->
num_groups
;
return
1
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
input
,
0
)
/
groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
*
kerns
,
0
)
*
PyGpuArray_DIM
(
*
kerns
,
0
)
/
groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
output
,
0
)
/
groups
;
cudnnConvolutionBwdFilterAlgo_t
algo
=
params
->
conv_algo
;
cudnnConvolutionBwdFilterAlgo_t
algo
=
params
->
conv_algo
;
#ifdef DEBUG
size_t
worksize
=
0
;
char
algorithm_name
[
128
];
cudnnMathType_t
mathtype
=
CUDNN_DEFAULT_MATH
;
#endif
cuda_enter
(
c
->
ctx
)
;
std
::
string
hashkey
;
int
expected_output_dims
[
5
]
=
{
0
};
size_t
maxfree
=
c_get_largest_free_block_size
(
c
);
err
=
cudnnGetConvolutionNdForwardOutputDim
(
desc
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
if
(
PyErr_Occurred
())
return
1
;
PyGpuArray_NDIM
(
input
),
expected_output_dims
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
cuda_enter
(
c
->
ctx
);
PyErr_Format
(
PyExc_RuntimeError
,
"error computing convolution output dim: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
if
(
PyGpuArray_NDIM
(
input
)
==
4
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
/
params
->
num_groups
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %ldx%ldx%dx%ld"
" but received gradient with shape %ldx%ldx%dx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
],
expected_output_dims
[
2
],
expected_output_dims
[
3
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
]);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
else
if
(
PyGpuArray_NDIM
(
input
)
==
5
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
])
||
(
PyGpuArray_DIMS
(
output
)[
4
]
!=
expected_output_dims
[
4
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
],
expected_output_dims
[
2
],
expected_output_dims
[
3
],
expected_output_dims
[
4
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
],
PyGpuArray_DIMS
(
output
)[
4
]);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
if
(
params
->
choose_algo
)
{
if
(
params
->
choose_algo
)
{
if
(
!
params
->
choose_once
)
{
reuse_algo
=
1
;
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
input
,
i
)
==
prev_img_dims
[
i
]);
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
output
,
i
)
==
prev_top_dims
[
i
]);
}
}
if
(
!
reuse_algo
)
{
if
(
!
reuse_algo
)
{
size_t
free
;
char
pci_id
[
16
];
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_PCIBUSID
,
pci_id
);
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
);
// check out cache
if
(
err2
!=
GA_NO_ERROR
)
{
hashkey
=
dnn_conv_shape
(
APPLY_SPECIFIC
(
input
),
input
,
APPLY_SPECIFIC
(
kerns
),
*
kerns
,
desc
,
output
,
groups
);
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
if
(
hashkey
.
empty
())
{
"memory information on the GPU"
);
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
}
hashkey
=
hash_prefix
+
pci_id
+
(
params
->
choose_time
?
" -t "
:
" "
)
+
hashkey
;
const
AlgoRec
*
cached
=
dnn_conv_check_cache
(
hashkey
);
if
(
cached
)
{
prev_algo
=
*
cached
;
use_cached
=
1
;
}
}
// Guess 4Mb if the info is not available
if
(
reuse_algo
||
use_cached
)
{
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
algo
=
(
cudnnConvolutionBwdFilterAlgo_t
)
prev_algo
.
algo
;
worksize
=
prev_algo
.
wsSize
;
mathtype
=
prev_algo
.
mathType
;
}
else
{
if
(
params
->
choose_time
)
{
if
(
params
->
choose_time
)
{
int
count
;
int
count
;
cudnnConvolutionBwdFilterAlgoPerf_t
choice
;
cudnnConvolutionBwdFilterAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
gpudata
*
tmpmem
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
// set the 'tensor math ok' flag
c_set_math_type_for_conv
(
desc
,
CUDNN_TENSOR_OP_MATH
);
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
maxfree
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
cuda_exit
(
c
->
ctx
);
return
-
1
;
return
-
1
;
}
}
...
@@ -172,7 +191,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...
@@ -172,7 +191,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
params
->
handle
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
params
->
handle
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
*
kerns
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
*
kerns
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
max
free
);
gpudata_release
(
tmpmem
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
...
@@ -183,25 +202,32 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...
@@ -183,25 +202,32 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return
1
;
return
1
;
}
}
algo
=
choice
.
algo
;
#ifdef DEBUG
#ifdef DEBUG
if
(
count
==
0
)
{
if
(
count
==
0
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv gradweight algorithm found"
);
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv gradweight algorithm found"
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"error getting best-timed gradweight algo: %s"
,
"error getting best-timed gradweight algo: %s"
,
cudnnGetErrorString
(
choice
.
status
));
cudnnGetErrorString
(
choice
.
status
));
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
// Else, count is necessarly 1 for current implementation.
}
// Else, count is necessarly 1 for current implementation.
#endif
#endif
algo
=
choice
.
algo
;
prev_algo
.
algo
=
(
int
)
algo
;
prev_algo
.
wsSize
=
worksize
=
choice
.
memory
;
#if CUDNN_MAJOR >= 7
prev_algo
.
mathType
=
mathtype
=
choice
.
mathType
;
#endif
}
else
{
}
else
{
err
=
cudnnGetConvolutionBackwardFilterAlgorithm
(
err
=
cudnnGetConvolutionBackwardFilterAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
desc
,
APPLY_SPECIFIC
(
kerns
),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
max
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
"error selecting convolution algo: %s"
,
...
@@ -209,79 +235,84 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...
@@ -209,79 +235,84 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
}
prev_algo
.
algo
=
algo
;
// no tensor_op returned from Get()
prev_algo
.
mathType
=
mathtype
=
CUDNN_DEFAULT_MATH
;
}
}
prev_algo
=
algo
;
}
else
{
algo
=
prev_algo
;
}
}
}
/* choose_algo */
if
(
c_set_math_type_for_conv
(
desc
,
mathtype
)
==
-
1
||
dnn_conv_gw_fallback
(
&
algo
,
input
,
*
kerns
,
desc
)
!=
0
)
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
#ifdef DEBUG
// if FindEx was used (choose_time), workspace size is set.
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t
(
algo
,
algorithm_name
))
if
(
!
(
reuse_algo
||
use_cached
||
params
->
choose_time
))
{
err
=
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
algo
,
&
worksize
);
if
(
err
==
CUDNN_STATUS_NOT_SUPPORTED
)
{
// Fallback to none algo if not supported
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
// NB: This is printed only when algorithm is chosen at runtime.
if
(
reuse_algo
)
fprintf
(
stderr
,
"(reused %s)
\n
"
,
algorithm_name
);
else
fprintf
(
stderr
,
"(using %s)
\n
"
,
algorithm_name
);
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
else
{
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
prev_img_dims
[
i
]
=
PyGpuArray_DIM
(
input
,
i
);
prev_top_dims
[
i
]
=
PyGpuArray_DIM
(
output
,
i
);
}
}
fprintf
(
stderr
,
"(error getting worksize for %s: falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0)
\n
"
,
algorithm_name
);
#endif
algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
;
err
=
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
algo
,
&
worksize
);
}
}
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if
(
algo
==
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT
&&
PyGpuArray_NDIM
(
input
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
upscale
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
upscale
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
"error getting convolution properties: %s"
,
cudnnGetErrorString
(
err
));
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
}
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
*
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
*
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
;
}
}
}
size_t
worksize
;
if
(
params
->
choose_algo
&&
(
!
params
->
choose_once
||
!
reuse_algo
))
{
gpudata
*
workspace
;
// algo may have changed due to fallback, we must update it.
prev_algo
.
algo
=
algo
;
// save worksize for next time/cache
prev_algo
.
wsSize
=
worksize
;
err
=
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
// Add to the cache
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
dnn_conv_update_cache
(
hashkey
,
prev_algo
);
APPLY_SPECIFIC
(
kerns
),
algo
,
&
worksize
);
}
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
#ifdef DEBUG
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
if
(
params
->
choose_algo
)
{
cudnnGetErrorString
(
err
));
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
// NB: This is printed only when algorithm is chosen at runtime.
fprintf
(
stderr
,
"(using %s %s%s%s%s, ws:%ld, hash:%s)
\n
"
,
algorithm_name
,
params
->
choose_time
?
"(timed)"
:
""
,
reuse_algo
?
"(reused)"
:
""
,
use_cached
?
"(cache)"
:
""
,
mathtype
==
CUDNN_TENSOR_OP_MATH
?
"(tensor op)"
:
""
,
worksize
,
hashkey
.
c_str
()
);
}
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
}
gpudata
*
workspace
=
0
;
if
(
worksize
!=
0
)
{
if
(
worksize
!=
0
)
{
workspace
=
gpudata_alloc
(
c
->
ctx
,
worksize
,
NULL
,
0
,
NULL
);
workspace
=
gpudata_alloc
(
c
->
ctx
,
worksize
,
NULL
,
0
,
NULL
);
if
(
workspace
==
NULL
)
{
if
(
workspace
==
NULL
)
{
...
@@ -295,9 +326,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...
@@ -295,9 +326,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait
(
output
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_wait
(
output
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_wait
((
*
kerns
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
cuda_wait
((
*
kerns
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
for
(
int
g
=
0
;
g
<
params
->
num_groups
;
g
++
)
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
{
err
=
cudnnConvolutionBackwardFilter
(
err
=
cudnnConvolutionBackwardFilter
(
params
->
handle
,
params
->
handle
,
alpha_p
,
alpha_p
,
...
@@ -318,7 +347,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...
@@ -318,7 +347,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error doing operation: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error doing
cuDNN conv gradweight
operation: %s"
,
cudnnGetErrorString
(
err
));
cudnnGetErrorString
(
err
));
return
1
;
return
1
;
}
}
...
...
theano/gpuarray/dnn.py
浏览文件 @
b998dc61
...
@@ -399,7 +399,7 @@ class DnnBase(COp):
...
@@ -399,7 +399,7 @@ class DnnBase(COp):
return
[]
return
[]
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
super
(
DnnBase
,
self
)
.
c_code_cache_version
(),
version
(),
1
)
return
(
super
(
DnnBase
,
self
)
.
c_code_cache_version
(),
version
(),
4
)
class
GpuDnnConvDesc
(
COp
):
class
GpuDnnConvDesc
(
COp
):
...
@@ -412,7 +412,8 @@ class GpuDnnConvDesc(COp):
...
@@ -412,7 +412,8 @@ class GpuDnnConvDesc(COp):
"""
"""
__props__
=
(
'border_mode'
,
'subsample'
,
'dilation'
,
'conv_mode'
,
'precision'
)
__props__
=
(
'border_mode'
,
'subsample'
,
'dilation'
,
'conv_mode'
,
'precision'
,
'num_groups'
)
params_type
=
ParamsType
(
pad0
=
int_t
,
pad1
=
int_t
,
pad2
=
int_t
,
params_type
=
ParamsType
(
pad0
=
int_t
,
pad1
=
int_t
,
pad2
=
int_t
,
sub0
=
int_t
,
sub1
=
int_t
,
sub2
=
int_t
,
sub0
=
int_t
,
sub1
=
int_t
,
sub2
=
int_t
,
dil0
=
int_t
,
dil1
=
int_t
,
dil2
=
int_t
,
dil0
=
int_t
,
dil1
=
int_t
,
dil2
=
int_t
,
...
@@ -421,7 +422,8 @@ class GpuDnnConvDesc(COp):
...
@@ -421,7 +422,8 @@ class GpuDnnConvDesc(COp):
(
'BORDER_MODE_VALID'
,
'valid'
),
(
'BORDER_MODE_VALID'
,
'valid'
),
(
'BORDER_MODE_HALF'
,
'half'
)),
(
'BORDER_MODE_HALF'
,
'half'
)),
conv_mode
=
cudnn
.
cudnnConvolutionMode_t
,
conv_mode
=
cudnn
.
cudnnConvolutionMode_t
,
precision
=
cudnn
.
cudnnDataType_t
)
precision
=
cudnn
.
cudnnDataType_t
,
num_groups
=
int_t
)
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'cudnn.h'
,
'cudnn_helper.h'
]
return
[
'cudnn.h'
,
'cudnn_helper.h'
]
...
@@ -448,7 +450,7 @@ class GpuDnnConvDesc(COp):
...
@@ -448,7 +450,7 @@ class GpuDnnConvDesc(COp):
return
False
return
False
def
__init__
(
self
,
border_mode
,
subsample
=
(
1
,
1
),
dilation
=
(
1
,
1
),
conv_mode
=
'conv'
,
def
__init__
(
self
,
border_mode
,
subsample
=
(
1
,
1
),
dilation
=
(
1
,
1
),
conv_mode
=
'conv'
,
precision
=
"float32"
):
precision
=
"float32"
,
num_groups
=
1
):
COp
.
__init__
(
self
,
[
"c_code/conv_desc.c"
],
"APPLY_SPECIFIC(conv_desc)"
)
COp
.
__init__
(
self
,
[
"c_code/conv_desc.c"
],
"APPLY_SPECIFIC(conv_desc)"
)
if
version
()
<
6000
and
any
([
d
!=
1
for
d
in
dilation
]):
if
version
()
<
6000
and
any
([
d
!=
1
for
d
in
dilation
]):
...
@@ -470,6 +472,7 @@ class GpuDnnConvDesc(COp):
...
@@ -470,6 +472,7 @@ class GpuDnnConvDesc(COp):
self
.
subsample
=
subsample
self
.
subsample
=
subsample
assert
cudnn
.
cudnnConvolutionMode_t
.
has_alias
(
conv_mode
)
assert
cudnn
.
cudnnConvolutionMode_t
.
has_alias
(
conv_mode
)
self
.
conv_mode
=
conv_mode
self
.
conv_mode
=
conv_mode
self
.
num_groups
=
num_groups
assert
len
(
dilation
)
==
len
(
subsample
)
assert
len
(
dilation
)
==
len
(
subsample
)
self
.
dilation
=
dilation
self
.
dilation
=
dilation
...
@@ -514,6 +517,8 @@ class GpuDnnConvDesc(COp):
...
@@ -514,6 +517,8 @@ class GpuDnnConvDesc(COp):
self
.
__dict__
.
update
(
d
)
self
.
__dict__
.
update
(
d
)
if
not
hasattr
(
self
,
"dilation"
):
if
not
hasattr
(
self
,
"dilation"
):
self
.
dilation
=
(
1
,)
*
len
(
self
.
subsample
)
self
.
dilation
=
(
1
,)
*
len
(
self
.
subsample
)
if
not
hasattr
(
self
,
"num_groups"
):
self
.
num_groups
=
1
# scalar constants
# scalar constants
...
@@ -622,8 +627,6 @@ class GpuDnnConv(DnnBase):
...
@@ -622,8 +627,6 @@ class GpuDnnConv(DnnBase):
SUPPORTED_DNN_CONV_ALGO_RUNTIME
):
SUPPORTED_DNN_CONV_ALGO_RUNTIME
):
raise
ValueError
(
"convolution algo
%
s can't be used for "
raise
ValueError
(
"convolution algo
%
s can't be used for "
"3d convolutions"
,
(
self
.
algo
,))
"3d convolutions"
,
(
self
.
algo
,))
if
img
.
type
.
ndim
==
5
and
self
.
num_groups
!=
1
:
raise
ValueError
(
"Grouped convolutions not implemented for 3D convolutions"
)
if
(
not
isinstance
(
desc
.
type
,
CDataType
)
or
if
(
not
isinstance
(
desc
.
type
,
CDataType
)
or
desc
.
type
.
ctype
!=
'cudnnConvolutionDescriptor_t'
):
desc
.
type
.
ctype
!=
'cudnnConvolutionDescriptor_t'
):
...
@@ -854,7 +857,6 @@ class GpuDnnConvGradI(DnnBase):
...
@@ -854,7 +857,6 @@ class GpuDnnConvGradI(DnnBase):
if
algo
is
None
:
if
algo
is
None
:
algo
=
config
.
dnn
.
conv
.
algo_bwd_data
algo
=
config
.
dnn
.
conv
.
algo_bwd_data
self
.
algo
=
algo
self
.
algo
=
algo
assert
cudnn
.
cudnnConvolutionBwdDataAlgo_t
.
has_alias
(
self
.
algo
)
or
self
.
algo
in
SUPPORTED_DNN_CONV_ALGO_RUNTIME
assert
cudnn
.
cudnnConvolutionBwdDataAlgo_t
.
has_alias
(
self
.
algo
)
or
self
.
algo
in
SUPPORTED_DNN_CONV_ALGO_RUNTIME
self
.
conv_algo
=
cudnn
.
cudnnConvolutionBwdDataAlgo_t
.
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
self
.
conv_algo
=
cudnn
.
cudnnConvolutionBwdDataAlgo_t
.
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
...
@@ -1039,7 +1041,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
...
@@ -1039,7 +1041,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
img
=
gpu_contiguous
(
img
)
img
=
gpu_contiguous
(
img
)
kerns
=
gpu_contiguous
(
kerns
)
kerns
=
gpu_contiguous
(
kerns
)
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
conv_mode
=
conv_mode
,
precision
=
precision
,
num_groups
=
num_groups
)(
kerns
.
shape
)
desc_op
=
desc
.
owner
.
op
desc_op
=
desc
.
owner
.
op
# We can use Shape_i and bypass the infer_shape here as this is on
# We can use Shape_i and bypass the infer_shape here as this is on
# the input of node and it will always be present.
# the input of node and it will always be present.
...
@@ -1056,7 +1059,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
...
@@ -1056,7 +1059,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
def
dnn_conv3d
(
img
,
kerns
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
def
dnn_conv3d
(
img
,
kerns
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
direction_hint
=
None
,
conv_mode
=
'conv'
,
direction_hint
=
None
,
algo
=
None
,
precision
=
None
):
algo
=
None
,
precision
=
None
,
num_groups
=
1
):
"""
"""
GPU convolution using cuDNN from NVIDIA.
GPU convolution using cuDNN from NVIDIA.
...
@@ -1099,6 +1102,9 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
...
@@ -1099,6 +1102,9 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
should be done. Possible values are 'as_input', 'float16', 'float32'
should be done. Possible values are 'as_input', 'float16', 'float32'
and 'float64'. Default is the value of
and 'float64'. Default is the value of
:attr:`config.dnn.conv.precision`.
:attr:`config.dnn.conv.precision`.
num_groups :
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
.. warning:: The cuDNN library only works with GPUs that have a compute
.. warning:: The cuDNN library only works with GPUs that have a compute
...
@@ -1113,7 +1119,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
...
@@ -1113,7 +1119,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
fgraph
=
getattr
(
img
,
'fgraph'
,
None
)
or
getattr
(
kerns
,
'fgraph'
,
None
)
fgraph
=
getattr
(
img
,
'fgraph'
,
None
)
or
getattr
(
kerns
,
'fgraph'
,
None
)
ctx_name
=
infer_context_name
(
img
,
kerns
)
ctx_name
=
infer_context_name
(
img
,
kerns
)
if
(
border_mode
==
'valid'
and
subsample
==
(
1
,
1
,
1
)
and
dilation
==
(
1
,
1
,
1
)
and
if
(
border_mode
==
'valid'
and
subsample
==
(
1
,
1
,
1
)
and
dilation
==
(
1
,
1
,
1
)
and
direction_hint
==
'bprop weights'
):
direction_hint
==
'bprop weights'
and
num_groups
==
1
):
# Special case: We are asked to use GpuDnnConvGradW. We need to set
# Special case: We are asked to use GpuDnnConvGradW. We need to set
# up a suitable 'fake' convolution to compute the gradient for.
# up a suitable 'fake' convolution to compute the gradient for.
img
=
gpu_contiguous
(
img
.
dimshuffle
(
1
,
0
,
2
,
3
,
4
))
img
=
gpu_contiguous
(
img
.
dimshuffle
(
1
,
0
,
2
,
3
,
4
))
...
@@ -1135,7 +1141,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
...
@@ -1135,7 +1141,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
return
as_gpuarray_variable
(
conv
.
dimshuffle
(
1
,
0
,
2
,
3
,
4
),
ctx_name
)
return
as_gpuarray_variable
(
conv
.
dimshuffle
(
1
,
0
,
2
,
3
,
4
),
ctx_name
)
elif
(
border_mode
==
'full'
and
subsample
==
(
1
,
1
,
1
)
and
elif
(
border_mode
==
'full'
and
subsample
==
(
1
,
1
,
1
)
and
direction_hint
!=
'forward!'
):
direction_hint
!=
'forward!'
and
num_groups
==
1
):
# Special case: We can be faster by using GpuDnnConvGradI to compute
# Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution.
# the full convolution as the backward pass of a valid convolution.
# We just need to set up a suitable 'fake' valid convolution.
# We just need to set up a suitable 'fake' valid convolution.
...
@@ -1159,7 +1165,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
...
@@ -1159,7 +1165,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
img
=
gpu_contiguous
(
img
)
img
=
gpu_contiguous
(
img
)
kerns
=
gpu_contiguous
(
kerns
)
kerns
=
gpu_contiguous
(
kerns
)
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
conv_mode
=
conv_mode
,
precision
=
precision
,
num_groups
=
num_groups
)(
kerns
.
shape
)
desc_op
=
desc
.
owner
.
op
desc_op
=
desc
.
owner
.
op
# We can use Shape_i and bypass the infer_shape here as this is on
# We can use Shape_i and bypass the infer_shape here as this is on
# the input of node and it will always be present.
# the input of node and it will always be present.
...
@@ -1171,7 +1178,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
...
@@ -1171,7 +1178,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
filter_dilation
=
dilation
)
filter_dilation
=
dilation
)
out_shp
=
assert_conv_shape
(
out_shp
)
out_shp
=
assert_conv_shape
(
out_shp
)
out
=
GpuAllocEmpty
(
dtype
=
img
.
dtype
,
context_name
=
ctx_name
)(
*
out_shp
)
out
=
GpuAllocEmpty
(
dtype
=
img
.
dtype
,
context_name
=
ctx_name
)(
*
out_shp
)
return
GpuDnnConv
(
algo
=
algo
)(
img
,
kerns
,
out
,
desc
)
return
GpuDnnConv
(
algo
=
algo
,
num_groups
=
num_groups
)(
img
,
kerns
,
out
,
desc
)
def
dnn_gradweight
(
img
,
topgrad
,
kerns_shp
,
border_mode
=
'valid'
,
def
dnn_gradweight
(
img
,
topgrad
,
kerns_shp
,
border_mode
=
'valid'
,
...
@@ -1189,18 +1196,21 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
...
@@ -1189,18 +1196,21 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
precision
=
get_precision
(
precision
,
[
img
,
topgrad
])
precision
=
get_precision
(
precision
,
[
img
,
topgrad
])
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns_shp
)
conv_mode
=
conv_mode
,
precision
=
precision
,
num_groups
=
num_groups
)(
kerns_shp
)
out
=
GpuAllocEmpty
(
dtype
=
img
.
dtype
,
context_name
=
ctx_name
)(
*
kerns_shp
)
out
=
GpuAllocEmpty
(
dtype
=
img
.
dtype
,
context_name
=
ctx_name
)(
*
kerns_shp
)
return
GpuDnnConvGradW
(
algo
=
algo
,
num_groups
=
num_groups
)(
img
,
topgrad
,
out
,
desc
)
return
GpuDnnConvGradW
(
algo
=
algo
,
num_groups
=
num_groups
)(
img
,
topgrad
,
out
,
desc
)
def
dnn_gradweight3d
(
img
,
topgrad
,
kerns_shp
,
border_mode
=
'valid'
,
def
dnn_gradweight3d
(
img
,
topgrad
,
kerns_shp
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
precision
=
None
):
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
1
):
"""
"""
3d version of dnn_gradweight
3d version of dnn_gradweight
"""
"""
return
dnn_gradweight
(
img
,
topgrad
,
kerns_shp
,
border_mode
,
return
dnn_gradweight
(
img
,
topgrad
,
kerns_shp
,
border_mode
,
subsample
,
dilation
,
conv_mode
,
precision
)
subsample
,
dilation
,
conv_mode
,
precision
,
algo
,
num_groups
)
def
dnn_gradinput
(
kerns
,
topgrad
,
img_shp
,
border_mode
=
'valid'
,
def
dnn_gradinput
(
kerns
,
topgrad
,
img_shp
,
border_mode
=
'valid'
,
...
@@ -1218,18 +1228,21 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
...
@@ -1218,18 +1228,21 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
precision
=
get_precision
(
precision
,
[
kerns
,
topgrad
])
precision
=
get_precision
(
precision
,
[
kerns
,
topgrad
])
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
conv_mode
=
conv_mode
,
precision
=
precision
,
num_groups
=
num_groups
)(
kerns
.
shape
)
out
=
GpuAllocEmpty
(
dtype
=
kerns
.
dtype
,
context_name
=
ctx_name
)(
*
img_shp
)
out
=
GpuAllocEmpty
(
dtype
=
kerns
.
dtype
,
context_name
=
ctx_name
)(
*
img_shp
)
return
GpuDnnConvGradI
(
algo
=
algo
,
num_groups
=
num_groups
)(
kerns
,
topgrad
,
out
,
desc
)
return
GpuDnnConvGradI
(
algo
=
algo
,
num_groups
=
num_groups
)(
kerns
,
topgrad
,
out
,
desc
)
def
dnn_gradinput3d
(
kerns
,
topgrad
,
img_shp
,
border_mode
=
'valid'
,
def
dnn_gradinput3d
(
kerns
,
topgrad
,
img_shp
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
precision
=
None
):
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
1
):
"""
"""
3d version of `dnn_gradinput`.
3d version of `dnn_gradinput`.
"""
"""
return
dnn_gradinput
(
kerns
,
topgrad
,
img_shp
,
border_mode
,
subsample
,
return
dnn_gradinput
(
kerns
,
topgrad
,
img_shp
,
border_mode
,
subsample
,
dilation
,
conv_mode
,
precision
)
dilation
,
conv_mode
,
precision
,
algo
,
num_groups
)
class
GpuDnnPoolDesc
(
Op
):
class
GpuDnnPoolDesc
(
Op
):
...
@@ -3020,8 +3033,6 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
...
@@ -3020,8 +3033,6 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if
version
(
raises
=
False
)
<
6000
and
op
.
filter_dilation
!=
(
1
,
1
):
if
version
(
raises
=
False
)
<
6000
and
op
.
filter_dilation
!=
(
1
,
1
):
return
None
return
None
if
op
.
num_groups
>
1
:
return
None
inp1
=
inputs
[
0
]
inp1
=
inputs
[
0
]
inp2
=
inputs
[
1
]
inp2
=
inputs
[
1
]
...
@@ -3071,8 +3082,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
...
@@ -3071,8 +3082,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
if
version
(
raises
=
False
)
<
6000
and
op
.
filter_dilation
!=
(
1
,
1
,
1
):
if
version
(
raises
=
False
)
<
6000
and
op
.
filter_dilation
!=
(
1
,
1
,
1
):
return
None
return
None
if
op
.
num_groups
>
1
:
return
None
inp1
=
inputs
[
0
]
inp1
=
inputs
[
0
]
inp2
=
inputs
[
1
]
inp2
=
inputs
[
1
]
...
@@ -3091,7 +3100,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
...
@@ -3091,7 +3100,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
subsample
=
op
.
subsample
,
subsample
=
op
.
subsample
,
dilation
=
op
.
filter_dilation
,
dilation
=
op
.
filter_dilation
,
direction_hint
=
'forward!'
,
direction_hint
=
'forward!'
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
,
num_groups
=
op
.
num_groups
)
elif
isinstance
(
op
,
AbstractConv3d_gradWeights
):
elif
isinstance
(
op
,
AbstractConv3d_gradWeights
):
shape
=
(
inp2
.
shape
[
1
],
inp1
.
shape
[
1
],
shape
=
(
inp2
.
shape
[
1
],
inp1
.
shape
[
1
],
inputs
[
2
][
0
],
inputs
[
2
][
1
],
inputs
[
2
][
2
])
inputs
[
2
][
0
],
inputs
[
2
][
1
],
inputs
[
2
][
2
])
...
@@ -3099,7 +3109,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
...
@@ -3099,7 +3109,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
border_mode
=
op
.
border_mode
,
border_mode
=
op
.
border_mode
,
subsample
=
op
.
subsample
,
subsample
=
op
.
subsample
,
dilation
=
op
.
filter_dilation
,
dilation
=
op
.
filter_dilation
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
,
num_groups
=
op
.
num_groups
)
elif
isinstance
(
op
,
AbstractConv3d_gradInputs
):
elif
isinstance
(
op
,
AbstractConv3d_gradInputs
):
shape
=
(
inp2
.
shape
[
0
],
inp1
.
shape
[
1
],
shape
=
(
inp2
.
shape
[
0
],
inp1
.
shape
[
1
],
inputs
[
2
][
0
],
inputs
[
2
][
1
],
inputs
[
2
][
2
])
inputs
[
2
][
0
],
inputs
[
2
][
1
],
inputs
[
2
][
2
])
...
@@ -3107,7 +3118,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
...
@@ -3107,7 +3118,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
border_mode
=
op
.
border_mode
,
border_mode
=
op
.
border_mode
,
subsample
=
op
.
subsample
,
subsample
=
op
.
subsample
,
dilation
=
op
.
filter_dilation
,
dilation
=
op
.
filter_dilation
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
,
num_groups
=
op
.
num_groups
)
return
[
rval
]
return
[
rval
]
...
...
theano/gpuarray/tests/test_dnn.py
浏览文件 @
b998dc61
...
@@ -26,6 +26,7 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer
...
@@ -26,6 +26,7 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer
from
theano.configdefaults
import
SUPPORTED_DNN_CONV_ALGO_FWD
from
theano.configdefaults
import
SUPPORTED_DNN_CONV_ALGO_FWD
from
theano.tensor.nnet.tests.test_abstract_conv
import
Grouped_conv_noOptim
from
theano.tensor.nnet.tests.test_abstract_conv
import
Grouped_conv_noOptim
from
theano.tensor.nnet.tests.test_abstract_conv
import
Grouped_conv3d_noOptim
try
:
try
:
import
pygpu
import
pygpu
...
@@ -2264,7 +2265,7 @@ def test_dnn_rnn_lstm_grad_c():
...
@@ -2264,7 +2265,7 @@ def test_dnn_rnn_lstm_grad_c():
utt
.
assert_allclose
(
ref_grads_layer
[
j
],
g
)
utt
.
assert_allclose
(
ref_grads_layer
[
j
],
g
)
def
dconv
2
d
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconv
fw
d
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconv
(
img
,
kern
):
def
dconv
(
img
,
kern
):
return
dnn
.
dnn_conv
(
img
,
kern
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
filter_dilation
,
return
dnn
.
dnn_conv
(
img
,
kern
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
filter_dilation
,
conv_mode
=
'conv'
,
direction_hint
=
'forward'
,
workmem
=
None
,
conv_mode
=
'conv'
,
direction_hint
=
'forward'
,
workmem
=
None
,
...
@@ -2272,14 +2273,14 @@ def dconv2d(border_mode, subsample, filter_dilation, num_groups):
...
@@ -2272,14 +2273,14 @@ def dconv2d(border_mode, subsample, filter_dilation, num_groups):
return
dconv
return
dconv
def
dconv
2d
w
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconv
g
w
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconvw
(
img
,
topgrad
,
kshp
):
def
dconvw
(
img
,
topgrad
,
kshp
):
return
dnn
.
dnn_gradweight
(
img
,
topgrad
,
kshp
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
filter_dilation
,
return
dnn
.
dnn_gradweight
(
img
,
topgrad
,
kshp
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
filter_dilation
,
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
num_groups
)
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
num_groups
)
return
dconvw
return
dconvw
def
dconv
2d
i
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconv
g
i
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconvi
(
kern
,
topgrad
,
imshp
):
def
dconvi
(
kern
,
topgrad
,
imshp
):
return
dnn
.
dnn_gradinput
(
kern
,
topgrad
,
imshp
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
filter_dilation
,
return
dnn
.
dnn_gradinput
(
kern
,
topgrad
,
imshp
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
filter_dilation
,
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
num_groups
)
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
num_groups
)
...
@@ -2288,9 +2289,21 @@ def dconv2di(border_mode, subsample, filter_dilation, num_groups):
...
@@ -2288,9 +2289,21 @@ def dconv2di(border_mode, subsample, filter_dilation, num_groups):
class
Cudnn_grouped_conv
(
Grouped_conv_noOptim
):
class
Cudnn_grouped_conv
(
Grouped_conv_noOptim
):
mode
=
mode_with_gpu
mode
=
mode_with_gpu
conv
=
staticmethod
(
dconv2d
)
conv
=
staticmethod
(
dconvfwd
)
conv_gradw
=
staticmethod
(
dconv2dw
)
conv_gradw
=
staticmethod
(
dconvgw
)
conv_gradi
=
staticmethod
(
dconv2di
)
conv_gradi
=
staticmethod
(
dconvgi
)
conv_op
=
dnn
.
GpuDnnConv
conv_gradw_op
=
dnn
.
GpuDnnConvGradW
conv_gradi_op
=
dnn
.
GpuDnnConvGradI
flip_filter
=
False
is_dnn
=
True
class
Cudnn_grouped_conv3d
(
Grouped_conv3d_noOptim
):
mode
=
mode_with_gpu
conv
=
staticmethod
(
dconvfwd
)
conv_gradw
=
staticmethod
(
dconvgw
)
conv_gradi
=
staticmethod
(
dconvgi
)
conv_op
=
dnn
.
GpuDnnConv
conv_op
=
dnn
.
GpuDnnConv
conv_gradw_op
=
dnn
.
GpuDnnConvGradW
conv_gradw_op
=
dnn
.
GpuDnnConvGradW
conv_gradi_op
=
dnn
.
GpuDnnConvGradI
conv_gradi_op
=
dnn
.
GpuDnnConvGradI
...
@@ -2519,3 +2532,151 @@ def test_dnn_spatialtf_grad():
...
@@ -2519,3 +2532,151 @@ def test_dnn_spatialtf_grad():
utt
.
verify_grad
(
grad_functor
,
[
inputs_val
,
theta_val
],
mode
=
mode_with_gpu
,
utt
.
verify_grad
(
grad_functor
,
[
inputs_val
,
theta_val
],
mode
=
mode_with_gpu
,
abs_tol
=
atol
,
rel_tol
=
rtol
)
abs_tol
=
atol
,
rel_tol
=
rtol
)
class TestDnnConv2DRuntimeAlgorithms(object):
    """Tests for cuDNN 2D convolutions whose algorithm is chosen at runtime.

    Each generator test compiles a GPU convolution (forward, gradient w.r.t.
    inputs, or gradient w.r.t. weights) with one of the runtime algorithm
    specifiers ('time_once', 'guess_once', 'time_on_shape_change',
    'guess_on_shape_change'), then runs it on a sequence of changing
    input/filter shapes and compares every result against the CPU reference
    implementation. Repeated shape entries in the schedule are intended to
    hit the per-shape algorithm cache (see the inline comments).
    """

    ndim = 2
    cpu_conv_class = theano.tensor.nnet.corr.CorrMM
    # Schedule entries are (number of runs, [inputs shape, filters shape]).
    runtime_shapes = [(3, [(2, 3, 10, 9), (5, 3, 7, 7)]),
                      (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),
                      (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),
                      (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),  # cache should be used
                      (1, [(2, 2, 50, 50), (5, 2, 25, 31)]),
                      (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),  # cache should be used
                      (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),  # cache should be used
                      (1, [(1, 2, 3, 4), (6, 2, 2, 1)])]

    def __init__(self):
        # Raising SkipTest from the constructor skips the whole class when
        # cuDNN is not available on the test context.
        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        utt.seed_rng()
        self.runtime_algorithms = ('time_once', 'guess_once',
                                   'time_on_shape_change', 'guess_on_shape_change')

    def _schedule(self, algo):
        # The '*_once' algorithms select an algorithm a single time, so the
        # schedule collapses to the first shape pair, repeated 5 times.
        if algo in ('time_once', 'guess_once'):
            schedule = [list(self.runtime_shapes[0])]
            schedule[0][0] = 5
            return schedule
        return self.runtime_shapes

    def _flipped(self, filters):
        # Flip the filters' spatial axes so the CPU correlation op matches
        # the GPU convolution.
        if self.ndim == 3:
            return filters[:, :, ::-1, ::-1, ::-1]
        return filters[:, :, ::-1, ::-1]

    def _check_against_ref(self, f, f_ref, schedule, dtype):
        # Run GPU and CPU functions over the shape schedule and compare.
        for ntimes, (inputs_shape, filters_shape) in schedule:
            for _ in range(ntimes):
                inputs_val = np.random.random(inputs_shape).astype(dtype)
                filters_val = np.random.random(filters_shape).astype(dtype)
                gpu_res = f(inputs_val, filters_val)
                cpu_res = f_ref(inputs_val, filters_val)
                utt.assert_allclose(cpu_res, np.asarray(gpu_res))

    def test_fwd_runtime_algorithms(self):
        dtype = 'float32'
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_fwd_runtime_algorithm(algo):
            inputs = theano.tensor.TensorType(dtype, _broadcastable)()
            filters = theano.tensor.TensorType(dtype, _broadcastable)()
            # Scale down the input values to prevent very large absolute
            # errors due to float rounding.
            lower_inputs = inputs / 10
            lower_filters = filters / 10
            conv = dnn.dnn_conv(img=lower_inputs, kerns=lower_filters,
                                algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
            f = theano.function([inputs, filters], conv, mode=mode_with_gpu)
            conv_ref = self.cpu_conv_class(subsample=unit_shape)(
                ref_cast(lower_inputs), self._flipped(lower_filters))
            f_ref = theano.function([inputs, filters], conv_ref, mode='FAST_RUN')
            self._check_against_ref(f, f_ref, self._schedule(algo), dtype)

        for algo in self.runtime_algorithms:
            yield (run_fwd_runtime_algorithm, algo)

    def test_gradinput_runtime_algorithms(self):
        dtype = 'float32'
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_gradinput_runtime_algorithm(algo):
            theano.config.dnn.conv.algo_bwd_data = algo
            inputs = theano.tensor.TensorType(dtype, _broadcastable)()
            filters = theano.tensor.TensorType(dtype, _broadcastable)()
            conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo,
                                precision=dtype, subsample=unit_shape,
                                dilation=unit_shape)
            grad_i = theano.tensor.grad(conv.sum(), [inputs])
            f = theano.function([inputs, filters], grad_i, mode=mode_with_gpu)
            # The optimized graph must contain exactly one grad-wrt-inputs
            # op and no other cuDNN convolution ops.
            nodes = f.maker.fgraph.apply_nodes
            assert 1 == len([node for node in nodes
                             if isinstance(node.op, dnn.GpuDnnConvGradI)])
            assert not any(isinstance(node.op, dnn.GpuDnnConv)
                           for node in nodes)
            assert not any(isinstance(node.op, dnn.GpuDnnConvGradW)
                           for node in nodes)
            conv_ref = self.cpu_conv_class(subsample=unit_shape)(
                ref_cast(inputs), self._flipped(filters))
            grad_i_ref = theano.tensor.grad(conv_ref.sum(), [inputs])
            f_ref = theano.function([inputs, filters], grad_i_ref, mode='FAST_RUN')
            self._check_against_ref(f, f_ref, self._schedule(algo), dtype)

        for algo in self.runtime_algorithms:
            yield (run_gradinput_runtime_algorithm, algo)

    def test_gradweight_runtime_algorithms(self):
        dtype = 'float32'
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_gradweight_runtime_algorithm(algo):
            theano.config.dnn.conv.algo_bwd_filter = algo
            inputs = theano.tensor.TensorType(dtype, _broadcastable)()
            filters = theano.tensor.TensorType(dtype, _broadcastable)()
            conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo,
                                precision=dtype, subsample=unit_shape,
                                dilation=unit_shape)
            grad_w = theano.tensor.grad(conv.sum(), [filters])
            f = theano.function([inputs, filters], grad_w, mode=mode_with_gpu)
            # The optimized graph must contain exactly one grad-wrt-weights
            # op and no other cuDNN convolution ops.
            nodes = f.maker.fgraph.apply_nodes
            assert 1 == len([node for node in nodes
                             if isinstance(node.op, dnn.GpuDnnConvGradW)])
            assert not any(isinstance(node.op, dnn.GpuDnnConv)
                           for node in nodes)
            assert not any(isinstance(node.op, dnn.GpuDnnConvGradI)
                           for node in nodes)
            conv_ref = self.cpu_conv_class(subsample=unit_shape)(
                ref_cast(inputs), self._flipped(filters))
            grad_w_ref = theano.tensor.grad(conv_ref.sum(), [filters])
            f_ref = theano.function([inputs, filters], grad_w_ref, mode='FAST_RUN')
            self._check_against_ref(f, f_ref, self._schedule(algo), dtype)

        for algo in self.runtime_algorithms:
            yield (run_gradweight_runtime_algorithm, algo)
class TestDnnConv3DRuntimeAlgorithms(TestDnnConv2DRuntimeAlgorithms):
    """3D variant of the runtime-algorithm convolution tests.

    Inherits all generator tests from the 2D class and only overrides the
    dimensionality, the CPU reference op and the 5D shape schedule.
    """

    ndim = 3
    cpu_conv_class = theano.tensor.nnet.corr3d.Corr3dMM
    # Schedule entries are (number of runs, [inputs shape, filters shape]).
    runtime_shapes = [(3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),
                      (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),
                      (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),
                      (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),  # cache should be used
                      (1, [(2, 2, 50, 50, 5), (5, 2, 25, 31, 4)]),
                      (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),  # cache should be used
                      (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),  # cache should be used
                      (1, [(1, 2, 3, 4, 5), (6, 2, 3, 2, 1)])]
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论