Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
46783773
提交
46783773
authored
5月 30, 2017
作者:
abergeron
提交者:
GitHub
5月 30, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #5964 from notoraptor/cudnn-less-ccode-and-v6
Wrap Op params for many gpuarray DNN Ops and add cuDNN v6 integration.
上级
3baa162f
101fbd90
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
15 个修改的文件
包含
536 行增加
和
386 行删除
+536
-386
configdefaults.py
theano/configdefaults.py
+9
-11
op.py
theano/gof/op.py
+4
-1
type.py
theano/gof/type.py
+9
-1
conv_desc.c
theano/gpuarray/conv_desc.c
+22
-21
cudnn_defs.py
theano/gpuarray/cudnn_defs.py
+132
-0
dnn.py
theano/gpuarray/dnn.py
+0
-0
dnn_batchnorm_grad.c
theano/gpuarray/dnn_batchnorm_grad.c
+3
-3
dnn_batchnorm_inf.c
theano/gpuarray/dnn_batchnorm_inf.c
+11
-11
dnn_fwd.c
theano/gpuarray/dnn_fwd.c
+98
-106
dnn_gi.c
theano/gpuarray/dnn_gi.c
+108
-96
dnn_gw.c
theano/gpuarray/dnn_gw.c
+95
-100
dnn_pool.c
theano/gpuarray/dnn_pool.c
+3
-3
dnn_pool_grad.c
theano/gpuarray/dnn_pool_grad.c
+3
-3
test_dnn.py
theano/gpuarray/tests/test_dnn.py
+33
-30
pool.py
theano/tensor/signal/pool.py
+6
-0
没有找到文件。
theano/configdefaults.py
浏览文件 @
46783773
...
@@ -268,19 +268,18 @@ def safe_no_dnn_algo_bwd(algo):
...
@@ -268,19 +268,18 @@ def safe_no_dnn_algo_bwd(algo):
'`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.'
)
'`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.'
)
return
True
return
True
# Those are the options provided by Theano to choose algorithms at runtime.
SUPPORTED_DNN_CONV_ALGO_RUNTIME
=
(
'guess_once'
,
'guess_on_shape_change'
,
'time_once'
,
'time_on_shape_change'
)
# Those are the supported algorithm by Theano,
# Those are the supported algorithm by Theano,
# The tests will reference those lists.
# The tests will reference those lists.
SUPPORTED_DNN_CONV_ALGO_FWD
=
(
'small'
,
'none'
,
'large'
,
'fft'
,
'fft_tiling'
,
SUPPORTED_DNN_CONV_ALGO_FWD
=
(
'small'
,
'none'
,
'large'
,
'fft'
,
'fft_tiling'
,
'winograd'
)
+
SUPPORTED_DNN_CONV_ALGO_RUNTIME
'winograd'
,
'guess_once'
,
'guess_on_shape_change'
,
'time_once'
,
'time_on_shape_change'
)
SUPPORTED_DNN_CONV_ALGO_BWD_DATA
=
(
'none'
,
'deterministic'
,
'fft'
,
'fft_tiling'
,
'winograd'
)
+
SUPPORTED_DNN_CONV_ALGO_RUNTIME
SUPPORTED_DNN_CONV_ALGO_BWD_DATA
=
(
'none'
,
'deterministic'
,
'fft'
,
'fft_tiling'
,
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
=
(
'none'
,
'deterministic'
,
'fft'
,
'small'
)
+
SUPPORTED_DNN_CONV_ALGO_RUNTIME
'winograd'
,
'guess_once'
,
'guess_on_shape_change'
,
'time_once'
,
'time_on_shape_change'
)
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
=
(
'none'
,
'deterministic'
,
'fft'
,
'small'
,
SUPPORTED_DNN_CONV_PRECISION
=
(
'as_input_f32'
,
'as_input'
,
'float16'
,
'float32'
,
'float64'
)
'guess_once'
,
'guess_on_shape_change'
,
'time_once'
,
'time_on_shape_change'
)
AddConfigVar
(
'dnn.conv.algo_bwd'
,
AddConfigVar
(
'dnn.conv.algo_bwd'
,
"This flag is deprecated; use dnn.conv.algo_bwd_data and "
"This flag is deprecated; use dnn.conv.algo_bwd_data and "
...
@@ -311,8 +310,7 @@ AddConfigVar('dnn.conv.precision',
...
@@ -311,8 +310,7 @@ AddConfigVar('dnn.conv.precision',
"Default data precision to use for the computation in cuDNN "
"Default data precision to use for the computation in cuDNN "
"convolutions (defaults to the same dtype as the inputs of the "
"convolutions (defaults to the same dtype as the inputs of the "
"convolutions, or float32 if inputs are float16)."
,
"convolutions, or float32 if inputs are float16)."
,
EnumStr
(
'as_input_f32'
,
'as_input'
,
'float16'
,
'float32'
,
EnumStr
(
*
SUPPORTED_DNN_CONV_PRECISION
),
'float64'
),
in_c_key
=
False
)
in_c_key
=
False
)
...
...
theano/gof/op.py
浏览文件 @
46783773
...
@@ -1413,7 +1413,10 @@ class COp(Op):
...
@@ -1413,7 +1413,10 @@ class COp(Op):
return
[]
return
[]
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
hash
(
tuple
(
self
.
func_codes
))
version
=
(
hash
(
tuple
(
self
.
func_codes
)),
)
if
hasattr
(
self
,
'params_type'
):
version
+=
(
self
.
params_type
.
c_code_cache_version
(),
)
return
version
def
c_init_code
(
self
):
def
c_init_code
(
self
):
"""
"""
...
...
theano/gof/type.py
浏览文件 @
46783773
...
@@ -963,6 +963,12 @@ class EnumType(Type, dict):
...
@@ -963,6 +963,12 @@ class EnumType(Type, dict):
"""
"""
return
alias
in
self
.
aliases
return
alias
in
self
.
aliases
def
get_aliases
(
self
):
"""
Return the list of all aliases in this enumeration.
"""
return
self
.
aliases
.
keys
()
def
__repr__
(
self
):
def
__repr__
(
self
):
names_to_aliases
=
{
constant_name
:
''
for
constant_name
in
self
}
names_to_aliases
=
{
constant_name
:
''
for
constant_name
in
self
}
for
alias
in
self
.
aliases
:
for
alias
in
self
.
aliases
:
...
@@ -1184,4 +1190,6 @@ class CEnumType(EnumList):
...
@@ -1184,4 +1190,6 @@ class CEnumType(EnumList):
fail
=
sub
[
'fail'
])
fail
=
sub
[
'fail'
])
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
1
,
super
(
CEnumType
,
self
)
.
c_code_cache_version
())
# C code depends on (C constant name, Python value) associations (given by `self.items()`),
# so we should better take them into account in C code version.
return
(
1
,
tuple
(
self
.
items
()),
super
(
CEnumType
,
self
)
.
c_code_cache_version
())
theano/gpuarray/conv_desc.c
浏览文件 @
46783773
#section support_code_apply
#section support_code_apply
int
APPLY_SPECIFIC
(
conv_desc
)(
PyArrayObject
*
filt_shp
,
int
APPLY_SPECIFIC
(
conv_desc
)(
PyArrayObject
*
filt_shp
,
cudnnConvolutionDescriptor_t
*
desc
)
{
cudnnConvolutionDescriptor_t
*
desc
,
PARAMS_TYPE
*
params
)
{
cudnnStatus_t
err
;
cudnnStatus_t
err
;
int
pad
[
3
]
=
{
PAD_0
,
PAD_1
,
PAD_
2
};
int
pad
[
3
]
=
{
params
->
pad0
,
params
->
pad1
,
params
->
pad
2
};
int
strides
[
3
]
=
{
SUB_0
,
SUB_1
,
SUB_
2
};
int
strides
[
3
]
=
{
params
->
sub0
,
params
->
sub1
,
params
->
sub
2
};
int
dilation
[
3
]
=
{
DIL_0
,
DIL_1
,
DIL_
2
};
int
dilation
[
3
]
=
{
params
->
dil0
,
params
->
dil1
,
params
->
dil
2
};
#if BORDER_MODE == 0
if
(
params
->
bmode
==
BORDER_MODE_FULL
)
{
pad
[
0
]
=
(
*
(
npy_int64
*
)
PyArray_GETPTR1
(
filt_shp
,
2
)
-
1
)
*
DIL_0
;
pad
[
0
]
=
(
*
(
npy_int64
*
)
PyArray_GETPTR1
(
filt_shp
,
2
)
-
1
)
*
dilation
[
0
]
;
pad
[
1
]
=
(
*
(
npy_int64
*
)
PyArray_GETPTR1
(
filt_shp
,
3
)
-
1
)
*
DIL_1
;
pad
[
1
]
=
(
*
(
npy_int64
*
)
PyArray_GETPTR1
(
filt_shp
,
3
)
-
1
)
*
dilation
[
1
]
;
#if NB_DIMS > 2
if
(
params
->
nb_dims
>
2
)
{
pad
[
2
]
=
(
*
(
npy_int64
*
)
PyArray_GETPTR1
(
filt_shp
,
4
)
-
1
)
*
DIL_2
;
pad
[
2
]
=
(
*
(
npy_int64
*
)
PyArray_GETPTR1
(
filt_shp
,
4
)
-
1
)
*
dilation
[
2
]
;
#endif
}
#elif BORDER_MODE == 2
}
else
if
(
params
->
bmode
==
BORDER_MODE_HALF
)
{
pad
[
0
]
=
((
*
(
npy_int64
*
)
PyArray_GETPTR1
(
filt_shp
,
2
)
-
1
)
*
DIL_0
+
1
)
/
2
;
pad
[
0
]
=
((
*
(
npy_int64
*
)
PyArray_GETPTR1
(
filt_shp
,
2
)
-
1
)
*
dilation
[
0
]
+
1
)
/
2
;
pad
[
1
]
=
((
*
(
npy_int64
*
)
PyArray_GETPTR1
(
filt_shp
,
3
)
-
1
)
*
DIL_1
+
1
)
/
2
;
pad
[
1
]
=
((
*
(
npy_int64
*
)
PyArray_GETPTR1
(
filt_shp
,
3
)
-
1
)
*
dilation
[
1
]
+
1
)
/
2
;
#if NB_DIMS > 2
if
(
params
->
nb_dims
>
2
)
{
pad
[
2
]
=
((
*
(
npy_int64
*
)
PyArray_GETPTR1
(
filt_shp
,
4
)
-
1
)
*
DIL_2
+
1
)
/
2
;
pad
[
2
]
=
((
*
(
npy_int64
*
)
PyArray_GETPTR1
(
filt_shp
,
4
)
-
1
)
*
dilation
[
2
]
+
1
)
/
2
;
#endif
}
#endif
}
if
(
PyArray_DIM
(
filt_shp
,
0
)
-
2
!=
NB_DIMS
)
{
if
(
PyArray_DIM
(
filt_shp
,
0
)
-
2
!=
params
->
nb_dims
)
{
PyErr_Format
(
PyExc_ValueError
,
"Filter shape has too many dimensions: "
PyErr_Format
(
PyExc_ValueError
,
"Filter shape has too many dimensions: "
"expected %d, got %lld."
,
NB_DIMS
,
"expected %d, got %lld."
,
params
->
nb_dims
,
(
long
long
)
PyArray_DIM
(
filt_shp
,
0
));
(
long
long
)
PyArray_DIM
(
filt_shp
,
0
));
return
-
1
;
return
-
1
;
}
}
...
@@ -35,8 +36,8 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
...
@@ -35,8 +36,8 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
return
-
1
;
return
-
1
;
}
}
err
=
cudnnSetConvolutionNdDescriptor
(
*
desc
,
NB_DIMS
,
pad
,
strides
,
err
=
cudnnSetConvolutionNdDescriptor
(
*
desc
,
params
->
nb_dims
,
pad
,
strides
,
dilation
,
CONV_MODE
,
PRECISION
);
dilation
,
params
->
conv_mode
,
params
->
precision
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_MemoryError
,
"could not set convolution "
PyErr_Format
(
PyExc_MemoryError
,
"could not set convolution "
"descriptor: %s"
,
cudnnGetErrorString
(
err
));
"descriptor: %s"
,
cudnnGetErrorString
(
err
));
...
...
theano/gpuarray/cudnn_defs.py
0 → 100644
浏览文件 @
46783773
"""
Declarations of cuDNN types and constants used in Theano gpuarray DNN module.
For every cuDNN API supported by Theano, this module defines a class that
provides the set of cuDNN definitions to be used in Theano Ops.
Use :func:`get_definitions` to get the right cuDNN definitions
for a given cuDNN version.
Currently supported cuDNN APIs:
- v5.1
- v6.0
"""
from
__future__
import
absolute_import
,
print_function
,
division
from
theano.gof
import
CEnumType
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
# We still register them here because we try to exactly copy cuDNN enums
# in Python side, but they will have no aliases associated, to help
# exclude them from lists of supported algorithms.
class
CuDNNV51
(
object
):
version
=
5
cudnnConvolutionMode_t
=
CEnumType
((
'CUDNN_CONVOLUTION'
,
'conv'
),
(
'CUDNN_CROSS_CORRELATION'
,
'cross'
),
ctype
=
'cudnnConvolutionMode_t'
)
cudnnDataType_t
=
CEnumType
((
'CUDNN_DATA_FLOAT'
,
'float32'
),
(
'CUDNN_DATA_DOUBLE'
,
'float64'
),
(
'CUDNN_DATA_HALF'
,
'float16'
),
# CUDNN_DATA_INT8 # new in v6
# CUDNN_DATA_INT32 # new in v6
# CUDNN_DATA_INT8x4 # new in v6
ctype
=
'cudnnDataType_t'
)
cudnnConvolutionFwdAlgo_t
=
CEnumType
((
'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
,
'none'
),
(
'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
,
'small'
),
(
'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
,
'large'
),
# not implemented:
(
'CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'
),
(
'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
,
'fft'
),
(
'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING'
,
'fft_tiling'
),
(
'CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD'
,
'winograd'
),
# TODO: Not yet tested/documented:
(
'CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED'
,
'winograd_non_fused'
),
ctype
=
'cudnnConvolutionFwdAlgo_t'
)
conv3d_fwd_algorithms
=
(
'none'
,
'small'
,
'fft_tiling'
)
cudnnConvolutionBwdFilterAlgo_t
=
CEnumType
((
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
,
'none'
),
(
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
,
'deterministic'
),
(
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
,
'fft'
),
(
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3'
,
'small'
),
# not implemented:
(
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD'
),
# TODO: not yet tested/documented:
(
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED'
,
'winograd_non_fused'
),
ctype
=
'cudnnConvolutionBwdFilterAlgo_t'
)
conv3d_bwd_filter_algorithms
=
(
'none'
,
'small'
)
cudnnConvolutionBwdDataAlgo_t
=
CEnumType
((
'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
,
'none'
),
(
'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
,
'deterministic'
),
(
'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
,
'fft'
),
(
'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING'
,
'fft_tiling'
),
(
'CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD'
,
'winograd'
),
# TODO: not yet tested/documented:
(
'CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED'
,
'winograd_non_fused'
),
ctype
=
'cudnnConvolutionBwdDataAlgo_t'
)
conv3d_bwd_data_algorithms
=
(
'none'
,
'deterministic'
,
'fft_tiling'
)
cudnnPoolingMode_t
=
CEnumType
((
'CUDNN_POOLING_MAX'
,
'max'
),
(
'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
,
'average_inc_pad'
),
(
'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
,
'average_exc_pad'
),
ctype
=
'cudnnPoolingMode_t'
)
cudnnSoftmaxAlgorithm_t
=
CEnumType
((
'CUDNN_SOFTMAX_FAST'
,
'fast'
),
(
'CUDNN_SOFTMAX_ACCURATE'
,
'accurate'
),
(
'CUDNN_SOFTMAX_LOG'
,
'log'
),
ctype
=
'cudnnSoftmaxAlgorithm_t'
)
cudnnSoftmaxMode_t
=
CEnumType
((
'CUDNN_SOFTMAX_MODE_INSTANCE'
,
'instance'
),
(
'CUDNN_SOFTMAX_MODE_CHANNEL'
,
'channel'
),
ctype
=
'cudnnSoftmaxMode_t'
)
cudnnBatchNormMode_t
=
CEnumType
((
'CUDNN_BATCHNORM_PER_ACTIVATION'
,
'per-activation'
),
(
'CUDNN_BATCHNORM_SPATIAL'
,
'spatial'
),
ctype
=
'cudnnBatchNormMode_t'
)
class
CuDNNV6
(
CuDNNV51
):
version
=
6
cudnnPoolingMode_t
=
CEnumType
((
'CUDNN_POOLING_MAX'
,
'max'
),
(
'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
,
'average_inc_pad'
),
(
'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
,
'average_exc_pad'
),
# new in v6:
(
'CUDNN_POOLING_MAX_DETERMINISTIC'
,
'max_deterministic'
),
ctype
=
'cudnnPoolingMode_t'
)
cudnnConvolutionBwdFilterAlgo_t
=
CEnumType
((
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
,
'none'
),
(
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
,
'deterministic'
),
(
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
,
'fft'
),
(
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3'
,
'small'
),
(
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD'
),
(
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED'
,
'winograd_non_fused'
),
# TODO: not yet tested/documented:
# new in v6:
(
'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING'
,
'fft_tiling'
),
ctype
=
'cudnnConvolutionBwdFilterAlgo_t'
)
def
get_definitions
(
cudnn_version
=
None
):
"""
Return cuDNN definitions to be used by Theano for the given cuDNN version.
``cudnn_version`` must be None or an integer
(typically the version returned by :func:`theano.gpuarray.dnn.version`).
if None, return definitions for the most recent supported cuDNN version.
"""
if
cudnn_version
is
not
None
and
cudnn_version
//
1000
==
5
:
return
CuDNNV51
()
# By default, we use definitions for the last supported cuDNN version.
return
CuDNNV6
()
theano/gpuarray/dnn.py
浏览文件 @
46783773
差异被折叠。
点击展开。
theano/gpuarray/dnn_batchnorm_grad.c
浏览文件 @
46783773
...
@@ -24,7 +24,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
...
@@ -24,7 +24,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
PyGpuArrayObject
*
scale
,
PyGpuArrayObject
*
x_mean
,
PyGpuArrayObject
*
scale
,
PyGpuArrayObject
*
x_mean
,
PyGpuArrayObject
*
x_invstd
,
npy_float64
epsilon
,
PyGpuArrayObject
*
x_invstd
,
npy_float64
epsilon
,
PyGpuArrayObject
**
dinp
,
PyGpuArrayObject
**
dscale
,
PyGpuArrayObject
**
dinp
,
PyGpuArrayObject
**
dscale
,
PyGpuArrayObject
**
dbias
,
cudnnHandle_t
_handle
)
{
PyGpuArrayObject
**
dbias
,
PARAMS_TYPE
*
params
)
{
PyGpuContextObject
*
c
=
inp
->
context
;
PyGpuContextObject
*
c
=
inp
->
context
;
if
(
c_set_tensorNd
(
inp
,
bn_input
)
!=
0
)
if
(
c_set_tensorNd
(
inp
,
bn_input
)
!=
0
)
...
@@ -70,8 +70,8 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
...
@@ -70,8 +70,8 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
betaParam
=
(
void
*
)
&
fbeta
;
betaParam
=
(
void
*
)
&
fbeta
;
}
}
cudnnStatus_t
err
=
cudnnBatchNormalizationBackward
(
cudnnStatus_t
err
=
cudnnBatchNormalizationBackward
(
_
handle
,
params
->
handle
,
MODE
,
params
->
mode
,
alphaData
,
alphaData
,
betaData
,
betaData
,
alphaParam
,
alphaParam
,
...
...
theano/gpuarray/dnn_batchnorm_inf.c
浏览文件 @
46783773
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
int
dnn_batchnorm_op
(
PyGpuArrayObject
*
inp
,
PyGpuArrayObject
*
scale
,
int
dnn_batchnorm_op
(
PyGpuArrayObject
*
inp
,
PyGpuArrayObject
*
scale
,
PyGpuArrayObject
*
bias
,
PyGpuArrayObject
*
est_mean
,
PyGpuArrayObject
*
bias
,
PyGpuArrayObject
*
est_mean
,
PyGpuArrayObject
*
est_var
,
npy_float64
epsilon
,
PyGpuArrayObject
*
est_var
,
npy_float64
epsilon
,
PyGpuArrayObject
**
outp
,
cudnnHandle_t
_handle
)
{
PyGpuArrayObject
**
outp
,
PARAMS_TYPE
*
params
)
{
PyGpuContextObject
*
c
=
inp
->
context
;
PyGpuContextObject
*
c
=
inp
->
context
;
if
(
c_set_tensorNd
(
inp
,
bn_input
)
!=
0
)
if
(
c_set_tensorNd
(
inp
,
bn_input
)
!=
0
)
...
@@ -16,14 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
...
@@ -16,14 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return
1
;
return
1
;
}
}
#ifdef INPLACE_OUTPUT
if
(
params
->
inplace
)
{
Py_XDECREF
(
*
outp
);
Py_XDECREF
(
*
outp
);
*
outp
=
inp
;
*
outp
=
inp
;
Py_INCREF
(
*
outp
);
Py_INCREF
(
*
outp
);
#else
}
else
{
if
(
theano_prep_output
(
outp
,
inp
->
ga
.
nd
,
inp
->
ga
.
dimensions
,
inp
->
ga
.
typecode
,
GA_C_ORDER
,
c
)
!=
0
)
if
(
theano_prep_output
(
outp
,
inp
->
ga
.
nd
,
inp
->
ga
.
dimensions
,
inp
->
ga
.
typecode
,
GA_C_ORDER
,
c
)
!=
0
)
return
1
;
return
1
;
#endif
}
if
(
c_set_tensorNd
(
*
outp
,
bn_output
)
!=
0
)
if
(
c_set_tensorNd
(
*
outp
,
bn_output
)
!=
0
)
return
1
;
return
1
;
...
@@ -43,8 +43,8 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
...
@@ -43,8 +43,8 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta
=
(
void
*
)
&
fbeta
;
beta
=
(
void
*
)
&
fbeta
;
}
}
cudnnStatus_t
err
=
cudnnBatchNormalizationForwardInference
(
cudnnStatus_t
err
=
cudnnBatchNormalizationForwardInference
(
_
handle
,
params
->
handle
,
MODE
,
params
->
mode
,
alpha
,
alpha
,
beta
,
beta
,
bn_input
,
bn_input
,
...
...
theano/gpuarray/dnn_fwd.c
浏览文件 @
46783773
#section init_code_struct
#section init_code_struct
#ifdef CHOOSE_ALGO
if
(
PARAMS
->
choose_algo
)
{
reuse_algo
=
0
;
reuse_algo
=
0
;
prev_algo
=
CONV_ALGO
;
prev_algo
=
PARAMS
->
conv_algo
;
#ifndef CHOOSE_ONCE
if
(
!
PARAMS
->
choose_once
)
{
memset
(
prev_img_dims
,
0
,
sizeof
(
prev_img_dims
));
memset
(
prev_img_dims
,
0
,
sizeof
(
prev_img_dims
));
memset
(
prev_kern_dims
,
0
,
sizeof
(
prev_kern_dims
));
memset
(
prev_kern_dims
,
0
,
sizeof
(
prev_kern_dims
));
#endif
}
#endif
}
#section support_code_struct
#section support_code_struct
#ifdef CHOOSE_ALGO
int
reuse_algo
;
int
reuse_algo
;
cudnnConvolutionFwdAlgo_t
prev_algo
;
cudnnConvolutionFwdAlgo_t
prev_algo
;
#ifndef CHOOSE_ONCE
size_t
prev_img_dims
[
5
];
size_t
prev_img_dims
[
5
];
size_t
prev_kern_dims
[
5
];
size_t
prev_kern_dims
[
5
];
#endif
#endif
int
int
APPLY_SPECIFIC
(
conv_fwd
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
kerns
,
APPLY_SPECIFIC
(
conv_fwd
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
kerns
,
...
@@ -26,7 +22,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -26,7 +22,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionDescriptor_t
desc
,
cudnnConvolutionDescriptor_t
desc
,
double
alpha
,
double
beta
,
double
alpha
,
double
beta
,
PyGpuArrayObject
**
output
,
PyGpuArrayObject
**
output
,
cudnnHandle_t
_handle
)
{
PARAMS_TYPE
*
params
)
{
PyGpuContextObject
*
c
=
input
->
context
;
PyGpuContextObject
*
c
=
input
->
context
;
void
*
alpha_p
;
void
*
alpha_p
;
void
*
beta_p
;
void
*
beta_p
;
...
@@ -54,17 +50,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -54,17 +50,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return
1
;
return
1
;
}
}
#ifdef CONV_INPLACE
if
(
params
->
inplace
)
{
Py_XDECREF
(
*
output
);
Py_XDECREF
(
*
output
);
*
output
=
om
;
*
output
=
om
;
Py_INCREF
(
*
output
);
Py_INCREF
(
*
output
);
#else
}
else
{
if
(
theano_prep_output
(
output
,
PyGpuArray_NDIM
(
om
),
PyGpuArray_DIMS
(
om
),
if
(
theano_prep_output
(
output
,
PyGpuArray_NDIM
(
om
),
PyGpuArray_DIMS
(
om
),
om
->
ga
.
typecode
,
GA_C_ORDER
,
c
)
!=
0
)
om
->
ga
.
typecode
,
GA_C_ORDER
,
c
)
!=
0
)
return
1
;
return
1
;
if
(
beta
!=
0
.
0
&&
pygpu_move
(
*
output
,
om
))
if
(
beta
!=
0
.
0
&&
pygpu_move
(
*
output
,
om
))
return
1
;
return
1
;
#endif
}
if
(
PyGpuArray_DIMS
(
input
)[
0
]
==
0
||
PyGpuArray_DIMS
(
kerns
)[
0
]
==
0
||
PyGpuArray_DIMS
(
kerns
)[
1
]
==
0
)
{
if
(
PyGpuArray_DIMS
(
input
)[
0
]
==
0
||
PyGpuArray_DIMS
(
kerns
)[
0
]
==
0
||
PyGpuArray_DIMS
(
kerns
)[
1
]
==
0
)
{
int
err2
=
GpuArray_memset
(
&
(
*
output
)
->
ga
,
0
);
int
err2
=
GpuArray_memset
(
&
(
*
output
)
->
ga
,
0
);
...
@@ -83,90 +79,90 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -83,90 +79,90 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if
(
c_set_tensorNd
(
*
output
,
APPLY_SPECIFIC
(
output
))
==
-
1
)
if
(
c_set_tensorNd
(
*
output
,
APPLY_SPECIFIC
(
output
))
==
-
1
)
return
1
;
return
1
;
cudnnConvolutionFwdAlgo_t
algo
=
CONV_ALGO
;
cudnnConvolutionFwdAlgo_t
algo
=
params
->
conv_algo
;
cuda_enter
(
c
->
ctx
);
cuda_enter
(
c
->
ctx
);
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo
=
1
;
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
input
,
i
)
==
prev_img_dims
[
i
]);
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
kerns
,
i
)
==
prev_kern_dims
[
i
]);
}
#endif
if
(
!
reuse_algo
)
{
size_t
free
;
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
);
if
(
params
->
choose_algo
)
{
if
(
err2
!=
GA_NO_ERROR
)
{
if
(
params
->
choose_once
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
reuse_algo
=
1
;
"memory information on the GPU"
);
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
cuda_exit
(
c
->
ctx
);
reuse_algo
=
(
reuse_algo
&&
return
1
;
PyGpuArray_DIM
(
input
,
i
)
==
prev_img_dims
[
i
]);
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
kerns
,
i
)
==
prev_kern_dims
[
i
]);
}
}
}
// Guess 4Mb if the info is not available
if
(
!
reuse_algo
)
{
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
size_t
free
;
#ifdef CHOOSE_TIME
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
);
int
count
;
if
(
err2
!=
GA_NO_ERROR
)
{
cudnnConvolutionFwdAlgoPerf_t
choice
;
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
gpudata
*
tmpmem
;
"memory information on the GPU"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
// Guess 4Mb if the info is not available
if
(
tmpmem
==
NULL
)
{
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
return
-
1
;
if
(
params
->
choose_time
)
{
int
count
;
cudnnConvolutionFwdAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
return
-
1
;
}
// We don't sync the buffer as we don't care about the values.
err
=
cudnnFindConvolutionForwardAlgorithmEx
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
desc
,
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
*
output
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
algo
=
choice
.
algo
;
}
else
{
err
=
cudnnGetConvolutionForwardAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
desc
,
APPLY_SPECIFIC
(
output
),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
prev_algo
=
algo
;
}
else
{
algo
=
prev_algo
;
}
}
// We don't sync the buffer as we don't care about the values.
err
=
cudnnFindConvolutionForwardAlgorithmEx
(
_handle
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
desc
,
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
*
output
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
params
->
choose_once
)
{
PyErr_Format
(
PyExc_RuntimeError
,
reuse_algo
=
1
;
"error selecting convolution algo: %s"
,
}
else
{
cudnnGetErrorString
(
err
));
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
cuda_exit
(
c
->
ctx
);
prev_img_dims
[
i
]
=
PyGpuArray_DIM
(
input
,
i
);
return
1
;
prev_kern_dims
[
i
]
=
PyGpuArray_DIM
(
kerns
,
i
);
}
}
algo
=
choice
.
algo
;
#else
err
=
cudnnGetConvolutionForwardAlgorithm
(
_handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
desc
,
APPLY_SPECIFIC
(
output
),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
#endif
prev_algo
=
algo
;
}
else
{
algo
=
prev_algo
;
}
}
#ifdef CHOOSE_ONCE
reuse_algo
=
1
;
#else
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
prev_img_dims
[
i
]
=
PyGpuArray_DIM
(
input
,
i
);
prev_kern_dims
[
i
]
=
PyGpuArray_DIM
(
kerns
,
i
);
}
#endif
#endif
/* These two algos are not supported for 3d conv */
/* These two algos are not supported for 3d conv */
if
(
PyGpuArray_NDIM
(
input
)
==
5
&&
if
(
PyGpuArray_NDIM
(
input
)
==
5
&&
(
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
||
(
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
||
...
@@ -201,20 +197,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -201,20 +197,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return
1
;
return
1
;
}
}
if
(
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT
)
if
(
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT
)
{
{
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
{
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
}
}
}
else
{
else
{
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
)
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
)
{
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
}
}
}
...
@@ -223,7 +215,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -223,7 +215,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{
{
size_t
worksize
;
size_t
worksize
;
gpudata
*
workspace
;
gpudata
*
workspace
;
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
_
handle
,
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
kerns
),
desc
,
desc
,
...
@@ -236,7 +228,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -236,7 +228,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// TODO: Print a warning
// TODO: Print a warning
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
_
handle
,
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
kerns
),
desc
,
desc
,
...
@@ -273,7 +265,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...
@@ -273,7 +265,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait
((
*
output
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
cuda_wait
((
*
output
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
err
=
cudnnConvolutionForward
(
err
=
cudnnConvolutionForward
(
_
handle
,
params
->
handle
,
alpha_p
,
alpha_p
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
...
...
theano/gpuarray/dnn_gi.c
浏览文件 @
46783773
#section init_code_struct
#section init_code_struct
#ifdef CHOOSE_ALGO
// #ifdef CHOOSE_ALGO
reuse_algo
=
0
;
if
(
PARAMS
->
choose_algo
)
{
prev_algo
=
CONV_ALGO
;
reuse_algo
=
0
;
#ifndef CHOOSE_ONCE
prev_algo
=
PARAMS
->
conv_algo
;
memset
(
prev_kern_dims
,
0
,
sizeof
(
prev_kern_dims
));
// #ifndef CHOOSE_ONCE
memset
(
prev_top_dims
,
0
,
sizeof
(
prev_top_dims
));
if
(
!
PARAMS
->
choose_once
)
{
#endif
memset
(
prev_kern_dims
,
0
,
sizeof
(
prev_kern_dims
));
#endif
memset
(
prev_top_dims
,
0
,
sizeof
(
prev_top_dims
));
}
// #endif
}
// #endif
#section support_code_struct
#section support_code_struct
#ifdef CHOOSE_ALGO
int
reuse_algo
;
int
reuse_algo
=
0
;
cudnnConvolutionBwdDataAlgo_t
prev_algo
;
cudnnConvolutionBwdDataAlgo_t
prev_algo
=
CONV_ALGO
;
#ifndef CHOOSE_ONCE
size_t
prev_kern_dims
[
5
]
=
{
0
};
size_t
prev_kern_dims
[
5
]
=
{
0
};
size_t
prev_top_dims
[
5
]
=
{
0
};
size_t
prev_top_dims
[
5
]
=
{
0
};
#endif
#endif
int
int
APPLY_SPECIFIC
(
conv_gi
)(
PyGpuArrayObject
*
kerns
,
PyGpuArrayObject
*
output
,
APPLY_SPECIFIC
(
conv_gi
)(
PyGpuArrayObject
*
kerns
,
PyGpuArrayObject
*
output
,
PyGpuArrayObject
*
im
,
PyGpuArrayObject
*
im
,
cudnnConvolutionDescriptor_t
desc
,
cudnnConvolutionDescriptor_t
desc
,
double
alpha
,
double
beta
,
PyGpuArrayObject
**
input
,
double
alpha
,
double
beta
,
PyGpuArrayObject
**
input
,
cudnnHandle_t
_handle
)
{
PARAMS_TYPE
*
params
)
{
PyGpuContextObject
*
c
=
kerns
->
context
;
PyGpuContextObject
*
c
=
kerns
->
context
;
void
*
alpha_p
;
void
*
alpha_p
;
void
*
beta_p
;
void
*
beta_p
;
...
@@ -53,17 +53,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
...
@@ -53,17 +53,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return
1
;
return
1
;
}
}
#ifdef CONV_INPLACE
// #ifdef CONV_INPLACE
Py_XDECREF
(
*
input
);
if
(
params
->
inplace
)
{
*
input
=
im
;
Py_XDECREF
(
*
input
);
Py_INCREF
(
*
input
);
*
input
=
im
;
#else
Py_INCREF
(
*
input
);
if
(
theano_prep_output
(
input
,
PyGpuArray_NDIM
(
im
),
PyGpuArray_DIMS
(
im
),
// #else
im
->
ga
.
typecode
,
GA_C_ORDER
,
c
)
!=
0
)
}
else
{
return
1
;
if
(
theano_prep_output
(
input
,
PyGpuArray_NDIM
(
im
),
PyGpuArray_DIMS
(
im
),
if
(
beta
!=
0
.
0
&&
pygpu_move
(
*
input
,
im
))
im
->
ga
.
typecode
,
GA_C_ORDER
,
c
)
!=
0
)
return
1
;
return
1
;
#endif
if
(
beta
!=
0
.
0
&&
pygpu_move
(
*
input
,
im
))
return
1
;
}
// #endif
if
(
PyGpuArray_DIMS
(
im
)[
0
]
==
0
||
PyGpuArray_DIMS
(
kerns
)[
0
]
==
0
||
PyGpuArray_DIMS
(
kerns
)[
1
]
==
0
)
{
if
(
PyGpuArray_DIMS
(
im
)[
0
]
==
0
||
PyGpuArray_DIMS
(
kerns
)[
0
]
==
0
||
PyGpuArray_DIMS
(
kerns
)[
1
]
==
0
)
{
int
err2
=
GpuArray_memset
(
&
(
*
input
)
->
ga
,
0
);
int
err2
=
GpuArray_memset
(
&
(
*
input
)
->
ga
,
0
);
...
@@ -82,7 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
...
@@ -82,7 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if
(
c_set_tensorNd
(
*
input
,
APPLY_SPECIFIC
(
input
))
==
-
1
)
if
(
c_set_tensorNd
(
*
input
,
APPLY_SPECIFIC
(
input
))
==
-
1
)
return
1
;
return
1
;
cudnnConvolutionBwdDataAlgo_t
algo
=
CONV_ALGO
;
cudnnConvolutionBwdDataAlgo_t
algo
=
params
->
conv_algo
;
cuda_enter
(
c
->
ctx
);
cuda_enter
(
c
->
ctx
);
...
@@ -128,84 +131,93 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
...
@@ -128,84 +131,93 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
}
}
}
#ifdef CHOOSE_ALGO
// #ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
if
(
params
->
choose_algo
)
{
reuse_algo
=
1
;
// #ifndef CHOOSE_ONCE
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
kerns
);
i
++
)
{
if
(
!
params
->
choose_once
)
{
reuse_algo
=
(
reuse_algo
&&
reuse_algo
=
1
;
PyGpuArray_DIM
(
kerns
,
i
)
==
prev_kern_dims
[
i
]);
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
kerns
);
i
++
)
{
reuse_algo
=
(
reuse_algo
&&
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
output
,
i
)
==
prev_top_dims
[
i
]);
PyGpuArray_DIM
(
kerns
,
i
)
==
prev_kern_dims
[
i
]);
}
reuse_algo
=
(
reuse_algo
&&
#endif
PyGpuArray_DIM
(
output
,
i
)
==
prev_top_dims
[
i
]);
}
}
// #endif
if
(
!
reuse_algo
)
{
if
(
!
reuse_algo
)
{
size_t
free
;
size_t
free
;
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
);
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
);
if
(
err2
!=
GA_NO_ERROR
)
{
if
(
err2
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
"memory information on the GPU"
);
"memory information on the GPU"
);
cuda_exit
(
c
->
ctx
);
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
}
// Guess 4Mb if the info is not available
// Guess 4Mb if the info is not available
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
#ifdef CHOOSE_TIME
// #ifdef CHOOSE_TIME
int
count
;
if
(
params
->
choose_time
)
{
cudnnConvolutionBwdDataAlgoPerf_t
choice
;
int
count
;
gpudata
*
tmpmem
;
cudnnConvolutionBwdDataAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
return
-
1
;
return
-
1
;
}
}
err
=
cudnnFindConvolutionBackwardDataAlgorithmEx
(
err
=
cudnnFindConvolutionBackwardDataAlgorithmEx
(
_handle
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
*
input
),
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
*
input
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
gpudata_release
(
tmpmem
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
algo
=
choice
.
algo
;
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
// #else
cudnnGetErrorString
(
err
));
}
else
{
cuda_exit
(
c
->
ctx
);
err
=
cudnnGetConvolutionBackwardDataAlgorithm
(
return
1
;
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
}
// #endif
algo
=
choice
.
algo
;
prev_algo
=
algo
;
#else
}
else
{
err
=
cudnnGetConvolutionBackwardDataAlgorithm
(
algo
=
prev_algo
;
_handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
#endif
prev_algo
=
algo
;
}
else
{
algo
=
prev_algo
;
}
#ifdef CHOOSE_ONCE
// #ifdef CHOOSE_ONCE
reuse_algo
=
1
;
if
(
params
->
choose_once
)
{
#else
reuse_algo
=
1
;
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
kerns
);
i
++
)
{
// #else
prev_kern_dims
[
i
]
=
PyGpuArray_DIM
(
kerns
,
i
);
}
else
{
prev_top_dims
[
i
]
=
PyGpuArray_DIM
(
output
,
i
);
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
kerns
);
i
++
)
{
prev_kern_dims
[
i
]
=
PyGpuArray_DIM
(
kerns
,
i
);
prev_top_dims
[
i
]
=
PyGpuArray_DIM
(
output
,
i
);
}
}
// #endif
}
}
#endif
// #endif
#endif
// The FFT implementation does not support strides, 1x1 filters or inputs
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// with a spatial dimension larger than 1024. The tiled-FFT implementation
...
@@ -258,7 +270,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
...
@@ -258,7 +270,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata
*
workspace
;
gpudata
*
workspace
;
err
=
cudnnGetConvolutionBackwardDataWorkspaceSize
(
err
=
cudnnGetConvolutionBackwardDataWorkspaceSize
(
_
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
algo
,
&
worksize
);
APPLY_SPECIFIC
(
input
),
algo
,
&
worksize
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
...
@@ -283,7 +295,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
...
@@ -283,7 +295,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait
((
*
input
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
cuda_wait
((
*
input
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
err
=
cudnnConvolutionBackwardData
(
err
=
cudnnConvolutionBackwardData
(
_
handle
,
params
->
handle
,
alpha_p
,
alpha_p
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
...
...
theano/gpuarray/dnn_gw.c
浏览文件 @
46783773
#section init_code_struct
#section init_code_struct
#ifdef CHOOSE_ALGO
if
(
PARAMS
->
choose_algo
)
{
reuse_algo
=
0
;
reuse_algo
=
0
;
prev_algo
=
CONV_ALGO
;
prev_algo
=
PARAMS
->
conv_algo
;
#ifndef CHOOSE_ONCE
if
(
!
PARAMS
->
choose_once
)
{
memset
(
prev_img_dims
,
0
,
sizeof
(
prev_img_dims
));
memset
(
prev_img_dims
,
0
,
sizeof
(
prev_img_dims
));
memset
(
prev_top_dims
,
0
,
sizeof
(
prev_top_dims
));
memset
(
prev_top_dims
,
0
,
sizeof
(
prev_top_dims
));
#endif
}
#endif
}
#section support_code_struct
#section support_code_struct
#ifdef CHOOSE_ALGO
int
reuse_algo
;
int
reuse_algo
;
cudnnConvolutionBwdFilterAlgo_t
prev_algo
;
cudnnConvolutionBwdFilterAlgo_t
prev_algo
;
#ifndef CHOOSE_ONCE
size_t
prev_img_dims
[
5
];
size_t
prev_img_dims
[
5
];
size_t
prev_top_dims
[
5
];
size_t
prev_top_dims
[
5
];
#endif
#endif
int
int
APPLY_SPECIFIC
(
conv_gw
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
output
,
APPLY_SPECIFIC
(
conv_gw
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
output
,
PyGpuArrayObject
*
km
,
PyGpuArrayObject
*
km
,
cudnnConvolutionDescriptor_t
desc
,
cudnnConvolutionDescriptor_t
desc
,
double
alpha
,
double
beta
,
PyGpuArrayObject
**
kerns
,
double
alpha
,
double
beta
,
PyGpuArrayObject
**
kerns
,
cudnnHandle_t
_handle
)
{
PARAMS_TYPE
*
params
)
{
PyGpuContextObject
*
c
=
input
->
context
;
PyGpuContextObject
*
c
=
input
->
context
;
void
*
alpha_p
;
void
*
alpha_p
;
void
*
beta_p
;
void
*
beta_p
;
...
@@ -53,17 +49,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...
@@ -53,17 +49,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return
1
;
return
1
;
}
}
#ifdef CONV_INPLACE
if
(
params
->
inplace
)
{
Py_XDECREF
(
*
kerns
);
Py_XDECREF
(
*
kerns
);
*
kerns
=
km
;
*
kerns
=
km
;
Py_INCREF
(
*
kerns
);
Py_INCREF
(
*
kerns
);
#else
}
else
{
if
(
theano_prep_output
(
kerns
,
PyGpuArray_NDIM
(
km
),
PyGpuArray_DIMS
(
km
),
if
(
theano_prep_output
(
kerns
,
PyGpuArray_NDIM
(
km
),
PyGpuArray_DIMS
(
km
),
km
->
ga
.
typecode
,
GA_C_ORDER
,
c
)
!=
0
)
km
->
ga
.
typecode
,
GA_C_ORDER
,
c
)
!=
0
)
return
1
;
return
1
;
if
(
beta
!=
0
.
0
&&
pygpu_move
(
*
kerns
,
km
))
if
(
beta
!=
0
.
0
&&
pygpu_move
(
*
kerns
,
km
))
return
1
;
return
1
;
#endif
}
if
(
PyGpuArray_DIMS
(
input
)[
0
]
==
0
||
PyGpuArray_DIMS
(
km
)[
0
]
==
0
||
PyGpuArray_DIMS
(
km
)[
1
]
==
0
)
{
if
(
PyGpuArray_DIMS
(
input
)[
0
]
==
0
||
PyGpuArray_DIMS
(
km
)[
0
]
==
0
||
PyGpuArray_DIMS
(
km
)[
1
]
==
0
)
{
int
err2
=
GpuArray_memset
(
&
(
*
kerns
)
->
ga
,
0
);
int
err2
=
GpuArray_memset
(
&
(
*
kerns
)
->
ga
,
0
);
...
@@ -82,7 +78,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...
@@ -82,7 +78,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if
(
c_set_filter
(
*
kerns
,
APPLY_SPECIFIC
(
kerns
))
==
-
1
)
if
(
c_set_filter
(
*
kerns
,
APPLY_SPECIFIC
(
kerns
))
==
-
1
)
return
1
;
return
1
;
cudnnConvolutionBwdFilterAlgo_t
algo
=
CONV_ALGO
;
cudnnConvolutionBwdFilterAlgo_t
algo
=
params
->
conv_algo
;
cuda_enter
(
c
->
ctx
);
cuda_enter
(
c
->
ctx
);
...
@@ -128,86 +124,85 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...
@@ -128,86 +124,85 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
}
}
}
#ifdef CHOOSE_ALGO
if
(
params
->
choose_algo
)
{
#ifndef CHOOSE_ONCE
if
(
!
params
->
choose_once
)
{
reuse_algo
=
1
;
reuse_algo
=
1
;
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
reuse_algo
=
(
reuse_algo
&&
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
input
,
i
)
==
prev_img_dims
[
i
]);
PyGpuArray_DIM
(
input
,
i
)
==
prev_img_dims
[
i
]);
reuse_algo
=
(
reuse_algo
&&
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
output
,
i
)
==
prev_top_dims
[
i
]);
PyGpuArray_DIM
(
output
,
i
)
==
prev_top_dims
[
i
]);
}
}
#endif
if
(
!
reuse_algo
)
{
size_t
free
;
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
);
if
(
err2
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
"memory information on the GPU"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
// Guess 4Mb if the info is not available
if
(
!
reuse_algo
)
{
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
size_t
free
;
#ifdef CHOOSE_TIME
int
count
;
cudnnConvolutionBwdFilterAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
return
-
1
;
}
err
=
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
);
_handle
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
if
(
err2
!=
GA_NO_ERROR
)
{
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
*
kerns
),
"memory information on the GPU"
);
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
cuda_exit
(
c
->
ctx
);
gpudata_release
(
tmpmem
);
return
1
;
}
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
// Guess 4Mb if the info is not available
"error selecting convolution algo: %s"
,
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
if
(
params
->
choose_time
)
{
return
1
;
int
count
;
cudnnConvolutionBwdFilterAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
return
-
1
;
}
err
=
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
*
kerns
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
algo
=
choice
.
algo
;
}
else
{
err
=
cudnnGetConvolutionBackwardFilterAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
prev_algo
=
algo
;
}
else
{
algo
=
prev_algo
;
}
}
algo
=
choice
.
algo
;
if
(
params
->
choose_once
)
{
#else
reuse_algo
=
1
;
err
=
cudnnGetConvolutionBackwardFilterAlgorithm
(
}
else
{
_handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
desc
,
APPLY_SPECIFIC
(
kerns
),
prev_img_dims
[
i
]
=
PyGpuArray_DIM
(
input
,
i
);
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
prev_top_dims
[
i
]
=
PyGpuArray_DIM
(
output
,
i
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
}
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
#endif
prev_algo
=
algo
;
}
else
{
algo
=
prev_algo
;
}
#ifdef CHOOSE_ONCE
reuse_algo
=
1
;
#else
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
prev_img_dims
[
i
]
=
PyGpuArray_DIM
(
input
,
i
);
prev_top_dims
[
i
]
=
PyGpuArray_DIM
(
output
,
i
);
}
}
#endif
#endif
// The FFT implementation does not support strides, 1x1 filters or inputs
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
// with a spatial dimension larger than 1024.
...
@@ -246,7 +241,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...
@@ -246,7 +241,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
gpudata
*
workspace
;
gpudata
*
workspace
;
err
=
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
err
=
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
_
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
algo
,
&
worksize
);
APPLY_SPECIFIC
(
kerns
),
algo
,
&
worksize
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
...
@@ -270,7 +265,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...
@@ -270,7 +265,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait
((
*
kerns
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
cuda_wait
((
*
kerns
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
err
=
cudnnConvolutionBackwardFilter
(
err
=
cudnnConvolutionBackwardFilter
(
_
handle
,
params
->
handle
,
alpha_p
,
alpha_p
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
...
...
theano/gpuarray/dnn_pool.c
浏览文件 @
46783773
...
@@ -42,7 +42,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
...
@@ -42,7 +42,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
PyArrayObject
*
stride
,
PyArrayObject
*
stride
,
PyArrayObject
*
pad
,
PyArrayObject
*
pad
,
PyGpuArrayObject
**
out
,
PyGpuArrayObject
**
out
,
cudnnHandle_t
_handle
)
{
PARAMS_TYPE
*
params
)
{
PyGpuContextObject
*
c
=
img
->
context
;
PyGpuContextObject
*
c
=
img
->
context
;
size_t
dims
[
5
];
size_t
dims
[
5
];
cudnnStatus_t
err
;
cudnnStatus_t
err
;
...
@@ -90,7 +90,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
...
@@ -90,7 +90,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
if
(
c_set_tensorNd
(
*
out
,
APPLY_SPECIFIC
(
output
))
!=
0
)
if
(
c_set_tensorNd
(
*
out
,
APPLY_SPECIFIC
(
output
))
!=
0
)
return
1
;
return
1
;
err
=
cudnnSetPoolingNdDescriptor
(
APPLY_SPECIFIC
(
pool
),
MODE_FLAG
,
CUDNN_PROPAGATE_NAN
,
ndims
,
w
,
p
,
s
);
err
=
cudnnSetPoolingNdDescriptor
(
APPLY_SPECIFIC
(
pool
),
params
->
mode
,
CUDNN_PROPAGATE_NAN
,
ndims
,
w
,
p
,
s
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"could not set op descriptor %s"
,
cudnnGetErrorString
(
err
));
PyErr_Format
(
PyExc_RuntimeError
,
"could not set op descriptor %s"
,
cudnnGetErrorString
(
err
));
...
@@ -124,7 +124,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
...
@@ -124,7 +124,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait
((
*
out
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
cuda_wait
((
*
out
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
err
=
cudnnPoolingForward
(
err
=
cudnnPoolingForward
(
_
handle
,
APPLY_SPECIFIC
(
pool
),
params
->
handle
,
APPLY_SPECIFIC
(
pool
),
alpha
,
alpha
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
img
),
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
img
),
beta
,
beta
,
...
...
theano/gpuarray/dnn_pool_grad.c
浏览文件 @
46783773
...
@@ -64,7 +64,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
...
@@ -64,7 +64,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyArrayObject
*
stride
,
PyArrayObject
*
stride
,
PyArrayObject
*
pad
,
PyArrayObject
*
pad
,
PyGpuArrayObject
**
inp_grad
,
PyGpuArrayObject
**
inp_grad
,
cudnnHandle_t
_handle
)
{
PARAMS_TYPE
*
params
)
{
PyGpuContextObject
*
c
=
inp
->
context
;
PyGpuContextObject
*
c
=
inp
->
context
;
cudnnStatus_t
err
;
cudnnStatus_t
err
;
...
@@ -116,7 +116,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
...
@@ -116,7 +116,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
s
[
i
]
=
*
((
npy_intp
*
)
PyArray_GETPTR1
(
stride
,
i
));
s
[
i
]
=
*
((
npy_intp
*
)
PyArray_GETPTR1
(
stride
,
i
));
}
}
err
=
cudnnSetPoolingNdDescriptor
(
APPLY_SPECIFIC
(
pool
),
MODE_FLAG
,
CUDNN_PROPAGATE_NAN
,
ndims
,
w
,
p
,
s
);
err
=
cudnnSetPoolingNdDescriptor
(
APPLY_SPECIFIC
(
pool
),
params
->
mode
,
CUDNN_PROPAGATE_NAN
,
ndims
,
w
,
p
,
s
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"could not set op descriptor %s"
,
cudnnGetErrorString
(
err
));
PyErr_Format
(
PyExc_RuntimeError
,
"could not set op descriptor %s"
,
cudnnGetErrorString
(
err
));
...
@@ -155,7 +155,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
...
@@ -155,7 +155,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait
((
*
inp_grad
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
cuda_wait
((
*
inp_grad
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
err
=
cudnnPoolingBackward
(
err
=
cudnnPoolingBackward
(
_
handle
,
APPLY_SPECIFIC
(
pool
),
params
->
handle
,
APPLY_SPECIFIC
(
pool
),
alpha
,
alpha
,
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
out
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
out
),
APPLY_SPECIFIC
(
output_grad
),
PyGpuArray_DEV_DATA
(
out_grad
),
APPLY_SPECIFIC
(
output_grad
),
PyGpuArray_DEV_DATA
(
out_grad
),
...
...
theano/gpuarray/tests/test_dnn.py
浏览文件 @
46783773
...
@@ -31,6 +31,20 @@ mode_with_gpu = mode_with_gpu.including()
...
@@ -31,6 +31,20 @@ mode_with_gpu = mode_with_gpu.including()
mode_with_gpu
.
check_py_code
=
False
mode_with_gpu
.
check_py_code
=
False
# This variable will store the list of pooling modes available with the current runtime cuDNN version.
# Don't use this variable directly, always call `get_dnn_pool_modes()` instead.
dnn_pool_modes
=
None
def
get_dnn_pool_modes
():
# This function is called only by pooling tests to initialize and/or get dnn_pool_modes.
global
dnn_pool_modes
if
dnn_pool_modes
is
None
:
from
..
import
cudnn_defs
dnn_pool_modes
=
cudnn_defs
.
get_definitions
(
dnn
.
version
(
raises
=
False
))
.
cudnnPoolingMode_t
.
get_aliases
()
return
dnn_pool_modes
# If using float16, set CUDNN precision to float32
# If using float16, set CUDNN precision to float32
def
set_precision
(
floatX
):
def
set_precision
(
floatX
):
if
floatX
==
"float16"
:
if
floatX
==
"float16"
:
...
@@ -155,11 +169,7 @@ def test_pooling():
...
@@ -155,11 +169,7 @@ def test_pooling():
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
utt
.
seed_rng
()
utt
.
seed_rng
()
# 'average_exc_pad' is disabled for versions < 4004
modes
=
get_dnn_pool_modes
()
if
dnn
.
version
(
raises
=
False
)
<
4004
:
modes
=
(
'max'
,
'average_inc_pad'
)
else
:
modes
=
(
'max'
,
'average_inc_pad'
,
'average_exc_pad'
)
x
=
T
.
tensor4
()
x
=
T
.
tensor4
()
for
mode
,
pad
in
product
(
modes
,
for
mode
,
pad
in
product
(
modes
,
...
@@ -242,7 +252,9 @@ def test_pooling():
...
@@ -242,7 +252,9 @@ def test_pooling():
for
node
in
fg
.
maker
.
fgraph
.
toposort
()])
for
node
in
fg
.
maker
.
fgraph
.
toposort
()])
def
test_pooling_with_tensor_vars
():
# This test will be run with different values of 'mode'
# (see next test below).
def
run_pooling_with_tensor_vars
(
mode
):
if
not
dnn
.
dnn_available
(
test_ctx_name
):
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
utt
.
seed_rng
()
utt
.
seed_rng
()
...
@@ -251,7 +263,6 @@ def test_pooling_with_tensor_vars():
...
@@ -251,7 +263,6 @@ def test_pooling_with_tensor_vars():
ws
=
theano
.
shared
(
np
.
array
([
2
,
2
],
dtype
=
'int32'
))
ws
=
theano
.
shared
(
np
.
array
([
2
,
2
],
dtype
=
'int32'
))
stride
=
theano
.
shared
(
np
.
array
([
1
,
1
],
dtype
=
'int32'
))
stride
=
theano
.
shared
(
np
.
array
([
1
,
1
],
dtype
=
'int32'
))
pad
=
theano
.
shared
(
np
.
array
([
0
,
0
],
dtype
=
'int32'
))
pad
=
theano
.
shared
(
np
.
array
([
0
,
0
],
dtype
=
'int32'
))
mode
=
'max'
def
fn
(
x
):
def
fn
(
x
):
dnn_op
=
dnn
.
dnn_pool
(
dnn_op
=
dnn
.
dnn_pool
(
...
@@ -297,6 +308,12 @@ def test_pooling_with_tensor_vars():
...
@@ -297,6 +308,12 @@ def test_pooling_with_tensor_vars():
i
+=
1
i
+=
1
def
test_pooling_with_tensor_vars
():
# Let's test for mode 'max' and also for 'max_deterministic' if available.
for
mode
in
[
m
for
m
in
get_dnn_pool_modes
()
if
m
in
(
'max'
,
'max_deterministic'
)]:
yield
(
run_pooling_with_tensor_vars
,
mode
)
def
test_pooling3d
():
def
test_pooling3d
():
# 3d pooling requires version 3 or newer.
# 3d pooling requires version 3 or newer.
if
not
dnn
.
dnn_available
(
test_ctx_name
)
or
dnn
.
version
(
raises
=
False
)
<
3000
:
if
not
dnn
.
dnn_available
(
test_ctx_name
)
or
dnn
.
version
(
raises
=
False
)
<
3000
:
...
@@ -307,11 +324,7 @@ def test_pooling3d():
...
@@ -307,11 +324,7 @@ def test_pooling3d():
mode_without_gpu_ref
=
theano
.
compile
.
mode
.
get_mode
(
mode_without_gpu_ref
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpuarray'
)
'FAST_RUN'
)
.
excluding
(
'gpuarray'
)
# 'average_exc_pad' is disabled for versions < 4004
modes
=
get_dnn_pool_modes
()
if
dnn
.
version
(
raises
=
False
)
<
4004
:
modes
=
(
'max'
,
'average_inc_pad'
)
else
:
modes
=
(
'max'
,
'average_inc_pad'
,
'average_exc_pad'
)
x
=
T
.
tensor5
()
x
=
T
.
tensor5
()
for
mode
,
pad
in
product
(
modes
,
for
mode
,
pad
in
product
(
modes
,
...
@@ -467,11 +480,7 @@ def test_pooling_opt_arbitrary_dimensions():
...
@@ -467,11 +480,7 @@ def test_pooling_opt_arbitrary_dimensions():
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
utt
.
seed_rng
()
utt
.
seed_rng
()
# 'average_exc_pad' is disabled for versions < 4004
modes
=
get_dnn_pool_modes
()
if
dnn
.
version
(
raises
=
False
)
<
4004
:
modes
=
(
'max'
,
'average_inc_pad'
)
else
:
modes
=
(
'max'
,
'average_inc_pad'
,
'average_exc_pad'
)
for
n_non_pool_dims
in
(
0
,
1
,
2
,
3
):
for
n_non_pool_dims
in
(
0
,
1
,
2
,
3
):
for
ws
in
((
2
,
2
),
(
3
,
3
,
3
)):
for
ws
in
((
2
,
2
),
(
3
,
3
,
3
)):
...
@@ -498,7 +507,7 @@ def test_pooling_opt_arbitrary_dimensions():
...
@@ -498,7 +507,7 @@ def test_pooling_opt_arbitrary_dimensions():
fc
=
theano
.
function
([],
out
,
mode
=
mode_without_gpu
)
fc
=
theano
.
function
([],
out
,
mode
=
mode_without_gpu
)
assert
any
([
isinstance
(
node
.
op
,
Pool
)
assert
any
([
isinstance
(
node
.
op
,
Pool
)
for
node
in
fc
.
maker
.
fgraph
.
toposort
()])
for
node
in
fc
.
maker
.
fgraph
.
toposort
()])
if
mode
==
'max'
:
if
mode
in
(
'max'
,
'max_deterministic'
)
:
assert
any
([
isinstance
(
node
.
op
,
MaxPoolGrad
)
assert
any
([
isinstance
(
node
.
op
,
MaxPoolGrad
)
for
node
in
fc
.
maker
.
fgraph
.
toposort
()])
for
node
in
fc
.
maker
.
fgraph
.
toposort
()])
else
:
else
:
...
@@ -780,11 +789,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
...
@@ -780,11 +789,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype
=
theano
.
config
.
floatX
dtype
=
theano
.
config
.
floatX
)
)
# 'average_exc_pad' is disabled for versions < 4004
modes
=
get_dnn_pool_modes
()
if
dnn
.
version
(
raises
=
False
)
<
4004
:
modes
=
[
'max'
,
'average_inc_pad'
]
else
:
modes
=
[
'max'
,
'average_inc_pad'
,
'average_exc_pad'
]
for
params
in
product
(
for
params
in
product
(
[(
1
,
1
),
(
2
,
2
),
(
3
,
3
)],
[(
1
,
1
),
(
2
,
2
),
(
3
,
3
)],
...
@@ -807,11 +812,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
...
@@ -807,11 +812,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype
=
theano
.
config
.
floatX
dtype
=
theano
.
config
.
floatX
)
)
# 'average_exc_pad' is disabled for versions < 4004
modes
=
get_dnn_pool_modes
()
if
dnn
.
version
(
raises
=
False
)
<
4004
:
modes
=
[
'max'
,
'average_inc_pad'
]
else
:
modes
=
[
'max'
,
'average_inc_pad'
,
'average_exc_pad'
]
for
params
in
product
(
for
params
in
product
(
[(
1
,
1
,
1
),
(
2
,
2
,
2
),
(
3
,
3
,
3
)],
[(
1
,
1
,
1
),
(
2
,
2
,
2
),
(
3
,
3
,
3
)],
...
@@ -847,7 +848,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
...
@@ -847,7 +848,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for
params
in
product
(
for
params
in
product
(
[(
1
,
1
),
(
2
,
2
),
(
3
,
3
)],
[(
1
,
1
),
(
2
,
2
),
(
3
,
3
)],
[(
1
,
1
),
(
2
,
2
),
(
3
,
3
)],
[(
1
,
1
),
(
2
,
2
),
(
3
,
3
)],
[
'max'
,
'average_inc_pad'
]
# modes without `average_exc_pad`
[
m
for
m
in
get_dnn_pool_modes
()
if
m
!=
'average_exc_pad'
]
):
):
pool_grad
=
dnn
.
GpuDnnPoolGrad
(
mode
=
params
[
2
])(
pool_grad
=
dnn
.
GpuDnnPoolGrad
(
mode
=
params
[
2
])(
img
,
img
,
...
@@ -886,7 +888,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
...
@@ -886,7 +888,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
for
params
in
product
(
for
params
in
product
(
[(
1
,
1
,
1
),
(
2
,
2
,
2
),
(
3
,
3
,
3
)],
[(
1
,
1
,
1
),
(
2
,
2
,
2
),
(
3
,
3
,
3
)],
[(
1
,
1
,
1
),
(
2
,
2
,
2
),
(
3
,
3
,
3
)],
[(
1
,
1
,
1
),
(
2
,
2
,
2
),
(
3
,
3
,
3
)],
[
'max'
,
'average_inc_pad'
]
# modes without `average_exc_pad`
[
m
for
m
in
get_dnn_pool_modes
()
if
m
!=
'average_exc_pad'
]
):
):
pool_grad
=
dnn
.
GpuDnnPoolGrad
(
mode
=
params
[
2
])(
pool_grad
=
dnn
.
GpuDnnPoolGrad
(
mode
=
params
[
2
])(
img
,
img
,
...
...
theano/tensor/signal/pool.py
浏览文件 @
46783773
...
@@ -434,6 +434,9 @@ class Pool(OpenMPOp):
...
@@ -434,6 +434,9 @@ class Pool(OpenMPOp):
super
(
Pool
,
self
)
.
__init__
(
openmp
=
openmp
)
super
(
Pool
,
self
)
.
__init__
(
openmp
=
openmp
)
self
.
ndim
=
ndim
self
.
ndim
=
ndim
self
.
ignore_border
=
ignore_border
self
.
ignore_border
=
ignore_border
if
mode
==
'max_deterministic'
:
# It seems max pool algo is already deterministic in CPU.
mode
=
'max'
if
mode
not
in
[
'max'
,
'average_inc_pad'
,
'average_exc_pad'
,
'sum'
]:
if
mode
not
in
[
'max'
,
'average_inc_pad'
,
'average_exc_pad'
,
'sum'
]:
raise
ValueError
(
raise
ValueError
(
"Pool mode parameter only support 'max', 'sum',"
"Pool mode parameter only support 'max', 'sum',"
...
@@ -1041,6 +1044,9 @@ class PoolGrad(OpenMPOp):
...
@@ -1041,6 +1044,9 @@ class PoolGrad(OpenMPOp):
def
__init__
(
self
,
ignore_border
,
mode
=
'max'
,
ndim
=
2
,
openmp
=
None
):
def
__init__
(
self
,
ignore_border
,
mode
=
'max'
,
ndim
=
2
,
openmp
=
None
):
self
.
ndim
=
ndim
self
.
ndim
=
ndim
self
.
ignore_border
=
ignore_border
self
.
ignore_border
=
ignore_border
if
mode
==
'max_deterministic'
:
# It seems max pool grad algo is already deterministic in CPU.
mode
=
'max'
if
mode
not
in
[
'max'
,
'sum'
,
'average_inc_pad'
,
'average_exc_pad'
]:
if
mode
not
in
[
'max'
,
'sum'
,
'average_inc_pad'
,
'average_exc_pad'
]:
raise
ValueError
(
raise
ValueError
(
"Pool mode parameter only support 'max', 'sum',"
"Pool mode parameter only support 'max', 'sum',"
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论