Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
9147b12e
提交
9147b12e
authored
2月 28, 2015
作者:
Frédéric Bastien
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #2542 from abergeron/cudnn_r2_alg
Enable algorithm selection in cudnn convolution
上级
f120de51
009733e0
隐藏空白字符变更
内嵌
并排
正在显示
7 个修改的文件
包含
129 行增加
和
34 行删除
+129
-34
config.txt
doc/library/config.txt
+19
-0
op.py
theano/gof/op.py
+22
-1
cuda_ndarray.cu
theano/sandbox/cuda/cuda_ndarray.cu
+19
-0
cuda_ndarray.cuh
theano/sandbox/cuda/cuda_ndarray.cuh
+3
-1
cudnn_helper.h
theano/sandbox/cuda/cudnn_helper.h
+14
-0
dnn.py
theano/sandbox/cuda/dnn.py
+35
-18
dnn_fwd.c
theano/sandbox/cuda/dnn_fwd.c
+17
-14
没有找到文件。
doc/library/config.txt
浏览文件 @
9147b12e
...
...
@@ -484,6 +484,25 @@ import theano and print the config variable, as in:
A directory with bin/, lib/, include/ folders containing cuda utilities.
.. attribute:: config.dnn.conv.workmem
String value: 'none', 'small', 'large'
Default: 'small'
The default value for the amount of working memory that is
tolerated in the convolution implementation in cudnn.
'none'
Don't allow any extra memory.
'small'
Allow extra memory that is much smaller than the input sizes.
'large'
Allow extra memory that is on the order of the input sizes.
.. attribute:: config.gcc.cxxflags
Default: ""
...
...
theano/gof/op.py
浏览文件 @
9147b12e
...
...
@@ -1146,10 +1146,27 @@ class COp(Op):
raise
ValueError
(
"No valid section marker was found in file "
"
%
s"
%
self
.
func_files
[
i
])
def
get_op_params
(
self
):
"""
Returns a list of (name, value) pairs that will be turned into
macros for use within the op code. This is intended to allow
an op's properties to influence the generated C code.
The names must be strings that are not a C keyword and the
values must be strings of literal C representations.
"""
return
[]
def
c_code_cache_version
(
self
):
return
hash
(
tuple
(
self
.
func_codes
))
c_init_code
=
simple_meth
(
'init_code'
)
def
c_init_code
(
self
):
if
'init_code'
in
self
.
code_sections
:
return
[
self
.
code_sections
[
'init_code'
]]
else
:
raise
utils
.
MethodNotDefined
(
'c_init_code'
,
type
(
self
),
type
(
self
)
.
__name__
)
c_init_code_apply
=
apply_meth
(
'init_code_apply'
)
c_support_code
=
simple_meth
(
'support_code'
)
c_support_code_apply
=
apply_meth
(
'support_code_apply'
)
...
...
@@ -1208,6 +1225,10 @@ class COp(Op):
"str##_
%
s"
%
name
))
undef_macros
.
append
(
undef_template
%
"APPLY_SPECIFIC"
)
for
n
,
v
in
self
.
get_op_params
():
define_macros
.
append
(
define_template
%
(
n
,
v
))
undef_macros
.
append
(
undef_template
%
(
n
,))
return
os
.
linesep
.
join
(
define_macros
),
os
.
linesep
.
join
(
undef_macros
)
def
_lquote_macro
(
self
,
txt
):
...
...
theano/sandbox/cuda/cuda_ndarray.cu
浏览文件 @
9147b12e
...
...
@@ -298,6 +298,25 @@ outstanding_mallocs(PyObject* self, PyObject * args)
return
PyInt_FromLong
(
_outstanding_mallocs
[
0
]);
}
static
void
*
work_mem
=
NULL
;
static
size_t
work_size
=
0
;
/*
* Returns a chunk of memory for temporary work inside of an op. You can only
* request a single chunk of memory at a time since it is reused.
*/
void
*
get_work_mem
(
size_t
sz
)
{
if
(
sz
<
work_size
)
return
work_mem
;
device_free
(
work_mem
);
work_mem
=
device_malloc
(
sz
);
work_size
=
sz
;
if
(
work_mem
==
NULL
)
work_size
=
0
;
return
work_mem
;
}
/////////////////////////
// Static helper methods
/////////////////////////
...
...
theano/sandbox/cuda/cuda_ndarray.cuh
浏览文件 @
9147b12e
...
...
@@ -88,7 +88,8 @@ typedef float real;
extern
DllExport
cublasHandle_t
handle
;
/**
* Allocation and freeing of device memory should go through these functions so that the lib can track memory usage.
* Allocation and freeing of device memory should go through these functions so
* that the lib can track memory usage.
*
* device_malloc will set the Python error message before returning None.
* device_free will return nonzero on failure (after setting the python error message)
...
...
@@ -98,6 +99,7 @@ extern DllExport cublasHandle_t handle;
DllExport
void
*
device_malloc
(
size_t
size
);
DllExport
void
*
device_malloc
(
size_t
size
,
int
verbose
);
DllExport
int
device_free
(
void
*
ptr
);
DllExport
void
*
get_work_mem
(
size_t
sz
);
template
<
typename
T
>
static
T
ceil_intdiv
(
T
a
,
T
b
)
...
...
theano/sandbox/cuda/cudnn_helper.h
浏览文件 @
9147b12e
...
...
@@ -73,6 +73,20 @@ cudnnGetConvolutionForwardAlgorithm(
return
CUDNN_STATUS_SUCCESS
;
}
static
inline
cudnnStatus_t
cudnnGetConvolutionForwardWorkspaceSize
(
cudnnHandle_t
handle
,
const
cudnnTensorDescriptor_t
srcDesc
,
const
cudnnFilterDescriptor_t
filterDesc
,
const
cudnnConvolutionDescriptor_t
convDesc
,
const
cudnnTensor4dDescriptor_t
destDesc
,
cudnnConvolutionFwdAlgo_t
algo
,
size_t
*
sizeInBytes
)
{
*
sizeInBytes
=
0
;
return
CUDNN_STATUS_SUCCESS
;
}
static
inline
cudnnStatus_t
cudnnConvolutionForward_v2
(
cudnnHandle_t
handle
,
...
...
theano/sandbox/cuda/dnn.py
浏览文件 @
9147b12e
import
os
import
theano
from
theano
import
Apply
,
gof
,
tensor
from
theano
import
Apply
,
gof
,
tensor
,
config
from
theano.scalar
import
as_scalar
from
theano.gradient
import
DisconnectedType
from
theano.gof
import
Optimizer
,
local_optimizer
,
COp
from
theano.gof.type
import
CDataType
,
Generic
from
theano.compat
import
PY3
from
theano.compile.ops
import
shape_i
from
theano.configparser
import
AddConfigVar
,
EnumStr
from
theano.tensor.nnet
import
SoftmaxGrad
from
theano.tensor.basic
import
ShapeError
from
theano.sandbox.cuda.type
import
CudaNdarrayType
...
...
@@ -133,20 +134,6 @@ class DnnBase(GpuOp, COp):
def
c_libraries
(
self
):
return
[
'cudnn'
]
def
c_init_code
(
self
):
if
PY3
:
error_out
=
"NULL"
else
:
error_out
=
""
return
[
"""{
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle:
%%
s",
cudnnGetErrorString(err));
return
%
s;
}
}"""
%
(
error_out
,)]
class
DnnVersion
(
GpuOp
):
def
c_compiler
(
self
):
...
...
@@ -342,6 +329,11 @@ class GpuDnnConvDesc(GpuOp):
return
(
2
,
version
())
AddConfigVar
(
'dnn.conv.workmem'
,
"Default value for the workmem attribute of cudnn convolutions."
,
EnumStr
(
'small'
,
'none'
,
'large'
),
in_c_key
=
False
)
class
GpuDnnConv
(
DnnBase
,
COp
):
"""
The forward convolution.
...
...
@@ -349,13 +341,36 @@ class GpuDnnConv(DnnBase, COp):
:param image:
:param kernel:
:param descr: the convolution descriptor
"""
__props__
=
()
__props__
=
(
'workmem'
,
)
def
__init__
(
self
):
def
__init__
(
self
,
workmem
=
None
):
"""
:param workmem: either 'none', 'small' or 'large'. Default is
the value of :attr:`config.dnn.conv.workmem`.
"""
COp
.
__init__
(
self
,
[
"dnn_base.c"
,
"dnn_conv_base.c"
,
"dnn_fwd.c"
],
"APPLY_SPECIFIC(conv_fwd)"
)
if
workmem
is
None
:
workmem
=
config
.
dnn
.
conv
.
workmem
self
.
workmem
=
workmem
assert
self
.
workmem
in
[
'none'
,
'small'
,
'large'
]
def
__setstate__
(
self
,
d
):
self
.
__dict__
.
update
(
d
)
if
not
hasattr
(
self
,
'workmem'
):
self
.
workmem
=
'small'
def
get_op_params
(
self
):
if
version
()
==
-
1
:
return
[(
'CONV_ALGO'
,
"0"
)]
if
self
.
workmem
==
'none'
:
alg
=
'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
elif
self
.
workmem
==
'small'
:
alg
=
'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
elif
self
.
workmem
==
'large'
:
alg
=
'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
return
[(
'CONV_ALGO'
,
alg
)]
def
make_node
(
self
,
img
,
kern
,
desc
):
img
=
as_cuda_ndarray_variable
(
img
)
...
...
@@ -575,6 +590,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
:warning: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
work with this Op.
:note: The working memory of the op is influenced by
:attr:`config.dnn.conv.workmem`.
"""
fgraph
=
getattr
(
img
,
'fgraph'
,
None
)
or
getattr
(
kerns
,
'fgraph'
,
None
)
if
(
border_mode
==
'valid'
and
subsample
==
(
1
,
1
)
and
...
...
theano/sandbox/cuda/dnn_fwd.c
浏览文件 @
9147b12e
...
...
@@ -33,24 +33,27 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
return
1
;
{
cudnnConvolutionFwdAlgo_t
algo
;
err
=
cudnnGetConvolutionForwardAlgorithm
(
_handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
desc
,
APPLY_SPECIFIC
(
output
),
CUDNN_CONVOLUTION_FWD_NO_WORKSPACE
,
// TODO: add op param
0
,
&
algo
);
size_t
worksize
;
void
*
workspace
;
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
_handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
desc
,
APPLY_SPECIFIC
(
output
),
CONV_ALGO
,
&
worksize
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"GpuDnnConv: Couldn't select convolution algorithm
: %s"
,
cudnnGetErrorString
(
err
));
"GpuDnnConv: error getting worksize
: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
workspace
=
get_work_mem
(
worksize
);
if
(
workspace
==
NULL
&&
worksize
!=
0
)
return
1
;
const
float
alpha
=
1
;
const
float
beta
=
0
;
...
...
@@ -60,8 +63,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
APPLY_SPECIFIC
(
input
),
CudaNdarray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
kerns
),
CudaNdarray_DEV_DATA
(
kerns
),
desc
,
algo
,
NULL
,
0
,
CONV_ALGO
,
workspace
,
worksize
,
(
void
*
)
&
beta
,
APPLY_SPECIFIC
(
output
),
CudaNdarray_DEV_DATA
(
*
output
));
}
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论