testgroup / pytensor · Commits · 17562537

Commit 17562537 authored Aug 28, 2012 by Ian Goodfellow
Parent: 0103f893

    moved the docstrings to the new methods and deleted the old methods

Showing 1 changed file with 129 additions and 305 deletions

theano/gradient.py   +129   −305
...
@@ -54,145 +54,6 @@ def format_as(use_list, use_tuple, outputs):
    else:
        return outputs


def grad_sources_inputs_aside(sources, graph_inputs, warn_type=True):
    """
    A gradient source is a pair (``v``, ``g_v``), in which ``v`` is
    a `Variable`, and ``g_v`` is a `Variable` that is a gradient wrt
    ``v``. More specifically, ``g_v`` is the gradient of an external
    scalar cost, ``cost`` (that is not explicitly used), wrt ``v``.

    This function traverses the graph backward from the ``r`` sources,
    calling ``op.grad(...)`` for all ops with some non-None gradient
    on an output, to compute gradients of ``cost`` wrt intermediate
    variables and ``graph_inputs``.

    The ``op.grad(...)`` functions are called like this:

    .. code-block:: python

        op.grad(op.inputs[:], [total_gradient(v) for v in op.outputs])

    This call to ``op.grad`` should return a list or tuple: one symbolic
    gradient per input. These gradients represent the gradients of
    the same implicit ``cost`` mentioned above, wrt ``op.inputs``. Note
    that this is **not** the same as the gradient of ``op.outputs`` wrt
    ``op.inputs``.

    If ``op`` has a single input, then ``op.grad`` should return a list
    or tuple of length 1.

    For each input wrt to which ``op`` is not differentiable, it should
    return ``None`` instead of a `Variable` instance.

    If a source ``r`` receives a gradient from another source ``r2``,
    then the effective gradient on ``r`` is the sum of both gradients.

    :type sources: list of pairs of Variable: (v, gradient-on-v) to
        initialize the total_gradient dictionary
    :param sources: gradients to back-propagate using chain rule
    :type graph_inputs: list of Variable
    :param graph_inputs: variables considered to be constant
        (do not backpropagate through them)
    :type warn_type: bool
    :param warn_type: True will trigger warnings via the logging module when
        the gradient on an expression has a different type than the original
        expression
    :rtype: dictionary whose keys and values are of type Variable
    :return: mapping from each Variable encountered in the backward
        traversal to the gradient with respect to that Variable.

    It is assumed that there is some objective J shared between all members of
    sources, so that for each v, gradient-on-v is the gradient of J with
    respect to v.
    """
    gmap = {}
    for (r, g_r) in sources:
        if not hasattr(r, 'type'):
            raise TypeError('sources must be Variables', r)
        if g_r is not None:
            if r in gmap:
                gmap[r] = gmap[r] + g_r
            else:
                gmap[r] = g_r

    graph_outputs = gof.utils.uniq([r for r, g in sources])

    if graph_inputs is None:
        graph_inputs = gof.graph.inputs(graph_outputs)

    for node in gof.graph.io_toposort(graph_inputs, graph_outputs).__reversed__():
        g_outputs = [gmap.get(o, None) for o in node.outputs]

        # if all output gradients are None, continue
        if all(map(lambda x: x is None, g_outputs)):
            continue

        # Disable all grad operations on complex. verify_grad doesn't
        # support them and we don't know whether we want to handle them.
        for var in node.inputs + node.outputs:
            if (hasattr(var.type, 'dtype') and
                    "complex" in var.type.dtype):
                raise Exception("We do not support grad/Rop/Lop/verify_grad"
                                " on complex.")

        output_arg = g_outputs
        input_arg = node.inputs

        # Each Op's grad function requires inputs and output_grads.
        # If the Op destroys any input, but the grad expression uses it,
        # then chances are the resulting graph will have a dependency
        # cycle. We avoid this cycle by passing (symbolic) copies of
        # each destroyed input.
        try:
            dinputs = [node.inputs[x[0]] for x in node.op.destroy_map.values()]
        except AttributeError:
            dinputs = []

        new_input_arg = []
        for input in input_arg:
            if input in dinputs and hasattr(input, 'copy'):
                new_input_arg.append(input.copy())
            else:
                new_input_arg.append(input)
        input_arg = new_input_arg

        # Note that this call is not in a try-except block.
        # The rationale:
        # If the op implements grad, then any exception should be passed to
        # the caller.
        # If the op doesn't implement grad, this entire function should fail.
        # Other possibilities:
        # * return a partial back-prop
        op_grad = node.op.grad(input_arg, output_arg)
        if not isinstance(op_grad, (list, tuple)):
            raise ValueError(_msg_retType, node.op)
        g_inputs = op_grad
        assert isinstance(g_inputs, (list, tuple))
        if len(g_inputs) != len(node.inputs):
            raise ValueError(_msg_badlen, node.op,
                             len(g_inputs), len(node.inputs))

        for ii, (r, g_r) in enumerate(zip(node.inputs, g_inputs)):
            if warn_type:
                if g_r and (getattr(r, 'type', 0) != getattr(g_r, 'type', 1)):
                    r_type = getattr(r, 'type', None)
                    g_r_type = getattr(g_r, 'type', None)
                    _logger.warning('%s.grad returned a different type (%s) '
                                    'for input %i of type (%s)',
                                    node.op, g_r_type, ii, r_type)
            if g_r is not None:
                assert r is not None
                if r in gmap:
                    gmap[r] = gmap[r] + g_r
                else:
                    gmap[r] = g_r
    return gmap


def grad_not_implemented(op, x_pos, x, comment=""):
    """
    Return an un-computable symbolic variable of type `x.type`.
...
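For context (not part of this diff): the ``op.grad`` calling convention described in the docstring above can be illustrated with a small custom Op. This is only a sketch under the circa-2012 Theano Op API; the Op itself (ScaleByConstant) and its constant ``k`` are hypothetical.

    # A minimal sketch of the op.grad convention (hypothetical Op):
    # grad(inputs, output_grads) must return one symbolic gradient per input,
    # or None for any input the op is not differentiable with respect to.
    import theano
    from theano import gof, tensor as TT

    class ScaleByConstant(gof.Op):
        """Computes k * x for a fixed Python float k (illustrative only)."""

        def __init__(self, k):
            self.k = k

        def make_node(self, x):
            x = TT.as_tensor_variable(x)
            return gof.Apply(self, [x], [x.type()])

        def perform(self, node, inputs, output_storage):
            (x,) = inputs
            output_storage[0][0] = self.k * x

        def grad(self, inputs, output_grads):
            (x,) = inputs
            (g_out,) = output_grads
            # chain rule: d(cost)/dx = d(cost)/d(output) * k
            # (a non-differentiable input would get None in this list)
            return [self.k * g_out]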
@@ -230,8 +91,6 @@ def grad_undefined(op, x_pos, x, comment = ""):
            " mathematically undefined.")()


########################
# R Operator
########################
...
@@ -446,6 +305,35 @@ def Lop(f, wrt, eval_points, consider_constant=None, warn_type=False,

def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False,
         disconnected_inputs='raise'):
    """
    :type cost: Scalar (0-dimensional) Variable.
    :type wrt: Variable or list of Variables.
    :type g_cost: Scalar Variable, or None.
    :param g_cost: an expression for the gradient through cost. The default is
        ``ones_like(cost)``.
    :param consider_constant: a list of expressions not to backpropagate
        through
    :param warn_type: a value of True will cause warnings to be logged for any
        Op that emits a gradient that does not match its input type.
    :type disconnected_inputs: string
    :param disconnected_inputs: Defines the behaviour if some of the variables
        in ``wrt`` are not part of the computational graph computing ``cost``
        (or if all links are non-differentiable). The possible values are:
        - 'ignore': considers that the gradient on these parameters is zero.
        - 'warn': consider the gradient zero, and print a warning.
        - 'raise': raise an exception.

    :rtype: Variable or list/tuple of Variables (depending upon `wrt`)
    :return: symbolic expression of gradient of `cost` with respect to `wrt`.
        If an element of `wrt` is not differentiable with respect
        to the output, then a zero variable is returned.
        It returns an object of same type as `wrt`: a list/tuple
        or Variable in all cases.
    """

    global tensor
    if tensor is None:
        from theano import tensor
...
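For context (not part of this diff): a minimal usage sketch of the ``grad`` function defined above. The variables ``x``, ``w``, ``z`` and the cost are illustrative, and it assumes the Theano API of this period.

    import numpy
    import theano
    import theano.tensor as TT

    x = TT.vector('x')
    w = TT.vector('w')
    cost = TT.sum((x * w) ** 2)       # a scalar (0-d) cost, as required

    # wrt a single Variable -> a single Variable is returned
    g_w = theano.gradient.grad(cost, w)

    # wrt a list -> a list is returned, in the same order
    g_x, g_w_again = theano.gradient.grad(cost, [x, w])

    # a variable that does not feed into cost: the behaviour is governed by
    # disconnected_inputs ('raise' by default; 'ignore' yields zeros)
    z = TT.vector('z')
    g_z = theano.gradient.grad(cost, z, disconnected_inputs='ignore')

    f = theano.function([x, w], g_w)
    f(numpy.ones(3), 2 * numpy.ones(3))   # -> array([ 4.,  4.,  4.])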
@@ -479,27 +367,7 @@ def grad(cost, wrt, g_cost = None, consider_constant = None, warn_type = False,
    if not using_list and not using_tuple:
        wrt = [wrt]

    # var_to_node_to_idx[var][node] = i means node has var as input at position i
    var_to_node_to_idx = {}

    # set of variables that have been added to their parents
    accounted_for = set([])

    # mark the variables in the relevant subgraph with
    # a dictionary called children
    # var._children[node] gives the index of var in _children.inputs
    def account_for(var):
        if var in accounted_for:
            return
        accounted_for.add(var)
        if var.owner is not None:
            node = var.owner
            for i, ipt in enumerate(node.inputs):
                if ipt not in var_to_node_to_idx:
                    var_to_node_to_idx[ipt] = {}
                var_to_node_to_idx[ipt][node] = i
                account_for(ipt)
    account_for(cost)

    var_to_node_to_idx = _populate_var_to_node_to_idx([cost])

    # build a dict mapping var to the gradient of cost with respect to var
    grad_dict = {}
...
@@ -545,6 +413,47 @@ def grad(cost, wrt, g_cost = None, consider_constant = None, warn_type = False,
    return rval


def _populate_var_to_node_to_idx(outputs):
    """
    Common code shared between grad and grad_sources_inputs

    outputs: a list of nodes we want to take gradients of

    returns:
        var_to_node_to_idx: a dictionary mapping a variable to
            a second dictionary.
            the second dictionary maps apply nodes acting on
            this variable to the variable's index in the apply
            node's input list
    """

    # var_to_node_to_idx[var][node] = i means node has var as input at position i
    var_to_node_to_idx = {}

    # set of variables that have been added to their parents
    accounted_for = set([])

    # mark the variables in the relevant subgraph with
    # a dictionary called children
    # var._children[node] gives the index of var in _children.inputs
    def account_for(var):
        if var in accounted_for:
            return
        accounted_for.add(var)
        if var.owner is not None:
            node = var.owner
            for i, ipt in enumerate(node.inputs):
                if ipt not in var_to_node_to_idx:
                    var_to_node_to_idx[ipt] = {}
                var_to_node_to_idx[ipt][node] = i
                account_for(ipt)

    for output in outputs:
        account_for(output)

    return var_to_node_to_idx


def _populate_grad_dict(var_to_node_to_idx, \
        grad_dict, wrt, warn_type, cost_name=None):
    """
...
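For context (not part of this diff): a sketch of the mapping that ``_populate_var_to_node_to_idx`` builds, shown on a tiny illustrative graph (the variable names are hypothetical).

    import theano.tensor as TT

    x = TT.scalar('x')
    y = TT.scalar('y')
    z = x + y          # one Apply node, z.owner, with inputs [x, y]
    cost = z * x       # another Apply node, cost.owner, with inputs [z, x]

    # Conceptually, _populate_var_to_node_to_idx([cost]) returns a dict where
    #   mapping[x] == {z.owner: 0, cost.owner: 1}  # x is input 0 of (+), input 1 of (*)
    #   mapping[y] == {z.owner: 1}                 # y is input 1 of (+)
    #   mapping[z] == {cost.owner: 0}              # z is input 0 of (*)
    # i.e. var_to_node_to_idx[var][apply_node] is var's position in
    # apply_node.inputs, for every variable in the subgraph below cost.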
@@ -663,6 +572,63 @@ def _populate_grad_dict(var_to_node_to_idx,\

def grad_sources_inputs(sources, graph_inputs, warn_type=True):
    """
    Used to compute the gradient of a cost with respect to all the
    variables between graph_inputs and the cost, in the special
    case where you don't know the cost itself, only its gradient
    on a set of intermediate values.

    A gradient source is a pair (``v``, ``g_v``), in which ``v`` is
    a `Variable`, and ``g_v`` is a `Variable` that is a gradient wrt
    ``v``. More specifically, ``g_v`` is the gradient of an external
    scalar cost, ``cost`` (that is not explicitly used), wrt ``v``.

    This function traverses the graph backward from the ``r`` sources,
    calling ``op.grad(...)`` for all ops with some non-None gradient
    on an output, to compute gradients of ``cost`` wrt intermediate
    variables and ``graph_inputs``.

    The ``op.grad(...)`` functions are called like this:

    .. code-block:: python

        op.grad(op.inputs[:], [total_gradient(v) for v in op.outputs])

    This call to ``op.grad`` should return a list or tuple: one symbolic
    gradient per input. These gradients represent the gradients of
    the same implicit ``cost`` mentioned above, wrt ``op.inputs``. Note
    that this is **not** the same as the gradient of ``op.outputs`` wrt
    ``op.inputs``.

    If ``op`` has a single input, then ``op.grad`` should return a list
    or tuple of length 1.

    For each input wrt to which ``op`` is not differentiable, it should
    return ``None`` instead of a `Variable` instance.

    If a source ``r`` receives a gradient from another source ``r2``,
    then the effective gradient on ``r`` is the sum of both gradients.

    :type sources: list of pairs of Variable: (v, gradient-on-v) to
        initialize the total_gradient dictionary
    :param sources: gradients to back-propagate using chain rule
    :type graph_inputs: list of Variable
    :param graph_inputs: variables considered to be constant
        (do not backpropagate through them)
    :type warn_type: bool
    :param warn_type: True will trigger warnings via the logging module when
        the gradient on an expression has a different type than the original
        expression
    :rtype: dictionary whose keys and values are of type Variable
    :return: mapping from each Variable encountered in the backward
        traversal to the gradient with respect to that Variable.

    It is assumed that there is some objective J shared between all members of
    sources, so that for each v, gradient-on-v is the gradient of J with
    respect to v.
    """
    outputs, output_grads = zip(*sources)
...
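For context (not part of this diff): a sketch of the "gradient sources" mechanism that ``grad_sources_inputs`` implements. You supply the gradient of some external, implicit cost on an intermediate value and back-propagate it toward the graph inputs; the names below are illustrative.

    import theano
    import theano.tensor as TT

    x = TT.vector('x')
    h = TT.tanh(x)              # an intermediate value
    g_h = TT.vector('g_h')      # a known gradient of some external cost wrt h

    # back-propagate the externally supplied gradient g_h from h down to x
    gmap = theano.gradient.grad_sources_inputs([(h, g_h)], [x])

    g_x = gmap[x]               # symbolic gradient of the implicit cost wrt x
    # for tanh, g_x is equivalent to g_h * (1 - TT.tanh(x) ** 2)
    f = theano.function([x, g_h], g_x)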
@@ -677,27 +643,7 @@ def grad_sources_inputs(sources, graph_inputs, warn_type = True):
    wrt = graph_inputs

    # var_to_node_to_idx[var][node] = i means node has var as input at position i
    var_to_node_to_idx = {}

    # set of variables that have been added to their parents
    accounted_for = set([])

    # notify the parents of the variables in the relevant subgraph
    # that they have children
    def account_for(var):
        if var in accounted_for:
            return
        accounted_for.add(var)
        if var.owner is not None:
            node = var.owner
            for i, ipt in enumerate(node.inputs):
                if ipt not in var_to_node_to_idx:
                    var_to_node_to_idx[ipt] = {}
                var_to_node_to_idx[ipt][node] = i
                account_for(ipt)

    for output in outputs:
        account_for(output)

    var_to_node_to_idx = _populate_var_to_node_to_idx(outputs)

    # build a dict mapping var to the gradient of cost with respect to var
    grad_dict = {}
...
@@ -711,10 +657,6 @@ def grad_sources_inputs(sources, graph_inputs, warn_type = True):
    # according to the flag, possibly raise an error if wrt is disconnected
    for elem in wrt:
        if elem not in var_to_node_to_idx and elem not in outputs:
            message = ("grad method was asked to compute the gradient "
                       "with respect to a variable that is not part of "
                       "the computational graph of the cost, or is used "
                       "only by a non-differentiable operator: %s" % elem)
            grad_dict[elem] = elem.zeros_like()
...
@@ -723,123 +665,6 @@ def grad_sources_inputs(sources, graph_inputs, warn_type = True):
    return grad_dict


def grad_wrong(cost, wrt, g_cost=None, consider_constant=None,
               warn_type=False, disconnected_inputs='raise'):
    """
    :type cost: Scalar (0-dimensional) Variable.
    :type wrt: Variable or list of Variables.
    :type g_cost: Scalar Variable, or None.
    :param g_cost: an expression for the gradient through cost. The default is
        ``ones_like(cost)``.
    :param consider_constant: a list of expressions not to backpropagate
        through
    :param warn_type: a value of True will cause warnings to be logged for any
        Op that emits a gradient that does not match its input type.
    :type disconnected_inputs: string
    :param disconnected_inputs: Defines the behaviour if some of the variables
        in ``wrt`` are not part of the computational graph computing ``cost``
        (or if all links are non-differentiable). The possible values are:
        - 'ignore': considers that the gradient on these parameters is zero.
        - 'warn': consider the gradient zero, and print a warning.
        - 'raise': raise an exception.

    :rtype: Variable or list/tuple of Variables (depending upon `wrt`)
    :return: symbolic expression of gradient of `cost` with respect to `wrt`.
        If an element of `wrt` is not differentiable with respect
        to the output, then a zero variable is returned.
        It returns an object of same type as `wrt`: a list/tuple
        or Variable in all cases.

    This function is a wrapper around the more general function
    ``theano.gradient.grad_sources_inputs``.
    """
    if consider_constant is None:
        consider_constant = []
    else:
        # error checking on consider_constant: verify that it is a collection
        # of theano variables
        # this is important, if someone accidentally passes a nested data
        # structure with theano variables at the leaves, only the root will
        # be properly considered constant
        if not hasattr(consider_constant, '__iter__'):
            raise TypeError('consider_constant must be an iterable collection,'
                            ' got ' + str(type(consider_constant)))
        for elem in consider_constant:
            if not isinstance(elem, gof.Variable):
                raise TypeError('Elements of consider_constant must be '
                                'variables, but got ' + str(type(elem)))

    if not isinstance(cost, Variable):
        raise TypeError(('In grad(), cost argument should be '
                         'a Variable.'), cost)

    if cost.type.ndim:
        raise TypeError(
            'In theano.gradient.grad, "cost" argument should be a scalar,'
            ' but ndim is %i (should be 0). If you want to compute the'
            ' gradient of the sum of cost, you should use cost.sum().'
            % cost.type.ndim)

    if g_cost is None:
        from theano import tensor
        g_cost = tensor.ones_like(cost)

    inputs = gof.graph.inputs([cost])

    gmap = grad_sources_inputs_aside(
        [(cost, g_cost)],
        list(inputs) + list(consider_constant),
        warn_type=warn_type)

    # Note: if p is not in gmap there can be several reasons, among which
    # is the fact that p might not be part of the computational graph. A
    # simple example is that for a+b, e.g. a[0] is not part of the graph,
    # so Theano does not know how to compute TT.grad(TT.sum(a+b), a[0]).
    # Such subtle cases can be fixed by a more careful implementation of the
    # gradient, but for now Theano needs to throw an exception, and make the
    # user aware that it does not know how to compute that gradient.
    using_list = isinstance(wrt, list)
    using_tuple = isinstance(wrt, tuple)
    if not isinstance(wrt, (list, tuple)):
        wrt = [wrt]

    ret = []
    for p in wrt:
        if p in gmap:
            ret.append(gmap[p])
        else:
            message = ("grad method was asked to compute the gradient "
                       "with respect to a variable that is not part of "
                       "the computational graph of the cost, or is used "
                       "only by a non-differentiable operator: %s" % p)
            if disconnected_inputs == 'ignore':
                pass
            elif disconnected_inputs == 'warn':
                warnings.warn(message, stacklevel=1)
            elif disconnected_inputs == 'raise':
                raise ValueError(message)
            else:
                raise ValueError("Invalid value for keyword "
                                 "'disconnected_inputs', valid values are "
                                 "'ignore', 'warn' and 'raise'.")
            ret.append(p.zeros_like())

        if cost.name is not None and p.name is not None \
                and ret[-1].name is None:
            ret[-1].name = '(d%s/d%s)' % (cost.name, p.name)

    # new_vars is meant to be a list of all variables created
    # by this call to grad(), which will be visible to the caller
    # after we return.
    new_vars = gof.graph.ancestors(ret,
            blockers=gof.graph.ancestors([cost]) + list(wrt))

    return format_as(using_list, using_tuple, ret)


class numeric_grad(object):
    """
    Compute the numeric derivative of a scalar-valued function at a particular
...
@@ -1038,7 +863,6 @@ class numeric_grad(object):
            max_pos = pos[max_arg]
        return (max_arg, pos[max_arg], abs_errs[max_arg], rel_errs[max_arg])


def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, out_type=None,
                abs_tol=None, rel_tol=None, mode=None,
                cast_to_output_type=False):
...
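For context (not part of this diff): a minimal usage sketch of ``verify_grad``, which checks a symbolic gradient against ``numeric_grad``'s finite differences. The function and point below are illustrative, and it assumes the Theano API of this period (``fun`` maps symbolic inputs to a symbolic output; ``pt`` is a list of numpy arrays at which to check the gradient).

    import numpy
    import theano.tensor as TT
    from theano.gradient import verify_grad

    rng = numpy.random.RandomState(23098)

    # check d/dx of sum(x ** 2) at a random 4-vector, n_tests times
    verify_grad(lambda x: TT.sum(x ** 2), [rng.rand(4)], n_tests=2, rng=rng)
    # raises an exception if the symbolic and numeric gradients disagree
    # beyond abs_tol / rel_tol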