Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
40bbb7da
提交
40bbb7da
authored
11月 16, 2012
作者:
David Warde-Farley
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1068 from goodfeli/fix_consider_constant
Fixes several issues with gradients and some other bugs
上级
87cd138e
83781003
隐藏空白字符变更
内嵌
并排
正在显示
8 个修改的文件
包含
402 行增加
和
299 行删除
+402
-299
op.txt
doc/extending/op.txt
+2
-0
builders.py
theano/compile/builders.py
+6
-3
gradient.py
theano/gradient.py
+189
-238
scan_op.py
theano/scan_module/scan_op.py
+20
-11
basic.py
theano/tensor/basic.py
+54
-7
conv.py
theano/tensor/nnet/conv.py
+2
-3
test_gradient.py
theano/tests/test_gradient.py
+128
-30
test_rop.py
theano/tests/test_rop.py
+1
-7
没有找到文件。
doc/extending/op.txt
浏览文件 @
40bbb7da
...
...
@@ -249,6 +249,8 @@ following methods:
1) They must be Variable instances.
2) When they are types that have dtypes, they must never have an integer dtype.
The output gradients passed *to* Op.grad will also obey these constraints.
Integers are a tricky subject. Integers are the main reason for having DisconnectedType,
NullType or zero gradient. When you have an integer as an argument to your grad method,
recall the definition of a derivative to help you decide what value to return:
...
...
theano/compile/builders.py
浏览文件 @
40bbb7da
...
...
@@ -55,9 +55,12 @@ class OpFromGraph(gof.Op):
if
grad_depth
>
0
:
output_grads
=
[
t
()
for
t
in
self
.
output_types
]
gd
=
G
.
grad_sources_inputs
(
zip
(
self
.
outputs
,
output_grads
),
self
.
inputs
)
gs
=
map
(
gd
.
get
,
self
.
inputs
)
# OpFromGraph doesn't implement a connection_pattern, so for now we regard
# all inputs and outputs as connected. This will compute the right numerical
# value for the gradients but could fail to raise the disconnected inputs error
# in some cases.
gs
=
G
.
grad
(
cost
=
None
,
known_grads
=
dict
(
zip
(
self
.
outputs
,
output_grads
)),
wrt
=
self
.
inputs
,
disconnected_inputs
=
'ignore'
)
self
.
grad_ops
=
[]
for
g
in
gs
:
if
g
is
None
:
...
...
theano/gradient.py
浏览文件 @
40bbb7da
...
...
@@ -13,9 +13,11 @@ import warnings
_logger
=
logging
.
getLogger
(
'theano.gradient'
)
import
numpy
# for numeric_grad
np
=
numpy
import
theano
from
itertools
import
izip
from
theano
import
gof
from
theano.gof
import
Variable
from
theano.gof.python25
import
all
...
...
@@ -317,9 +319,6 @@ def Lop(f, wrt, eval_points, consider_constant=None,
coordinates of the tensor element in the last
If `f` is a list/tuple, then return a list/tuple with the results.
"""
if
consider_constant
is
None
:
consider_constant
=
[]
if
type
(
eval_points
)
not
in
(
list
,
tuple
):
eval_points
=
[
eval_points
]
...
...
@@ -333,50 +332,15 @@ def Lop(f, wrt, eval_points, consider_constant=None,
f
=
list
(
f
)
grads
=
list
(
eval_points
)
for
elem
in
consider_constant
:
assert
elem
not
in
f
f
.
append
(
elem
)
grads
.
append
(
elem
.
zeros_like
())
if
not
isinstance
(
wrt
,
(
list
,
tuple
)):
wrt
=
[
wrt
]
arg1
=
zip
(
f
,
eval_points
)
arg2
=
list
(
wrt
)
gmap
=
grad_sources_inputs
(
arg1
,
arg2
)
# Note : If p is not in gmap there can be several reasons, among which
# is the fact that p might not be part of the computational graph. A
# simple example is that for a+b for e.g. a[0] is not part of the graph,
# so Theano does not know how to compute TT.grad(TT.sum(a+b), a[0])
# such subtle cases can be fixed by a more careful implementation of the
# gradient, but for now Theano needs to throw an exception, and make the
# user aware that it does not know how to compute that gradient
ret
=
[]
for
p
in
wrt
:
if
p
in
gmap
:
ret
.
append
(
gmap
[
p
])
else
:
message
=
(
"Lop method was asked to compute the gradient "
"with respect to a variable that is not part of "
"the computational graph of the cost, or is used "
"only by a non-differentiable operator:
%
s"
%
p
)
if
disconnected_inputs
==
'ignore'
:
pass
elif
disconnected_inputs
==
'warn'
:
warnings
.
warn
(
message
,
stacklevel
=
1
)
elif
disconnected_inputs
==
'raise'
:
raise
ValueError
(
message
)
else
:
raise
ValueError
(
"Invalid value for keyword "
"'disconnected_inputs', valid values are "
"'ignore', 'warn' and 'raise'."
)
ret
.
append
(
p
.
zeros_like
())
assert
len
(
f
)
==
len
(
grads
)
known
=
dict
(
izip
(
f
,
grads
))
ret
=
grad
(
cost
=
None
,
known_grads
=
known
,
consider_constant
=
consider_constant
,
wrt
=
wrt
,
disconnected_inputs
=
disconnected_inputs
)
return
format_as
(
using_list
,
using_tuple
,
ret
)
...
...
@@ -386,9 +350,11 @@ def Lop(f, wrt, eval_points, consider_constant=None,
#########################
def
grad
(
cost
,
wrt
,
g_cost
=
None
,
consider_constant
=
None
,
disconnected_inputs
=
'raise'
,
add_names
=
True
):
disconnected_inputs
=
'raise'
,
add_names
=
True
,
known_grads
=
None
,
return_disconnected
=
'zero'
):
"""
:type cost: Scalar (0-dimensional) Variable.
May optionally be None if known_grads is provided.
:type wrt: Variable or list of Variables.
:type g_cost: Scalar Variable, or None.
:param g_cost: an expression for the gradient through cost. The default is
...
...
@@ -409,6 +375,20 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
(d<cost.name>/d<wrt.name>) provided that both cost and wrt have
names
:type known_grads: dict
:param known_grads: If not None, a dictionary mapping variables to their
gradients. This is useful in the case where you know the
gradient on some variables but do not know the original
cost.
:type return_disconnected: string
:param return_disconnected:
'zero' : If wrt[i] is disconnected, return value i will be
wrt[i].zeros_like()
'None' : If wrt[i] is disconnected, return value i will be
None
'Disconnected' : returns variables of type DisconnectedType
:rtype: Variable or list/tuple of Variables (depending upon `wrt`)
:return: symbolic expression of gradient of `cost` with respect to `wrt`.
...
...
@@ -422,29 +402,17 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
if
tensor
is
None
:
from
theano
import
tensor
if
isinstance
(
cost
.
type
,
NullType
):
if
cost
is
None
:
assert
known_grads
is
not
None
if
cost
is
not
None
and
isinstance
(
cost
.
type
,
NullType
):
raise
ValueError
(
"Can't differentiate a NaN cost."
"cost is NaN because "
+
\
cost
.
type
.
why_null
)
if
cost
.
ndim
!=
0
:
if
cost
is
not
None
and
cost
.
ndim
!=
0
:
raise
TypeError
(
"cost must be a scalar."
)
if
consider_constant
is
None
:
consider_constant
=
[]
else
:
# error checking on consider_constant: verify that it is a collection
# of theano variables
# this is important, if someone accidentally passes a nested data
# structure with theano variables at the leaves, only the root will
# be properly considered constant
if
not
hasattr
(
consider_constant
,
'__iter__'
):
raise
TypeError
(
'consider_constant must be an iterable collection,'
' got '
+
str
(
type
(
consider_constant
)))
for
elem
in
consider_constant
:
if
not
isinstance
(
elem
,
gof
.
Variable
):
raise
TypeError
(
'Elements of consider_constant must be '
'variables, but got '
+
str
(
type
(
elem
)))
if
isinstance
(
wrt
,
set
):
raise
TypeError
(
"wrt must not be a set. sets have no defined "
...
...
@@ -461,7 +429,14 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
raise
TypeError
(
"Expected Variable, got "
+
str
(
elem
)
+
" of type "
+
str
(
type
(
elem
)))
var_to_node_to_idx
=
_populate_var_to_node_to_idx
([
cost
],
wrt
)
outputs
=
[]
if
cost
is
not
None
:
outputs
.
append
(
cost
)
if
known_grads
is
not
None
:
outputs
.
extend
(
known_grads
.
keys
())
var_to_node_to_idx
=
_populate_var_to_node_to_idx
(
outputs
,
wrt
,
consider_constant
)
# build a dict mapping var to the gradient of cost with respect to var
grad_dict
=
{}
...
...
@@ -469,49 +444,57 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
# The gradient of the cost should default to 1 if the cost is of a
# continuous dtype (float, for the moment, as complex are unsupported),
# and should always be 0 if the cost is of discrete (integer) dtype.
if
getattr
(
cost
.
type
,
'dtype'
,
None
)
not
in
tensor
.
float_dtypes
:
if
cost
is
not
None
:
if
g_cost
is
None
:
g_cost
=
_float_ones_like
(
cost
)
# g_cost may be Disconnected or NullType. A creative use of the function,
# sure, but nonetheless one we can and should support. So before we try
# to cast it make sure it even has a dtype
if
hasattr
(
g_cost
.
type
,
'dtype'
)
and
cost
.
type
.
dtype
not
in
tensor
.
discrete_dtypes
:
# Here we enforce the constraint that floating point variables have
# the same dtype as their gradient.
g_cost
=
g_cost
.
astype
(
cost
.
type
.
dtype
)
# DO NOT enforce g_cost to be 0 if cost is an integer.
# This is to be enforced by the Op.grad method for the Op that outputs cost.
assert
g_cost
not
in
tensor
.
discrete_dtypes
grad_dict
[
cost
]
=
g_cost
else
:
if
g_cost
is
not
None
:
try
:
cval
=
theano
.
get_constant_value
(
g_cost
)
if
cval
==
0
:
g_cost_is_zero
=
True
else
:
g_cost_is_zero
=
False
except
TypeError
:
g_cost_is_zero
=
False
if
not
g_cost_is_zero
:
raise
ValueError
(
"The gradient of a cost of non-continuous "
"dtype (here,
%
s), if it is defined, should be 0. "
"However, a value of
%
s was provided in the 'g_cost' "
"argument of theano.grad(). To remove this error, "
"you can simply omit the 'g_cost' argument, or "
"give it the default value of None."
%
(
getattr
(
g_cost
.
type
,
'dtype'
,
'no dtype defined'
),
g_cost
))
g_cost
=
tensor
.
zeros_like
(
cost
)
elif
g_cost
is
None
:
# cost.type.dtype is in tensor.float_dtypes at that point
g_cost
=
tensor
.
ones_like
(
cost
)
raise
ValueError
(
"No cost node was specified, but a gradient"
" on it was."
)
else
:
# Cast the provided gradient so that it has the same dtype
# as the cost.
g_cost
=
g_cost
.
astype
(
cost
.
type
.
dtype
)
if
known_grads
is
not
None
:
for
var
in
known_grads
:
g_var
=
known_grads
[
var
]
if
not
hasattr
(
g_var
,
'type'
):
raise
TypeError
(
'output grads must be theano variables.'
'Ambiguous whether
%
s should be made into tensor'
' or sparse theano variable'
%
str
(
type
(
g_var
)))
if
g_var
.
type
not
in
[
NullType
,
DisconnectedType
]
and
'float'
\
not
in
str
(
g_var
.
type
.
dtype
):
raise
TypeError
(
"Gradients must always be NullType, "
"DisconnectedType, or continuous, but grad was "
"given a known_grad of type "
+
str
(
g_var
.
type
))
# DO NOT check that these gradients are equal to 0 if var is int
# The gradient is allowed to be non-zero on var in that case
# Ops outputing var should not backpropagate its gradient further
# but that is enforced elsewhere (grep for only_connected_to_int)
grad_dict
[
var
]
=
g_var
grad_dict
[
cost
]
=
g_cost
# the gradient of the constants is 0
for
const
in
consider_constant
:
grad_dict
[
const
]
=
DisconnectedType
()()
# variables that do not influence the cost have zero gradient.
# if wrt is such a variable, populate the grad_dict with this info
# so that wrt not being in var_to_node_to_idx won't cause an error below
# according to the flag, possibly raise an error if wrt is disconnected
for
elem
in
wrt
:
if
elem
not
in
var_to_node_to_idx
and
elem
is
not
cost
:
if
elem
not
in
var_to_node_to_idx
and
elem
is
not
cost
\
and
elem
not
in
grad_dict
:
message
=
(
"grad method was asked to compute the gradient "
"with respect to a variable that is not part of "
"the computational graph of the cost, or is used "
...
...
@@ -529,15 +512,15 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
grad_dict
[
elem
]
=
DisconnectedType
()()
cost_name
=
None
if
add_names
:
if
add_names
and
cost
is
not
None
:
cost_name
=
cost
.
name
# Make sure we didn't initialize the grad_dict with any ints
# for non-int outputs
# The gradient may NEVER be an int, even if the variable is an int.
# Read the Op contract and talk to Ian Goodfellow before changing this!
for
var
in
grad_dict
:
g
=
grad_dict
[
var
]
if
(
hasattr
(
g
.
type
,
'dtype'
)
and
getattr
(
var
.
type
,
'dtype'
,
''
)
in
tensor
.
float_dtypes
):
if
hasattr
(
g
.
type
,
'dtype'
):
assert
g
.
type
.
dtype
in
tensor
.
float_dtypes
rval
=
_populate_grad_dict
(
var_to_node_to_idx
,
...
...
@@ -545,7 +528,12 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
for
i
in
xrange
(
len
(
rval
)):
if
isinstance
(
rval
[
i
]
.
type
,
DisconnectedType
):
rval
[
i
]
=
_float_zeros_like
(
wrt
[
i
])
if
return_disconnected
==
'zero'
:
rval
[
i
]
=
_float_zeros_like
(
wrt
[
i
])
elif
return_disconnected
==
'None'
:
rval
[
i
]
=
None
else
:
assert
return_disconnected
==
'Disconnected'
if
using_tuple
:
rval
=
tuple
(
rval
)
...
...
@@ -592,15 +580,18 @@ def _node_to_pattern(node):
return
connection_pattern
def
_populate_var_to_node_to_idx
(
outputs
,
wrt
):
def
_populate_var_to_node_to_idx
(
outputs
,
wrt
,
consider_constant
):
"""
Common code shared between grad and grad_sources_inputs
Helper function for grad function.
outputs: a list of variables we want to take gradients of
wrt: a list of variables we want to take the gradient with
respect to.
consider_constant: a list of variables not to backpropagate
through.
returns:
var_to_app_to_idx:
...
...
@@ -622,8 +613,30 @@ def _populate_var_to_node_to_idx(outputs, wrt):
This set is exactly the set of variables that connect
the variables in wrt to the cost being differentiated.
(A variable in consider_constant is not a function of
anything)
"""
# Validate and format consider_constant
if
consider_constant
is
None
:
consider_constant
=
[]
else
:
# error checking on consider_constant: verify that it is a collection
# of theano variables
# this is important, if someone accidentally passes a nested data
# structure with theano variables at the leaves, only the root will
# be properly considered constant
try
:
iter
(
consider_constant
)
except
TypeError
:
raise
TypeError
(
'consider_constant must be an iterable collection,'
' got '
+
str
(
type
(
consider_constant
)))
for
elem
in
consider_constant
:
if
not
isinstance
(
elem
,
gof
.
Variable
):
raise
TypeError
(
'Elements of consider_constant must be '
'variables, but got '
+
str
(
type
(
elem
)))
# var_to_app_to_idx[var][node] = [i,j] means node has
# var as input at positions i and j
var_to_app_to_idx
=
{}
...
...
@@ -638,9 +651,17 @@ def _populate_var_to_node_to_idx(outputs, wrt):
accounted_for
=
set
([])
def
account_for
(
var
):
# Don't visit the same variable twice
if
var
in
accounted_for
:
return
accounted_for
.
add
(
var
)
# Constants are not a function of anything
if
var
in
consider_constant
:
return
# Recursively add the variables that this variable is
# a function of.
if
var
.
owner
is
not
None
:
app
=
var
.
owner
...
...
@@ -699,11 +720,16 @@ def _populate_var_to_node_to_idx(outputs, wrt):
return
var_to_app_to_idx
class
NullTypeGradError
(
TypeError
):
"""
Raised when grad encounters a NullType.
"""
pass
def
_populate_grad_dict
(
var_to_node_to_idx
,
grad_dict
,
wrt
,
cost_name
=
None
):
"""
Common code shared between grad_sources_inputs and grad
Helper function for grad function.
var_to_node_to_idx: a dictionary mapping a variable to
a second dictionary.
...
...
@@ -712,7 +738,7 @@ def _populate_grad_dict(var_to_node_to_idx,
node's input list
grad_dict: a dictionary mapping variables to their gradients
should be populated by grad
or grad_sources_inputs
should be populated by grad
function.
grad should set gradients to DisconnectedType()() for
variables to be considered constant, set the
...
...
@@ -779,38 +805,46 @@ def _populate_grad_dict(var_to_node_to_idx,
inputs
=
[
try_to_copy_if_needed
(
ipt
)
for
ipt
in
inputs
]
# Build a list of output gradients with the same dtype as
# the corresponding output variable.
# If an output is of a float dtype, we want to cast the
# output gradient into the same dtype, to avoid having a
# gradient graph with double precision (taking more memory,
# and more computation).
# If an output is of an integer dtype, then we ensure the
# output gradient is zero, and that zero can be represented
# in the same int dtype.
# If an output gradient is a NullType or DisconnectedType,
# then it will not have a dtype, and it will not be changed.
# If an output is of an integer dtype, then we just leave it
# alone.
# DO NOT force integer variables to have zero grad. This causes
# bugs where we fail to detect disconnected or undefined gradients.
# DO NOT force integer variables to have integer dtype. This is
# a violation of the op contract.
new_output_grads
=
[]
for
o
,
og
in
zip
(
node
.
outputs
,
output_grads
):
o_dt
=
getattr
(
o
.
type
,
'dtype'
,
None
)
og_dt
=
getattr
(
og
.
type
,
'dtype'
,
None
)
if
og_dt
and
o_dt
in
theano
.
tensor
.
discrete_dtypes
:
new_output_grads
.
append
(
o
.
zeros_like
())
elif
o_dt
and
og_dt
and
o_dt
!=
og_dt
:
if
o_dt
not
in
theano
.
tensor
.
discrete_dtypes
and
og_dt
and
o_dt
!=
og_dt
:
new_output_grads
.
append
(
og
.
astype
(
o_dt
))
else
:
new_output_grads
.
append
(
og
)
# Make sure that, if new_output_grads[i] has a dtype:
# - it is the same dtype as outputs[i]
# - if the dtype is an int, then new_output_grads[i] is 0.
# Make sure that, if new_output_grads[i] has a floating point dtype,
# it is the same dtype as outputs[i]
for
o
,
ng
in
zip
(
node
.
outputs
,
new_output_grads
):
o_dt
=
getattr
(
o
.
type
,
'dtype'
,
None
)
ng_dt
=
getattr
(
ng
.
type
,
'dtype'
,
None
)
if
ng_dt
:
if
ng_dt
is
not
None
and
o_dt
not
in
theano
.
tensor
.
discrete_dtypes
:
assert
ng_dt
==
o_dt
if
ng_dt
in
theano
.
tensor
.
discrete_dtypes
:
assert
theano
.
get_constant_value
(
ng
)
==
0
# Someone who had obviously not read the Op contract tried
# to modify this part of the function.
# If you ever think it is a good idea to make an integer
# valued gradient, please
# 1) Read the Op contract again
# 2) Talk to Ian Goodfellow
# (Both of these sources will tell you not to do it)
for
ng
in
new_output_grads
:
assert
getattr
(
ng
.
type
,
'dtype'
,
None
)
not
in
theano
.
tensor
.
discrete_dtypes
input_grads
=
node
.
op
.
grad
(
inputs
,
new_output_grads
)
...
...
@@ -863,6 +897,7 @@ def _populate_grad_dict(var_to_node_to_idx,
'the grad_undefined or grad_unimplemented helper '
'functions.'
)
%
node
.
op
)
if
not
isinstance
(
term
.
type
,
(
NullType
,
DisconnectedType
)):
if
term
.
type
.
dtype
not
in
theano
.
tensor
.
float_dtypes
:
...
...
@@ -875,14 +910,9 @@ def _populate_grad_dict(var_to_node_to_idx,
# it's not undefined or disconnected
# The only other valid thing it can be is 0
no_constant_value
=
True
try
:
constant_value
=
theano
.
get_constant_value
(
term
)
no_constant_value
=
False
except
TypeError
:
pass
if
no_constant_value
:
is_zero
=
_is_zero
(
term
)
assert
is_zero
in
[
'yes'
,
'no'
,
'maybe'
]
if
is_zero
==
'maybe'
:
msg
=
"
%
s.grad returned
%
s of type
%
s for input"
msg
+=
"
%
d. This input's only connections to "
msg
+=
"the cost through this op are via "
...
...
@@ -896,8 +926,7 @@ def _populate_grad_dict(var_to_node_to_idx,
msg
=
msg
%
(
str
(
node
.
op
),
str
(
term
),
str
(
type
(
term
)),
i
)
raise
ValueError
(
msg
)
if
constant_value
!=
0
:
if
is_zero
==
'no'
:
msg
=
"
%
s.grad returned
%
s of type
%
s for input"
msg
+=
"
%
d. Since this input is only connected "
msg
+=
"to integer-valued outputs, it should "
...
...
@@ -905,7 +934,7 @@ def _populate_grad_dict(var_to_node_to_idx,
msg
+=
"
%
s."
msg
%
(
str
(
node
.
op
),
str
(
term
),
str
(
type
(
term
)),
i
,
str
(
constant_value
))
i
,
str
(
theano
.
get_constant_value
(
term
)
))
raise
ValueError
(
msg
)
...
...
@@ -961,7 +990,7 @@ def _populate_grad_dict(var_to_node_to_idx,
type
(
term
)))
if
isinstance
(
term
.
type
,
NullType
):
raise
Type
Error
(
"tensor.grad "
raise
NullTypeGrad
Error
(
"tensor.grad "
"encountered a NaN. "
+
\
term
.
type
.
why_null
)
...
...
@@ -997,113 +1026,6 @@ def _populate_grad_dict(var_to_node_to_idx,
return
rval
def
grad_sources_inputs
(
sources
,
graph_inputs
):
"""
Used to compute the gradient of a cost with respect to all the
variables between graph_input and cost, but in the special
case where you don't know the cost, you only know its gradient
on a set of intermediate values.
A gradient source is a pair (``v``, ``g_v``), in which ``v`` is
a `Variable`, and ``g_v`` is a `Variable` that is a gradient wrt
``v``. More specifically, ``g_v`` is the gradient of an external
scalar cost, ``cost`` (that is not explicitly used), wrt ``v``.
This function traverses the graph backward from the ``r`` sources,
calling ``op.grad(...)`` for all ops with some non-None gradient
on an output, to compute gradients of ``cost`` wrt intermediate
variables and ``graph_inputs``.
The ``op.grad(...)`` functions are called like this:
.. code-block:: python
op.grad(op.inputs[:], [total_gradient(v) for v in op.outputs])
This call to ``op.grad`` should return a list or tuple: one symbolic
gradient per input. These gradients represent the gradients of
the same implicit ``cost`` mentionned above, wrt ``op.inputs``. Note
that this is **not** the same as the gradient of ``op.outputs`` wrt
``op.inputs``.
If ``op`` has a single input, then ``op.grad`` should return a list
or tuple of length 1.
For each input wrt to which ``op`` is not differentiable, it should
return ``None`` instead of a `Variable` instance.
If a source ``r`` receives a gradient from another source ``r2``,
then the effective gradient on ``r`` is the sum of both gradients.
:type sources: list of pairs of Variable: (v, gradient-on-v) to
initialize the total_gradient dictionary
:param sources: gradients to back-propagate using chain rule
:type graph_inputs: list of Variable
:param graph_inputs: variables considered to be constant
(do not backpropagate through them)
:rtype: dictionary whose keys and values are of type Variable
:return: mapping from each Variable encountered in the backward
traversal to the gradient with respect to that Variable.
It is assumed that there is some objective J shared between all members of
sources, so that for each v, gradient-on-v is the gradient of J with
respect to v
"""
outputs
,
output_grads
=
zip
(
*
sources
)
for
output_grad
in
output_grads
:
if
not
hasattr
(
output_grad
,
'type'
):
raise
TypeError
(
'output grads must be theano variables.'
'Ambiguous whether
%
s should be made into tensor'
' or sparse theano variable'
%
str
(
type
(
output_grad
)))
if
graph_inputs
is
None
:
graph_inputs
=
gof
.
graph
.
inputs
(
outputs
)
wrt
=
graph_inputs
var_to_node_to_idx
=
_populate_var_to_node_to_idx
(
outputs
,
wrt
)
# build a dict mapping var to the gradient of cost with respect to var
grad_dict
=
{}
for
output
,
output_grad
in
sources
:
# The gradient of the cost should always be 0 if the cost is of
# discrete (integer) dtype.
if
getattr
(
output
.
type
,
'dtype'
,
''
)
not
in
theano
.
tensor
.
float_dtypes
:
output_grad
=
output
.
zeros_like
()
else
:
# Cast the provided gradient so that it has the same dtype
# as the cost.
output_grad
=
output_grad
.
astype
(
output
.
type
.
dtype
)
grad_dict
[
output
]
=
output_grad
# variables that do not influence the cost have zero gradient.
# if wrt is such a variable, populate the grad_dict with this info
# so that wrt not being in var_to_node_to_idx won't cause an error below
# according to the flag, possibly raise an error if wrt is disconnected
for
elem
in
wrt
:
if
elem
not
in
var_to_node_to_idx
and
elem
not
in
outputs
:
grad_dict
[
elem
]
=
DisconnectedType
()()
_populate_grad_dict
(
var_to_node_to_idx
,
grad_dict
,
wrt
)
# post-process out the DisconnectedTypes
for
key
in
grad_dict
:
if
isinstance
(
grad_dict
[
key
]
.
type
,
DisconnectedType
):
if
hasattr
(
key
,
'zeros_like'
):
grad_dict
[
key
]
=
_float_zeros_like
(
key
)
return
grad_dict
def
_float_zeros_like
(
x
):
""" Like zeros_like, but forces the object to have a
a floating point dtype """
...
...
@@ -1634,3 +1556,32 @@ def hessian(cost, wrt, consider_constant=None,
"script that generated the error)"
)
hessians
.
append
(
hess
)
return
format_as
(
using_list
,
using_tuple
,
hessians
)
def
_is_zero
(
x
):
"""
Returns 'yes', 'no', or 'maybe' indicating whether x
is always 0.
'maybe' means that x is an expression that is complicated enough
that we can't tell that it simplifies to 0.
"""
if
not
hasattr
(
x
,
'type'
):
return
np
.
all
(
x
==
0.
)
if
isinstance
(
x
.
type
,
NullType
):
return
'no'
if
isinstance
(
x
.
type
,
DisconnectedType
):
return
'yes'
no_constant_value
=
True
try
:
constant_value
=
theano
.
get_constant_value
(
x
)
no_constant_value
=
False
except
TypeError
:
pass
if
no_constant_value
:
return
'maybe'
if
constant_value
!=
0.
:
return
'no'
return
'yes'
theano/scan_module/scan_op.py
浏览文件 @
40bbb7da
...
...
@@ -221,7 +221,8 @@ class Scan(PureOp):
'following error has been encountered: The '
'
%
s
%
s (argument number
%
d) has dtype '
'
%
s and
%
d dimension(s). The corresponding slice
%
s '
'however has dtype
%
s and
%
d dimension(s). This '
'however has dtype
%
s and
%
d dimension(s) (it should '
'have the same dtype and one fewer dimensions). This '
'should never happen, please '
'report to theano-dev mailing list'
)
...
...
@@ -1261,11 +1262,9 @@ class Scan(PureOp):
if
x
in
diff_inputs
]
for
x
in
consider_inps
:
try
:
_gmp
=
gradient
.
grad_sources_inputs
(
[(
y
,
g_y
)],
[
x
])
gmp
[
x
]
=
_gmp
[
x
]
except
TypeError
:
gmp
[
x
]
=
gradient
.
grad
(
cost
=
None
,
known_grads
=
{
y
:
g_y
},
wrt
=
x
)
except
gradient
.
NullTypeGradError
:
# It means the gradient is undefined (which implies
# is connected)
gmp
[
x
]
=
x
...
...
@@ -1374,11 +1373,21 @@ class Scan(PureOp):
self
.
inner_nitsot_outs
(
self_outputs
))
def
compute_gradient
(
y
,
g_y
):
gmp
=
gradient
.
grad_sources_inputs
(
[(
y
,
g_y
)],
[
x
for
x
in
theano
.
gof
.
graph
.
inputs
([
y
])
if
x
in
diff_inputs
])
return
[
gmp
.
get
(
p
,
None
)
for
p
in
diff_inputs
]
if
'int'
in
str
(
g_y
.
dtype
):
raise
TypeError
(
"Gradients may never be integers but g_y "
"has type "
+
str
(
g_y
.
type
))
wrt
=
[
x
for
x
in
theano
.
gof
.
graph
.
inputs
([
y
])
if
x
in
diff_inputs
]
grads
=
gradient
.
grad
(
cost
=
None
,
known_grads
=
{
y
:
g_y
},
wrt
=
wrt
,
consider_constant
=
wrt
,
disconnected_inputs
=
'ignore'
,
return_disconnected
=
'None'
)
gmp
=
dict
(
zip
(
wrt
,
grads
))
rval
=
[
gmp
.
get
(
p
,
None
)
for
p
in
diff_inputs
]
return
rval
dC_dinps_t
=
[
None
for
inp
in
diff_inputs
]
disconnected_dC_dinps_t
=
[
True
for
inp
in
diff_inputs
]
dC_dXts
=
[]
...
...
theano/tensor/basic.py
浏览文件 @
40bbb7da
...
...
@@ -462,13 +462,27 @@ def _allclose(a, b, rtol=None, atol=None):
return
numpy
.
allclose
(
a
,
b
,
atol
=
atol_
,
rtol
=
rtol_
)
class
NotConstantError
(
TypeError
):
"""
Raised by get_constant_value if called on something that is
not constant.
For now it is a TypeError, to maintain the old interface
that get_constant_value should raise a TypeError in this
situation. However, this is unsafe because get_constant_value
could inadvertently raise a TypeError if it has a bug.
So we should eventually make NotConstantError derive
from Exception directly, and modify all code that uses
get_constant_value to catch this more specific exception.
"""
pass
def
get_constant_value
(
v
):
"""return the constant scalar(0-D) value underlying variable `v`
If v is the output of dimshuffles, fills, allocs, rebroadcasts, cast
this function digs through them.
If `v` is not some view of constant data, then raise a
Type
Error.
If `v` is not some view of constant data, then raise a
NotConstant
Error.
:note: There may be another function similar to this one in the
code, but I'm not sure where it is.
...
...
@@ -488,7 +502,7 @@ def get_constant_value(v):
numpy
.
complex
(
data
)
# works for all numeric scalars
return
data
except
Exception
:
raise
Type
Error
(
raise
NotConstant
Error
(
'v.data is non-numeric, non-scalar, or has more than one'
' unique value'
,
v
)
if
v
.
owner
:
...
...
@@ -516,9 +530,17 @@ def get_constant_value(v):
v
.
owner
.
op
.
perform
(
v
.
owner
,
[
const
],
ret
)
return
ret
[
0
][
0
]
if
isinstance
(
v
.
owner
.
op
,
Subtensor
)
and
v
.
ndim
==
0
:
if
isinstance
(
v
.
owner
.
inputs
[
0
],
TensorConstant
):
return
v
.
owner
.
inputs
[
0
]
.
data
.
__getitem__
(
# This condition depends on Subtensor always embedding constant
# indices in the Op rather than making them inputs to the Apply node
if
isinstance
(
v
.
owner
.
inputs
[
0
],
TensorConstant
)
and
\
len
(
v
.
owner
.
inputs
)
==
1
:
try
:
return
v
.
owner
.
inputs
[
0
]
.
data
.
__getitem__
(
tuple
(
v
.
owner
.
op
.
idx_list
))
except
IndexError
:
raise
IndexError
(
str
(
tuple
(
v
.
owner
.
op
.
idx_list
))
+
" is not a valid index into "
+
\
str
(
v
.
owner
.
inputs
[
0
]
.
data
))
# The index list 'idx_list' should have length the same
# shape as the input.
...
...
@@ -3780,7 +3802,7 @@ class AdvancedIndexingError(TypeError):
class
Subtensor
(
Op
):
"""Return a subtensor view
The inputs array is the tensor x, followed by scalar integer
variabl
es.
The inputs array is the tensor x, followed by scalar integer
typ
es.
TODO: WRITEME: how are the scalar integer variables formatted?
This class uses a relatively complex internal representation of the inputs
...
...
@@ -3789,7 +3811,7 @@ class Subtensor(Op):
idx_list: instance variable TODO: WRITEME: is this a list or a tuple?
(old docstring gives two conflicting
descriptions)
elements are either integers, theano scalars, or slices.
elements are either integers, theano scalar
type
s, or slices.
one element per "explicitly named dimension"
TODO: WRITEME: what is an "explicitly named dimension" ?
...
...
@@ -3798,7 +3820,11 @@ class Subtensor(Op):
if slice:
start/stop/step members of each slice are integer indices
into the inputs array or None
integer indices be actual integers or theano scalars
integer indices be actual integers or theano scalar types
Note that the idx_list defines the Op, so two Subtensor instances are
considered to be different Ops if they have different idx_list fields.
This means that the entries in it are theano Types, not theano Variables.
@todo: add support for advanced tensor indexing (in Subtensor_dx too).
...
...
@@ -3816,6 +3842,17 @@ class Subtensor(Op):
@staticmethod
def
collapse
(
idxs
,
cond
):
"""
idxs: a list of indices or slices.
cond: a callable that returns a bool
returns: idxs, with the slices flattened out into a list.
if cond is true for an entry, does not flatten it.
"""
ret
=
[]
def
helper
(
entry
):
...
...
@@ -3828,10 +3865,20 @@ class Subtensor(Op):
for
idx
in
idxs
:
helper
(
idx
)
return
ret
@staticmethod
def
convert
(
entry
,
slice_ok
=
True
):
"""
The "idx_list" field is unique to each Subtensor instance.
It is not unique to each Apply node, so it should not refer to
specific Variables. This method changes references to Variables
into references to Types.
TODO: WRITEME: This method also accepts "entry" already being a Type;
when would that happen?
"""
invalid_scal_types
=
[
scal
.
float64
,
scal
.
float32
]
scal_types
=
[
scal
.
int64
,
scal
.
int32
,
scal
.
int16
,
scal
.
int8
]
tensor_types
=
[
lscalar
,
iscalar
,
wscalar
,
bscalar
]
...
...
theano/tensor/nnet/conv.py
浏览文件 @
40bbb7da
...
...
@@ -801,10 +801,9 @@ class ConvOp(OpenMPOp):
# mimic what happens inside theano.grad: get the input gradient
# of the final cost wrt all variables involved.
tmp_gmap
=
theano
.
gradient
.
grad_sources_inputs
(
[(
node
,
gz
)],
[
inputs
,
kerns
])
return
theano
.
gradient
.
grad
(
cost
=
None
,
known_grads
=
{
node
:
gz
},
wrt
=
[
inputs
,
kerns
])
return
[
tmp_gmap
[
inputs
],
tmp_gmap
[
kerns
]]
if
self
.
dx
not
in
(
1
,
2
)
or
self
.
dy
not
in
(
1
,
2
):
raise
NotImplementedError
(
...
...
theano/tests/test_gradient.py
浏览文件 @
40bbb7da
...
...
@@ -6,7 +6,6 @@ import unittest
import
theano
from
theano
import
gof
from
theano.gradient
import
grad_sources_inputs
from
theano
import
gradient
from
theano.tensor.nnet.Conv3D
import
conv3D
from
theano
import
config
...
...
@@ -16,6 +15,16 @@ from theano.gof.null_type import NullType
one
=
theano
.
tensor
.
as_tensor_variable
(
1.
)
def grad_sources_inputs(sources, inputs):
    """
    Compatibility wrapper: expresses the old ``grad_sources_inputs``
    interface in terms of ``theano.gradient.grad`` so the existing
    tests in this module do not need to be rewritten.

    sources: list of (variable, output_gradient) pairs.
    inputs: list of variables to differentiate with respect to, or
        None to use the graph inputs of the source variables.
    returns: dict mapping each input variable to its gradient.
    """
    if inputs is None:
        source_vars = [source[0] for source in sources]
        inputs = theano.gof.graph.inputs(source_vars)
    grads = theano.gradient.grad(cost=None,
                                 known_grads=dict(sources),
                                 wrt=inputs,
                                 consider_constant=inputs)
    return dict(zip(inputs, grads))
class
testgrad_sources_inputs
(
unittest
.
TestCase
):
def
test_retNone1
(
self
):
...
...
@@ -369,35 +378,6 @@ class test_grad(unittest.TestCase):
# If we made it to here without an exception, then the
# connection_pattern functionality worked correctly
def test_sum_disconnected(self):
    """Summing DisconnectedType gradients with other terms must not crash."""
    inp = theano.tensor.scalar()
    doubled = inp * 2.
    shifted = inp + 1.
    total = doubled + shifted
    # Both intermediate terms are held constant, so each contributes a
    # DisconnectedType gradient.  In an earlier version of theano the
    # call below failed while trying to add two DisconnectedTypes.
    theano.tensor.grad(total, inp, consider_constant=[doubled, shifted])
def test_output_grad_on_int(self):
    """When x has a discrete dtype, an explicit g_cost must be
    equivalent to 0; anything else should be rejected."""
    x = theano.tensor.iscalar('x')
    y = x * 2

    # Zero-valued output gradients are accepted, whatever their dtype.
    theano.grad(y, x, g_cost=theano.tensor.constant(0))
    theano.grad(y, x, g_cost=y.zeros_like())
    theano.grad(y, x, g_cost=y.zeros_like().astype('float64'))

    # A nonzero constant must raise ValueError ...
    self.assertRaises(ValueError, theano.grad, y, x,
                      g_cost=theano.tensor.constant(1))
    # ... and so must a shared variable, whose value is not known to be 0.
    self.assertRaises(ValueError, theano.grad, y, x,
                      g_cost=theano.shared(np.zeros((), dtype='int8')))
def
test_downcast_dtype
(
self
):
# Test that the gradient of a cost wrt a float32 variable does not
# get upcasted to float64.
...
...
@@ -418,6 +398,124 @@ class test_grad(unittest.TestCase):
# be downcasted to float32, so dc_dx should also be float32
assert
dc_dx
.
dtype
==
'float32'
def test_grad_constant(self):
    """The gradient must treat Constants and consider_constant
    variables consistently."""
    x = theano.tensor.scalar()
    y = theano.tensor.scalar()
    via_variable = x + y
    via_constant = one + y
    # Gradient wrt a variable that is explicitly held constant ...
    grad_cc = theano.tensor.grad(via_variable, x, consider_constant=[x])
    # ... should match the gradient wrt an actual Constant.
    grad_const = theano.tensor.grad(via_constant, one)
    f = theano.function([x, y], [grad_cc, grad_const])
    val_cc, val_const = f(1, .5)
    if not np.allclose(val_cc, val_const):
        raise AssertionError("Gradient using consider constant is "
                + str(val_cc)
                + " but gradient with respect to the same Constant is "
                + str(val_const))
def test_known_grads():
    # Tests that the grad method with no known_grads
    # matches what happens if you put its own known_grads
    # in for each variable

    # Build an expression graph with several distinct "cut sets" of
    # variables between the cost and the inputs.
    full_range = theano.tensor.arange(10)
    x = theano.tensor.scalar('x')
    t = theano.tensor.iscalar('t')
    ft = full_range[t]
    ft.name = 'ft'
    coeffs = theano.tensor.vector('c')
    ct = coeffs[t]
    ct.name = 'ct'
    p = x ** ft
    p.name = 'p'
    y = ct * p
    y.name = 'y'
    cost = theano.tensor.sqr(y)
    cost.name = 'cost'

    # Each entry is one cut of the graph, ordered from the cost down
    # toward the inputs.
    layers = [[cost], [y], [ct, p], [ct, x, ft], [coeffs, t, full_range, x]]

    inputs = [coeffs, t, x]
    # Fixed seed so the comparison below is deterministic.
    rng = np.random.RandomState([2012, 11, 15])
    values = [rng.randn(10), rng.randint(10), rng.randn()]
    # Cast each drawn value to the dtype its input variable expects.
    values = [np.cast[ipt.dtype](value)
              for ipt, value in zip(inputs, values)]

    # Reference gradients: one full backprop straight from the cost.
    true_grads = theano.tensor.grad(cost, inputs,
                                    disconnected_inputs='ignore')
    true_grads = theano.function(inputs, true_grads)
    true_grads = true_grads(*values)

    for layer in layers:
        print 'Testing by separately computing ', layer
        # First stop the backprop at this layer ...
        first = theano.tensor.grad(cost, layer, disconnected_inputs='ignore')
        known = dict(zip(layer, first))
        # ... then resume it from there via known_grads; the result
        # must match the single-pass reference gradients.
        full = theano.tensor.grad(cost=None, known_grads=known,
                                  wrt=inputs, disconnected_inputs='ignore')
        full = theano.function(inputs, full)
        full = full(*values)
        assert len(true_grads) == len(full)
        for a, b, var in zip(true_grads, full, inputs):
            if not np.allclose(a, b):
                # Dump everything useful before failing.
                print 'Failure'
                print a
                print b
                print var
                print layer
                for v in known:
                    print v, ':', theano.function(inputs, known[v])(*values)
                assert False
def test_dxdx():
    """The gradient of a scalar with respect to itself is 1.

    An integer variable is used here on purpose: people keep changing
    this gradient to be 0 on integers, but according to our
    interpretation of the gradient as defined in the Op contract it
    should be 1.  If you feel the need to change this unit test you
    are probably modifying the Op contract and should definitely get
    the approval of multiple people on theano-dev.
    """
    v = theano.tensor.iscalar()
    result = theano.tensor.grad(v, v).eval({v: 12})
    assert np.allclose(result, 1.)
def test_known_grads_integers():
    """known_grads must work when the wrt variable has an integer dtype.

    Backpropagating from x itself, the gradient wrt x is simply the
    output gradient that was fed in, so the compiled function should
    return its argument unchanged.
    """
    x = theano.tensor.iscalar()
    g_expected = theano.tensor.scalar()
    g_grad = theano.gradient.grad(cost=None,
                                  known_grads={x: g_expected},
                                  wrt=x)
    f = theano.function([g_expected], g_grad)

    # NOTE(review): the original code rebound the Python name `x` to -3
    # here; that assignment was dead (never read) and shadowed the
    # symbolic variable, so it has been removed.
    gv = np.cast[theano.config.floatX](.6)
    g_actual = f(gv)
    assert np.allclose(g_actual, gv)
if __name__ == '__main__':
    # When run as a script, discover and run the unittest.TestCase
    # classes defined in this module.
    unittest.main()
theano/tests/test_rop.py
浏览文件 @
40bbb7da
...
...
@@ -341,15 +341,9 @@ class test_RopLop(RopLop_checker):
rop_out2
=
tensor
.
Rop
((
m
,
v
,
m
+
v
),
[
m
,
v
],
[
m_
,
v_
])
assert
isinstance
(
rop_out2
,
tuple
)
assert
len
(
rop_out2
)
==
3
lop_out1
=
tensor
.
Lop
([
m
,
v
,
m
+
v
],
(
m
,
v
),
[
m_
,
v_
])
assert
isinstance
(
lop_out1
,
tuple
)
assert
len
(
lop_out1
)
==
2
lop_out2
=
tensor
.
Lop
((
m
,
v
,
m
+
v
),
[
m
,
v
],
[
m_
,
v_
])
assert
isinstance
(
lop_out2
,
list
)
assert
len
(
lop_out2
)
==
2
all_outs
=
[]
for
o
in
rop_out1
,
rop_out2
,
lop_out1
,
lop_out2
:
for
o
in
rop_out1
,
rop_out2
:
all_outs
.
extend
(
o
)
f
=
theano
.
function
([
m
,
v
,
m_
,
v_
],
all_outs
)
f
(
mval
,
vval
,
m_val
,
v_val
)
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论