提交 ece4c2e4 authored 作者: khaotik's avatar khaotik 提交者: khaotik

better readability / draft for OfG R_op

上级 8d9fa9e5
......@@ -13,11 +13,112 @@ from theano.gof.graph import io_connection_pattern
class OpFromGraph(gof.Op):
"""
class for Ops with user-defined inner graph
This creates an `Op` from inputs and outputs lists of variables.
The signature is similar to theano.function() and the resulting
`Op`'s perform will do the same operation as::
orig_function(inputs, outputs, **kwargs)
Currently does not support 'updates' or 'givens' argument.
Parameters
----------
inputs: list of variables
outputs: list of variables
inline: bool, optional
if True, will cause the Op's original graph being used during
compilation, otherwise will use a pre-compiled function inside.
grad_overrides: None | function | list of (None|function), optional
Used to override default gradient routine.
Overriding function(s) must take two lists of variables as inputs:
the original inputs and the upstream gradients
For different `grad_overrides`:
- `None` : will use default gradient routine.
- function : must return list of Variable.
- list : each function must return a single Variable. The order
of the list must correspond to the inputs
TODO:
- examples for a multi-layer mlp. where?
- __hash__, __eq__ otherwise won't merge, try
gof.opt.is_same_graph_with_merge(op1.local_outputs, op2,
local_outputs)
- c_code() to remove the double overhead?
- grad() make it support DisconnectedType and the new interface
- implement R_op()
- check how it works with updates.
- add test with constant as input or inside the inner graph.
- Add support for the GPU? Probably just need an opt to remove transfer
- Add support to pickle this Op.
- Add support/test with random generator
- Recursion detection to prevent Op "forkbomb", either set depth
limit or manually check them.
Notes
-----
- We support shared variables in the inner graph. This is automatic
and invisible to the user. They can be as input to the node or in
the inner graph.
- We support unused inputs. This is needed for the grad.
- `inline=True` will cause better runtime optimization at the cost
of compilation time. Like "inline" keyword in C, this is merely a
suggestion to compiler which is not guaranteed. Currently only
works with "fast_compile" or "fast_run" mode.
Examples
--------
Example 1:
.. code-block:: python
from theano import function, op_from_graph, tensor
x, y, z = tensor.scalars('xyz')
e = x + y * z
op = op_from_graph([x, y, z], [e])
# op behaves like a normal theano op
e2 = op(x, y, z) + op(z, y, x)
fn = function([x, y, z], [e2])
Example 2 with shared variable:
.. code-block:: python
import numpy as np
import theano
from theano import config, function, op_from_graph, tensor
x, y, z = tensor.scalars('xyz')
s = theano.shared(np.random.rand(2, 2).astype(config.floatX))
e = x + y * z + s
op = op_from_graph([x, y, z], [e])
# op behaves like a normal theano op
e2 = op(x, y, z) + op(z, y, x)
fn = function([x, y, z], [e2])
Example 3 override gradient
.. code-block:: python
from theano import function, op_from_graph, tensor, grad
x, y, z = tensor.scalars('xyz')
e = x + y * z
def rescale_dy(inps, grads):
x, y, z = inps
g = grads
return z*2
op = op_from_graph(
[x, y, z], [e], grad_overrides=[None, rescale_dy, None])
e2 = op(x, y, z)
dx, dy, dz = grad(e2, [x, y, z])
fn = function([x, y, z], [dx, dy, dz])
# the gradient wrt y is now doubled
fn(2., 3., 4.) # [1., 8., 3.]
"""
# NOTE: if you make a subclass of this, make sure add test for it under:
# theano/compile/tests/test_builders.py
def __init__(self, inputs, outputs, inline=False, grad_overrides=None, **kwargs):
def __init__(self, inputs, outputs, inline=False, grad_overrides=None, rop_overrides=None, **kwargs):
if not isinstance(outputs, list):
raise TypeError('outputs must be list', outputs)
for i in inputs + outputs:
......@@ -52,18 +153,11 @@ class OpFromGraph(gof.Op):
self.kwargs = kwargs
self.input_types = [inp.type for inp in inputs]
self.output_types = [out.type for out in outputs]
# grad_op: a functor takes form:
#
# def grad_op(inputs:list, ups_grads:list):
# return dns_grads:list
#
# This is used to cache gradient for subgraph
# for __init__, just set as grad_overrides
#
# grad_op should be build on the 1st call to grad()
# after which grad_op_is_cached should be True
self.grad_op = grad_overrides
self.grad_op_is_cached = False
self.set_grad_overrides(grad_overrides)
# TODO
if rop_overrides is not None:
raise NotImplementedError('Overriding Rop is not implemented yet.')
def __eq__(self, other):
# TODO: recognize a copy
......@@ -73,46 +167,67 @@ class OpFromGraph(gof.Op):
# TODO: use internal variables in hash
return hash(type(self))
def grad(self, inputs, output_grads):
if self.grad_op_is_cached:
return self.grad_op(inputs, output_grads)
# TODO impl me
# def R_op(self, inputs, eval_points):
# pass
if self.grad_op is None:
self.grad_op = []
def _recompute_grad_op(self):
output_grads = [out_t() for out_t in self.output_types]
if self._grad_op is None:
self._grad_op = []
# we need to convert a list into a single funtor
if isinstance(self.grad_op, list):
grad_op_l = self.grad_op
if len(grad_op_l) > len(self.local_inputs):
# we need to convert a list/function into an OfG instance
if isinstance(self._grad_op, list):
goverrides_l = self._grad_op
if len(goverrides_l) > len(self.local_inputs):
raise ValueError(
'Can override %d gradients at most, got %d' % (
len(self.local_inputs), len(grad_op_l)))
if len(grad_op_l) < len(self.local_inputs):
grad_op_l += [None] * (
len(self.local_inputs) - len(grad_op_l))
wrt = [self.local_inputs[i] for i, go in
enumerate(grad_op_l) if not go]
# compute non-overriding downsteam gradients from upstreams grads
len(self.local_inputs), len(goverrides_l)))
if len(goverrides_l) < len(self.local_inputs):
goverrides_l += [None] * (
len(self.local_inputs) - len(goverrides_l))
wrt_l = [lin for lin, gov in
izip(self.local_inputs, goverrides_l) if not gov]
# compute non-overriding downsteam grads from upstreams grads
# it's normal some input may be disconnected, thus the 'ignore'
ups_grads_d = dict(izip(self.local_outputs, output_grads))
nat_dns_grads = iter(theano.gradient.grad(
gdefaults = iter(theano.gradient.grad(
cost=None,
known_grads=ups_grads_d,
wrt=wrt,
disconnected_inputs='ignore'))
known_grads=dict(izip(self.local_outputs, output_grads)),
wrt=wrt_l,
disconnected_inputs='ignore') if wrt_l else [])
# combine overriding gradients
dns_grads_l = [
go(self.local_inputs, output_grads) if go else next(nat_dns_grads) for go in grad_op_l]
grad_ofg = type(self)(
inputs=self.local_inputs + output_grads,
outputs=dns_grads_l,
inline=self.is_inline, on_unused_input='ignore')
def grad_op(inps, grds):
return grad_ofg(*(list(inps) + list(grds)))
self.grad_op = grad_op
self.grad_op_is_cached = True
return self.grad_op(inputs, output_grads)
all_grads_l = [
gov(self.local_inputs, output_grads) if gov
else next(gdefaults) for gov in goverrides_l]
else:
all_grads_l = self._grad_op(self.local_inputs, output_grads)
self._grad_op = type(self)(
inputs=self.local_inputs + output_grads,
outputs=all_grads_l,
inline=self.is_inline, on_unused_input='ignore')
self._grad_op_is_cached = True
def get_grad_op(self):
    """
    Return the gradient Op for this instance, building it on first
    access (lazy getter for ``self._grad_op``).
    """
    if self._grad_op_is_cached:
        return self._grad_op
    self._recompute_grad_op()
    return self._grad_op
def set_grad_overrides(self, grad_overrides):
    """
    Set gradient overrides; see help(theano.OpFromGraph) for syntax.

    This completely removes any previously set gradient overrides and
    invalidates the cached gradient Op.
    """
    # invalidate the cache first, then store the raw overrides; they are
    # converted into an Op lazily by _recompute_grad_op()
    self._grad_op_is_cached = False
    self._grad_op = grad_overrides
def grad(self, inputs, output_grads):
    """
    Return symbolic gradients of the inputs given upstream gradients.

    Delegates to the cached gradient Op, (re)building it if needed.
    """
    if not self._grad_op_is_cached:
        self._recompute_grad_op()
    call_args = list(inputs) + list(output_grads)
    return self._grad_op(*call_args)
def make_node(self, *inputs):
for input, type in zip(inputs, self.input_types):
......@@ -164,6 +279,7 @@ class OpFromGraph(gof.Op):
self.fn = orig_function(self.local_inputs,
self.local_outputs,
**self.kwargs)
self.fn.trust_input = True
def perform(self, node, inputs, outputs):
variables = self.fn(*inputs)
......@@ -178,7 +294,7 @@ class OpFromGraph(gof.Op):
def inline_ofg_expansion(node):
"""
This optimization expands internal graph of OpFromGraph.
Only performed if node.op.is_inline == True
Doing so can improve optimization at the cost of compilation speed.
"""
op = node.op
......@@ -201,112 +317,3 @@ optdb.register(
ops_with_inner_function[OpFromGraph] = 'fn'
# API for OpFromGraph
def op_from_graph(
    inputs, outputs, inline=False, grad_overrides=None,
    rop_overrides=None, **kwargs
):
    """
    This creates an `Op` from inputs and outputs lists of variables.
    The signature is similar to theano.function() and the resulting
    `Op`'s perform will do the same operation as::
        orig_function(inputs, outputs, **kwargs)
    Currently does not support 'updates' or 'givens' argument.
    Parameters
    ----------
    inputs: list of variables
    outputs: list of variables
    inline: bool, optional
        if True, will cause the Op's original graph being used during
        compilation, otherwise will use a pre-compiled function inside.
    grad_overrides: None | function | list of (None|function), optional
        Used to override default gradient routine.
        Overriding function(s) must take two lists of variables as
        inputs: the original inputs and the upstream gradients.
        For different `grad_overrides`:
        - `None` : will use default gradient routine.
        - function : must return list of Variable.
        - list : each function must return a single Variable. The order
          of the list must correspond to the inputs
    rop_overrides: None, optional
        Reserved for overriding the default R_op routine; currently only
        `None` is accepted (overriding Rop is not implemented yet).
    TODO:
        - examples for a multi-layer mlp. where?
        - __hash__, __eq__ otherwise won't merge, try
          gof.opt.is_same_graph_with_merge(op1.local_outputs, op2,
          local_outputs)
        - c_code() to remove the double overhead?
        - grad() make it support DisconnectedType and the new interface
        - check how it works with updates.
        - add test with constant as input or inside the inner graph.
        - Add support for the GPU? Probably just need an opt to remove transfer
        - Add support to pickle this Op.
        - Add support/test with random generator
        - Recursion detection to prevent Op "forkbomb", either set depth
          limit or manually check them.
    Notes
    -----
    - We support shared variables in the inner graph. This is automatic
      and invisible to the user. They can be as input to the node or in
      the inner graph.
    - We support unused inputs. This is needed for the grad.
    - `inline=True` will cause better runtime optimization at the cost
      of compilation time. Like "inline" keyword in C, this is merely a
      suggestion to compiler which is not guaranteed. Currently only
      works with "fast_compile" or "fast_run" mode.
    Examples
    --------
    Example 1:
    .. code-block:: python
        from theano import function, op_from_graph, tensor
        x, y, z = tensor.scalars('xyz')
        e = x + y * z
        op = op_from_graph([x, y, z], [e])
        # op behaves like a normal theano op
        e2 = op(x, y, z) + op(z, y, x)
        fn = function([x, y, z], [e2])
    Example 2 with shared variable:
    .. code-block:: python
        import numpy as np
        import theano
        from theano import config, function, op_from_graph, tensor
        x, y, z = tensor.scalars('xyz')
        s = theano.shared(np.random.rand(2, 2).astype(config.floatX))
        e = x + y * z + s
        op = op_from_graph([x, y, z], [e])
        # op behaves like a normal theano op
        e2 = op(x, y, z) + op(z, y, x)
        fn = function([x, y, z], [e2])
    Example 3 override gradient
    .. code-block:: python
        from theano import function, op_from_graph, tensor, grad
        x, y, z = tensor.scalars('xyz')
        e = x + y * z
        def rescale_dy(inps, grads):
            x, y, z = inps
            g = grads
            return z*2
        op = op_from_graph(
            [x, y, z], [e], grad_overrides=[None, rescale_dy, None])
        e2 = op(x, y, z)
        dx, dy, dz = grad(e2, [x, y, z])
        fn = function([x, y, z], [dx, dy, dz])
        # the gradient wrt y is now doubled
        fn(2., 3., 4.) # [1., 8., 3.]
    """
    # thin functional wrapper: all real work happens in OpFromGraph;
    # rop_overrides is forwarded for signature consistency with __init__
    # (it currently raises NotImplementedError unless None).
    return OpFromGraph(
        inputs, outputs, inline=inline, grad_overrides=grad_overrides,
        rop_overrides=rop_overrides, **kwargs)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论