提交 ece4c2e4 authored 作者: khaotik's avatar khaotik 提交者: khaotik

better readability / draft for OfG R_op

上级 8d9fa9e5
......@@ -13,11 +13,112 @@ from theano.gof.graph import io_connection_pattern
class OpFromGraph(gof.Op):
"""
class for Ops with user-defined inner graph
This creates an `Op` from inputs and outputs lists of variables.
The signature is similar to theano.function() and the resulting
`Op`'s perform will do the same operation as::
orig_function(inputs, outputs, **kwargs)
Currently does not support 'updates' or 'givens' argument.
Parameters
----------
inputs: list of variables
outputs: list of variables
inline: bool, optional
if True, will cause the Op's original graph being used during
compilation, otherwise will use a pre-compiled function inside.
grad_overrides: None | function | list of (None|function), optional
Used to override default gradient routine.
Overriding function(s) must take two lists of variables as inputs:
the original inputs and the upstream gradients
For different `grad_overrides`:
- `None` : will use default gradient routine.
- function : must return list of Variable.
- list : each function must return a single Variable. The order
of the list must correspond to the inputs
TODO:
- examples for a multi-layer mlp. where?
- __hash__, __eq__ otherwise won't merge, try
gof.opt.is_same_graph_with_merge(op1.local_outputs, op2,
local_outputs)
- c_code() to remove the double overhead?
- grad() make it support DisconnectedType and the new interface
- implement R_op()
- check how it works with updates.
- add test with constant as input or inside the inner graph.
- Add support for the GPU? Probably just need an opt to remove transfer
- Add support to pickle this Op.
- Add support/test with random generator
- Recursion detection to prevent Op "forkbomb", either set depth
limit or manually check them.
Notes
-----
- We support shared variables in the inner graph. This is automatic
and invisible to the user. They can be as input to the node or in
the inner graph.
- We support unused inputs. This is needed for the grad.
- `inline=True` will cause better runtime optimization at the cost
of compilation time. Like "inline" keyword in C, this is merely a
suggestion to compiler which is not guaranteed. Currently only
works with "fast_compile" or "fast_run" mode.
Examples
--------
Example 1:
.. code-block:: python
from theano import function, op_from_graph, tensor
x, y, z = tensor.scalars('xyz')
e = x + y * z
op = op_from_graph([x, y, z], [e])
# op behaves like a normal theano op
e2 = op(x, y, z) + op(z, y, x)
fn = function([x, y, z], [e2])
Example 2 with shared variable:
.. code-block:: python
import numpy as np
import theano
from theano import config, function, op_from_graph, tensor
x, y, z = tensor.scalars('xyz')
s = theano.shared(np.random.rand(2, 2).astype(config.floatX))
e = x + y * z + s
op = op_from_graph([x, y, z], [e])
# op behaves like a normal theano op
e2 = op(x, y, z) + op(z, y, x)
fn = function([x, y, z], [e2])
Example 3 override gradient
.. code-block:: python
from theano import function, op_from_graph, tensor, grad
x, y, z = tensor.scalars('xyz')
e = x + y * z
def rescale_dy(inps, grads):
x, y, z = inps
g = grads
return z*2
op = op_from_graph(
[x, y, z], [e], grad_overrides=[None, rescale_dy, None])
e2 = op(x, y, z)
dx, dy, dz = grad(e2, [x, y, z])
fn = function([x, y, z], [dx, dy, dz])
# the gradient wrt y is now doubled
fn(2., 3., 4.) # [1., 8., 3.]
"""
# NOTE: if you make a subclass of this, make sure add test for it under:
# theano/compile/tests/test_builders.py
def __init__(self, inputs, outputs, inline=False, grad_overrides=None, **kwargs):
def __init__(self, inputs, outputs, inline=False, grad_overrides=None, rop_overrides=None, **kwargs):
if not isinstance(outputs, list):
raise TypeError('outputs must be list', outputs)
for i in inputs + outputs:
......@@ -52,18 +153,11 @@ class OpFromGraph(gof.Op):
self.kwargs = kwargs
self.input_types = [inp.type for inp in inputs]
self.output_types = [out.type for out in outputs]
# grad_op: a functor takes form:
#
# def grad_op(inputs:list, ups_grads:list):
# return dns_grads:list
#
# This is used to cache gradient for subgraph
# for __init__, just set as grad_overrides
#
# grad_op should be build on the 1st call to grad()
# after which grad_op_is_cached should be True
self.grad_op = grad_overrides
self.grad_op_is_cached = False
self.set_grad_overrides(grad_overrides)
# TODO
if rop_overrides is not None:
raise NotImplementedError('Overriding Rop is not implemented yet.')
def __eq__(self, other):
# TODO: recognize a copy
......@@ -73,46 +167,67 @@ class OpFromGraph(gof.Op):
# TODO: use internal variables in hash
return hash(type(self))
def grad(self, inputs, output_grads):
if self.grad_op_is_cached:
return self.grad_op(inputs, output_grads)
# TODO impl me
# def R_op(self, inputs, eval_points):
# pass
if self.grad_op is None:
self.grad_op = []
def _recompute_grad_op(self):
output_grads = [out_t() for out_t in self.output_types]
if self._grad_op is None:
self._grad_op = []
# we need to convert a list into a single funtor
if isinstance(self.grad_op, list):
grad_op_l = self.grad_op
if len(grad_op_l) > len(self.local_inputs):
# we need to convert a list/function into an OfG instance
if isinstance(self._grad_op, list):
goverrides_l = self._grad_op
if len(goverrides_l) > len(self.local_inputs):
raise ValueError(
'Can override %d gradients at most, got %d' % (
len(self.local_inputs), len(grad_op_l)))
if len(grad_op_l) < len(self.local_inputs):
grad_op_l += [None] * (
len(self.local_inputs) - len(grad_op_l))
wrt = [self.local_inputs[i] for i, go in
enumerate(grad_op_l) if not go]
# compute non-overriding downsteam gradients from upstreams grads
len(self.local_inputs), len(goverrides_l)))
if len(goverrides_l) < len(self.local_inputs):
goverrides_l += [None] * (
len(self.local_inputs) - len(goverrides_l))
wrt_l = [lin for lin, gov in
izip(self.local_inputs, goverrides_l) if not gov]
# compute non-overriding downsteam grads from upstreams grads
# it's normal some input may be disconnected, thus the 'ignore'
ups_grads_d = dict(izip(self.local_outputs, output_grads))
nat_dns_grads = iter(theano.gradient.grad(
gdefaults = iter(theano.gradient.grad(
cost=None,
known_grads=ups_grads_d,
wrt=wrt,
disconnected_inputs='ignore'))
known_grads=dict(izip(self.local_outputs, output_grads)),
wrt=wrt_l,
disconnected_inputs='ignore') if wrt_l else [])
# combine overriding gradients
dns_grads_l = [
go(self.local_inputs, output_grads) if go else next(nat_dns_grads) for go in grad_op_l]
grad_ofg = type(self)(
inputs=self.local_inputs + output_grads,
outputs=dns_grads_l,
inline=self.is_inline, on_unused_input='ignore')
def grad_op(inps, grds):
return grad_ofg(*(list(inps) + list(grds)))
self.grad_op = grad_op
self.grad_op_is_cached = True
return self.grad_op(inputs, output_grads)
all_grads_l = [
gov(self.local_inputs, output_grads) if gov
else next(gdefaults) for gov in goverrides_l]
else:
all_grads_l = self._grad_op(self.local_inputs, output_grads)
self._grad_op = type(self)(
inputs=self.local_inputs + output_grads,
outputs=all_grads_l,
inline=self.is_inline, on_unused_input='ignore')
self._grad_op_is_cached = True
def get_grad_op(self):
    """
    Return the gradient Op for this instance, building it on first
    access (lazy getter for ``self._grad_op``).
    """
    if self._grad_op_is_cached:
        return self._grad_op
    self._recompute_grad_op()
    return self._grad_op
def set_grad_overrides(self, grad_overrides):
    """
    Set gradient overrides; see help(theano.OpFromGraph) for syntax.

    This completely removes any previously set gradient overrides and
    invalidates the cached gradient Op.
    """
    # invalidate the cache first, then store the raw overrides; they are
    # converted into an Op lazily by _recompute_grad_op()
    self._grad_op_is_cached = False
    self._grad_op = grad_overrides
def grad(self, inputs, output_grads):
    """
    Return symbolic gradients of the inputs given upstream gradients.

    Delegates to the cached gradient Op, (re)building it if needed.
    """
    if not self._grad_op_is_cached:
        self._recompute_grad_op()
    call_args = list(inputs) + list(output_grads)
    return self._grad_op(*call_args)
def make_node(self, *inputs):
for input, type in zip(inputs, self.input_types):
......@@ -164,6 +279,7 @@ class OpFromGraph(gof.Op):
self.fn = orig_function(self.local_inputs,
self.local_outputs,
**self.kwargs)
self.fn.trust_input = True
def perform(self, node, inputs, outputs):
variables = self.fn(*inputs)
......@@ -178,7 +294,7 @@ class OpFromGraph(gof.Op):
def inline_ofg_expansion(node):
"""
This optimization expands internal graph of OpFromGraph.
Only performed if node.op.is_inline == True
Doing so can improve optimization at the cost of compilation speed.
"""
op = node.op
......@@ -201,112 +317,3 @@ optdb.register(
ops_with_inner_function[OpFromGraph] = 'fn'
# API for OpFromGraph
def op_from_graph(
    inputs, outputs, inline=False, grad_overrides=None,
    rop_overrides=None, **kwargs
):
    """
    This creates an `Op` from inputs and outputs lists of variables.
    The signature is similar to theano.function() and the resulting
    `Op`'s perform will do the same operation as::
        orig_function(inputs, outputs, **kwargs)
    Currently does not support 'updates' or 'givens' argument.
    Parameters
    ----------
    inputs: list of variables
    outputs: list of variables
    inline: bool, optional
        if True, will cause the Op's original graph being used during
        compilation, otherwise will use a pre-compiled function inside.
    grad_overrides: None | function | list of (None|function), optional
        Used to override default gradient routine.
        Overriding function(s) must take two lists of variables as
        inputs: the original inputs and the upstream gradients.
        For different `grad_overrides`:
        - `None` : will use default gradient routine.
        - function : must return list of Variable.
        - list : each function must return a single Variable. The order
          of the list must correspond to the inputs
    rop_overrides: None, optional
        Reserved for overriding the default R_op routine; currently only
        `None` is accepted (overriding Rop is not implemented yet).
    TODO:
        - examples for a multi-layer mlp. where?
        - __hash__, __eq__ otherwise won't merge, try
          gof.opt.is_same_graph_with_merge(op1.local_outputs, op2,
          local_outputs)
        - c_code() to remove the double overhead?
        - grad() make it support DisconnectedType and the new interface
        - check how it works with updates.
        - add test with constant as input or inside the inner graph.
        - Add support for the GPU? Probably just need an opt to remove transfer
        - Add support to pickle this Op.
        - Add support/test with random generator
        - Recursion detection to prevent Op "forkbomb", either set depth
          limit or manually check them.
    Notes
    -----
    - We support shared variables in the inner graph. This is automatic
      and invisible to the user. They can be as input to the node or in
      the inner graph.
    - We support unused inputs. This is needed for the grad.
    - `inline=True` will cause better runtime optimization at the cost
      of compilation time. Like "inline" keyword in C, this is merely a
      suggestion to compiler which is not guaranteed. Currently only
      works with "fast_compile" or "fast_run" mode.
    Examples
    --------
    Example 1:
    .. code-block:: python
        from theano import function, op_from_graph, tensor
        x, y, z = tensor.scalars('xyz')
        e = x + y * z
        op = op_from_graph([x, y, z], [e])
        # op behaves like a normal theano op
        e2 = op(x, y, z) + op(z, y, x)
        fn = function([x, y, z], [e2])
    Example 2 with shared variable:
    .. code-block:: python
        import numpy as np
        import theano
        from theano import config, function, op_from_graph, tensor
        x, y, z = tensor.scalars('xyz')
        s = theano.shared(np.random.rand(2, 2).astype(config.floatX))
        e = x + y * z + s
        op = op_from_graph([x, y, z], [e])
        # op behaves like a normal theano op
        e2 = op(x, y, z) + op(z, y, x)
        fn = function([x, y, z], [e2])
    Example 3 override gradient
    .. code-block:: python
        from theano import function, op_from_graph, tensor, grad
        x, y, z = tensor.scalars('xyz')
        e = x + y * z
        def rescale_dy(inps, grads):
            x, y, z = inps
            g = grads
            return z*2
        op = op_from_graph(
            [x, y, z], [e], grad_overrides=[None, rescale_dy, None])
        e2 = op(x, y, z)
        dx, dy, dz = grad(e2, [x, y, z])
        fn = function([x, y, z], [dx, dy, dz])
        # the gradient wrt y is now doubled
        fn(2., 3., 4.) # [1., 8., 3.]
    """
    # thin functional wrapper: all real work happens in OpFromGraph;
    # rop_overrides is forwarded for signature consistency with __init__
    # (it currently raises NotImplementedError unless None).
    return OpFromGraph(
        inputs, outputs, inline=inline, grad_overrides=grad_overrides,
        rop_overrides=rop_overrides, **kwargs)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论