Final version of scan op

ed7f4669 · pascanur@simplet.iro.umontreal.ca · 268fc4f7 · ed7f4669 · ed7f4669
--- a/theano/sandbox/scan.py
+++ b/theano/sandbox/scan.py
@@ -4,14 +4,13 @@ __docformat__ = 'restructedtext en'
 import traceback
 import numpy 
 import theano
+import theano.compile.sandbox
 from theano.tensor import opt
 from theano import gof
 from theano.compile import optdb

 '''
- TODO : test_gradinet 
-        test_time_taps 
-        add_class_description -- postponed: re-write/extend
+ TODO : move out of sandbox !
 '''

 class Scan(theano.Op):
@@ -53,7 +52,8 @@ class Scan(theano.Op):
    """
    @classmethod
    def symbolic(cls,(in_args,out_args), n_ins, n_outs,\
-                n_inplace=0, n_inplace_ignore=0, grad_inplace=0,taps={}):
+                n_inplace=0, n_inplace_ignore=0, taps={},
+                mode = 'FAST_RUN'):
        
        # if in_args is not a list assume it is just a variable and 
        # convert it to a list (if this is neither the case the code will 
@@ -66,7 +66,7 @@ class Scan(theano.Op):
            out_args = [out_args]
 
        # Create fn 
-        my_fn   = theano.function(in_args, out_args)
+        my_fn   = theano.compile.sandbox.pfunc(in_args, out_args, mode = mode)

        # Create gradient function 
        gy_next  = [out_args[0].type()]
@@ -76,12 +76,12 @@ class Scan(theano.Op):
            g_ls = theano.tensor.grad(y_next,in_args,g_cost=gy_next[-1])
            for i in xrange(len(in_args)):
                g_inputs[i] += g_ls[i]
-            
-        g_fn=theano.function(inputs=gy_next+in_args,outputs=g_inputs)
+        g_fn=theano.compile.sandbox.pfunc(gy_next+in_args,g_inputs,
+                             mode=mode)

    
        return cls(my_fn, g_fn, n_ins, n_outs,\
-                   n_inplace,n_inplace_ignore, grad_inplace,taps)
+                   n_inplace,n_inplace_ignore, taps)

    @classmethod
    def compiled(cls,fn,n_ins, n_outs,\
@@ -92,8 +92,7 @@ class Scan(theano.Op):


    def __init__(self,fn,grad_fn,n_ins,n_outs,
-                 n_inplace=0, n_inplace_ignore=0,
-                 grad_inplace=0, 
+                 n_inplace=0, n_inplace_ignore=0,                 
                 taps={}, inplace=False):
        """Create an instance of the scan class

@@ -108,7 +107,10 @@ class Scan(theano.Op):
        need to give the initial state of each outputs, this will be from 
        'n_ins' to 'n_outs'; each initial state should be a matrix where 
        the first dimension is time and should be sufficiently large to 
-        cover the time taps.
+        cover the time taps. The matrix for an initial state should be 
+        ordered such that if you use k delays, index 0 of matrix stands for 
+        the value at time -k, index 1 for value at time 1-k, index 2 for 
+        value at time 2-k and index k-1 for value at time -1

        :param n_inplace: indicates the number of outputs that should be 
        computed inplace; in the list of arguments there will be the first
@@ -119,8 +121,6 @@ class Scan(theano.Op):
        should not be given as arguments to the function applied 
        recursevly

-        :param grad_inplace: the number of gradients to be computed in 
-        place of their corresponding inputs

        :param taps: a dictionary which for each output index gives
        a list of what taps it uses; a tap is given as an int, 
@@ -139,10 +139,6 @@ class Scan(theano.Op):
           (n_inplace > n_outs):
           raise ValueError('Number of inline outs should be smaller then'\
             'the number of inputs or outputs')
-        if (grad_inplace <0) or \
-           (grad_inplace >n_ins+n_outs - n_inplace_ignore):
-            raise ValueError('Wrong number of gradients to be computed'\
-                             'inplace')
        if (n_inplace < 0):
            raise ValueError('Number of inplace outputs should be larger '\
                             'or equal to 0')
@@ -170,7 +166,6 @@ class Scan(theano.Op):
        self.n_inplace_ignore = n_inplace_ignore
        self.fn = fn
        self.grad_fn = grad_fn
-        self.grad_inplace = grad_inplace

    def make_node(self,*inputs):
        """Create an node for the Scan operation
@@ -192,7 +187,7 @@ class Scan(theano.Op):
        out_types = []
        for i in xrange(self.n_ins,self.n_ins+self.n_outs):
            out_types += [theano.tensor.Tensor(dtype=inputs[i].dtype,\
-                    broadcastable=list(inputs[i].broadcastable))()]
+                    broadcastable=(False,)+inputs[i].broadcastable[1:])()]
        return theano.Apply(self,inputs, out_types)


@@ -208,8 +203,7 @@ class Scan(theano.Op):
                   (self.n_inplace == other.n_inplace) and \
                   (self.n_inplace_ignore == other.n_inplace_ignore) and\
                   (self.inplace == other.inplace) and\
-                   (self.taps == other.taps) and\
-                   (self.grad_inplace == other.grad_inplace)
+                   (self.taps == other.taps) 
        return rval

    def __hash__(self):
@@ -228,8 +222,7 @@ class Scan(theano.Op):
               hash(self.n_inplace) ^ \
               hash(self.n_inplace_ignore) ^\
               hash(self.inplace) ^\
-               taps_hash ^\
-               hash(self.grad_inplace)
+               taps_hash 



@@ -240,26 +233,26 @@ class Scan(theano.Op):
            print 'Warning! no gradient for the recursive function was given'
            return [None for i in inputs]
        else:
-            y = self(*inputs).owner.outputs
-#            if not( type(y) in (list,tuple)):
-#                y = [y]
+            y = self(*inputs)
+            if not( type(y) in (list,tuple)):
+                y = [y]
 
-            for o,go in zip(y,g_outs):
-                print o.type
-                print go.type
-                assert o.type == go.type
+            for i in xrange(len(y)):
+                if g_outs[i] == None:
+                    g_outs[i] = theano.tensor.zeros_like(y[i])

            # Construct my gradient class: 
            gradScan = ScanGrad(self.grad_fn, 
                            self.n_ins- self.n_inplace_ignore, self.n_outs,
-                            self.grad_inplace, self.taps)
-
+                            self.taps)

-            args = g_outs[self.n_inplace_ignore:] + y + \
+             
+            args = g_outs + y + \
                   inputs[self.n_inplace_ignore:]
+            
            grads = gradScan(*args)
-              
-            return [None for i in inputs[:self.n_inplace_ignore]]+grads
+            rval = [None for i in inputs[:self.n_inplace_ignore]]+grads
+            return rval


    def perform(self,node,args, outs):
@@ -324,7 +317,6 @@ class Scan(theano.Op):
            fn_args += list(args[(self.n_ins+self.n_outs):])
            # compute output
            something = self.fn(*fn_args)
-            
            # update y and inplace outputs
            for j in xrange(self.n_outs):
                y[j][i] = something[j]
@@ -341,7 +333,7 @@ def scan_make_inplace(node):
    if isinstance(op, Scan) and (not op.inplace) and (op.n_inplace>0):
        return Scan(op.fn, op.grad_fn, op.n_ins,\
                    op.n_outs, op.n_inplace, op.n_inplace_ignore,\
-                    op.grad_inplace,op.taps,inplace=True\
+                    op.taps,inplace=True\
                                       ).make_node(*node.inputs).outputs
    return False

@@ -354,18 +346,20 @@ optdb.register('scan_make_inplace', opt.in2out(scan_make_inplace,\
 class ScanGrad(theano.Op):
    """Gradient Op for Scan"""

-    def __init__(self, grad_fn, n_ins, n_outs, grad_inplace=0,
+    def __init__(self, grad_fn, n_ins, n_outs, 
                 taps = {},inplace=False):
        self.grad_fn = grad_fn
        self.n_ins = n_ins # number of inputs of Scan op not of Grad Scan !!
        self.n_outs = n_outs # number of outs of Scan op not of Grad Scan !!
-        self.grad_inplace = grad_inplace
        self.inplace = inplace
        self.taps = taps
        self.destroy_map = {}
        if self.inplace:
-          for i in xrange(self.grad_inplace):
-            self.destroy_map.update( {i:[i+n_ins+n_outs]} )
+          for i in xrange(self.n_outs):
+            # Claiming that output "-i" destroys input i is meant to declare
+            # that no real output is aliased to any input; the inputs are only
+            # used as workspace.  NOTE(review): for i == 0 the key -i is 0.
+            self.destroy_map.update( {-i:[i]})


    def __eq__(self,other): 
@@ -374,9 +368,8 @@ class ScanGrad(theano.Op):
           rval = (self.grad_fn is other.grad_fn) and \
                  (self.n_ins == other.n_ins) and \
                  (self.n_outs == other.n_outs) and \
-                  (self.grad_inplace == other.grad_inplace) and \
                  (self.inplace == other.inplace) and \
-                  (self.taps == taps)
+                  (self.taps == other.taps)
        return rval

    def __hash__(self):
@@ -390,61 +383,43 @@ class ScanGrad(theano.Op):
               hash(self.grad_fn) ^ \
               hash(self.n_ins) ^ \
               hash(self.n_outs) ^ \
-               hash(self.grad_inplace) ^ \
               hash(self.inplace) ^ taps_hash

    def make_node(self, *args):
        # input of the gradient op : 
-        # |g_outs | y      | ins   | outs   | other_args |
-        # | n_ins | n_outs | n_ins | n_outs | unknown    |
+        # | g_outs | y      | ins   | outs   | other_args |
+        # | n_outs | n_outs | n_ins | n_outs | unknown    |
        # return 
        # | grad of ins | grad of outs | grad of other_args|
        # |   n_ins     |  n_outs      |  unknown          |
        return theano.Apply(self, list(args),
-                    [i.type() for i in args[self.n_ins+self.n_outs:] ])
+                    [i.type() for i in args[self.n_outs+self.n_outs:] ])

    def perform(self, node, args, storage):
            # get scan inputs
-            inputs = args[self.n_ins+self.n_outs:]
+            inputs = args[self.n_outs+self.n_outs:]
            ins = inputs[:self.n_ins]
            initSt = inputs[self.n_ins:self.n_ins+self.n_outs]
            otherArgs = inputs[self.n_outs+self.n_ins:]
            
            # generate space for gradient 
            # not do if inplace !?
-            if not self.inplace:
-                g_ins   = [numpy.zeros_like(k) for k in ins]
-                g_initSt = [numpy.zeros_like(k) for k in initSt]
-            else:
-                if self.grad_inplace > self.n_ins:
-                    g_ins = ins
-                    g_initSt = initSt[:self.grad_inplace-self.n_ins]
-                    g_initSt += [numpy.zeros_like(k) for k in \
-                                initSt[self.grad_inplace-self.n_ins:]]
-                else:
-                    g_ins = ins[:self.grad_inplace]
-                    g_ins += [numpy.zeros_like(k) for k in \
-                              ins[self.grad_inplace:]]
-                    g_initSt = [numpy.zeros_like(k) for k in initSt]
-
+            g_ins   = [numpy.zeros_like(k) for k in ins]
+            g_initSt = [numpy.zeros_like(k) for k in initSt]
            g_otherArgs = [numpy.zeros_like(k) for k in otherArgs]
-            
            # get gradient from above
-            g_outs = args[:self.n_ins]
+            g_outs = args[:self.n_outs]
            # we modify g_outs inplace ..
            if not self.inplace:
                g_outs = [gout.copy() for gout in g_outs]

-
            # get the output of the scan operation
-            outs = args[self.n_ins:self.n_ins+self.n_outs]
+            outs = args[self.n_outs:2*self.n_outs]

-            # diagnostic:
-            print 'g_outs:' ,g_outs
-            print 'outs:', outs
-            print 'ins:', ins
-            print 'initSt:', initSt
-            print 'otherArgs:', otherArgs
+            # check for Nones (non - differentiable )
+            #for i,g_o in enumerate(g_outs):
+            #    if numpy.all(g_o == 0.):
+            #        g_outs[i] = numpy.zeros_like(outs[i])

            # go back through time to 0 (use a time window !?)
            for i in xrange(len(ins[0])-1,-1,-1):
@@ -464,8 +439,9 @@ class ScanGrad(theano.Op):
                        _outs += [outs[j][i- tap_value]]

              g_out = [arg[i] for arg in g_outs]
-              grads=self.grad_fn(g_out,_ins,_outs,otherArgs)
-
+              grad_args = g_out + _ins + _outs + otherArgs
+              grads=self.grad_fn(*grad_args)
+ 
              # get gradient for inputs 
              for j in xrange(self.n_ins):
                g_ins[j][i] = grads[j]
@@ -479,14 +455,13 @@ class ScanGrad(theano.Op):
                maxVal = max(ls_taps)
                for tap_value in ls_taps:
                    if i - tap_value < 0:
-                        g_initSt[maxVal-tap_value+i] = grads[pos]
+                        g_initSt[j][maxVal-tap_value+i] += grads[pos]
                        pos +=1
                    else:
-                       g_outs[i-tap_value]+= grads[pos]
+                       g_outs[j][i-tap_value]+= grads[pos]
                       pos += 1
              for j in xrange(len(g_otherArgs)):
                g_otherArgs[j] += grads[j+pos]
-            
            # return the gradient 
            for i in xrange(len(g_ins)):
                storage[i][0] = g_ins[i] 
@@ -497,17 +472,17 @@ class ScanGrad(theano.Op):
            for i in xrange(len(g_otherArgs)):
                storage[i+self.n_ins+self.n_outs][0] = g_otherArgs[i]

-'''
+
 @gof.local_optimizer([None])
 def grad_scan_make_inplace(node):
    op = node.op
    if isinstance(op, ScanGrad) and (not op.inplace):
-        return ScanGrad(op.grad_fn, op.n_ins, op.n_outs, op.grad_inplace, 
+        return ScanGrad(op.grad_fn, op.n_ins, op.n_outs, op.taps,  # 4th positional parameter of ScanGrad is taps
                   inplace=True).make_node(*node.inputs).outputs
    return False

 optdb.register('grad_scan_make_inplace', opt.in2out(grad_scan_make_inplace,\
               ignore_newtrees=True), 75, 'fast_run', 'inplace')

-'''
+

--- a/theano/sandbox/test_scan.py
+++ b/theano/sandbox/test_scan.py
@@ -7,6 +7,74 @@ import random
 import numpy.random
 from theano.tests  import unittest_tools as utt

+
+
+def verify_grad(op, pt, n_tests=2, rng=None, eps = None, tol = None,
+                mode = None, cast_to_output_type = False):
+    """Check the symbolic gradient of `op` against a numeric estimate.
+
+    Every output of `op` is multiplied by its own random projection and the
+    sums are added into one scalar cost, so multi-output ops can be checked.
+
+    :param op: callable building the symbolic output(s) from the inputs
+    :param pt: list of input values at which the gradient is checked
+    :param n_tests: how many times the check runs (fresh projections each)
+    :param rng: source of `rand`; None -> numpy.random plus utt.seed_rng()
+    :param eps: step forwarded to theano.tensor.numeric_grad
+    :param tol: largest accepted error; default taken from the input dtypes
+    :param mode: compilation mode forwarded to theano.function when set
+    :param cast_to_output_type: cast projections/g_cost to output/cost dtype
+    :raises Exception: if the maximal gradient error exceeds `tol`
+    """
+    pt = [numpy.array(p) for p in pt]
+    _type_tol = dict( float32=1e-2, float64=1e-4)
+    if tol is None:
+        tol = max(_type_tol[str(p.dtype)] for p in pt)
+    if rng is None:
+        rng = numpy.random
+        utt.seed_rng()
+
+    def function(inputs, outputs):
+        if mode is None:
+            return theano.function(inputs, outputs, accept_inplace=True)
+        return theano.function(inputs, outputs, accept_inplace=True, mode=mode)
+
+    for test_num in xrange(n_tests):  # the whole check repeats n_tests times
+        tensor_pt = [theano.tensor.value(p.copy(), name='input %i' % i)
+                     for i, p in enumerate(pt)]
+        o_outputs = op(*tensor_pt)
+        if not isinstance(o_outputs, (list, tuple)):
+            o_outputs = [o_outputs]
+        o_fn_outs = function(tensor_pt, o_outputs)(*[p.copy() for p in pt])
+        if not isinstance(o_fn_outs, (list, tuple)):
+            o_fn_outs = [o_fn_outs]
+        # Pair each symbolic output with the numeric value of the same index.
+        terms = []
+        for o_sym, o_val in zip(o_outputs, o_fn_outs):
+            proj = rng.rand(*o_val.shape)
+            if cast_to_output_type:
+                proj = numpy.array(proj, dtype=o_val.dtype)
+            t_r = theano.tensor.as_tensor_variable(proj)
+            terms.append(theano.tensor.sum(t_r * o_sym))
+        cost = sum(terms[1:], terms[0])
+        cost_fn = function(tensor_pt, cost)
+        num_grad = theano.tensor.numeric_grad(cost_fn, [p.copy() for p in pt],
+                                              eps)
+        g_cost = theano.tensor.as_tensor_variable(1.0, name='g_cost')
+        if cast_to_output_type:
+            g_cost = theano.tensor.cast(g_cost, cost.dtype)
+        symbolic_grad = theano.tensor.grad(cost, tensor_pt, g_cost)
+        grad_fn = function(tensor_pt, symbolic_grad)
+        analytic_grad = grad_fn(*[p.copy() for p in pt])
+        if not isinstance(analytic_grad, (list, tuple)):
+            analytic_grad = [analytic_grad]
+        max_err, max_err_pos = num_grad.max_err(analytic_grad)
+        if max_err > tol:
+            raise Exception(theano.tensor.verify_grad.E_grad,
+                            (max_err, tol, max_err_pos))
+
+
+
 class T_Scan(unittest.TestCase):
    def setUp(self):
        utt.seed_rng()
@@ -389,30 +457,73 @@ class T_Scan(unittest.TestCase):
        out_1    = final_f(u_1,x_1)
        self.failUnless(numpy.all(out_1==numpy.asarray([2.,3.,4.])))

-    #####################################################################
-    def test_gradSimple(self):
+    ######################################################################
+    def test_gradOneInputOneOutput(self):
         u_1      = theano.tensor.dscalar('u_1')
         x_1      = theano.tensor.dscalar('x_1')
         x_1_next = u_1*x_1
         my_op    = Scan.symbolic( ([u_1,x_1],x_1_next), 1,1)
-        u_1      = theano.tensor.dvector('u_1')
-        x_1      = theano.tensor.dvector('x_1')
-        x_1_next = my_op(u_1,x_1)
-        #final_f  = theano.function([u_1,x_1],[x_1_next])
-        
         u_1     = [1.,2.,3.]
         x_1     = [1.]

-        utt.verify_grad( my_op , [u_1,x_1] )
+        verify_grad( my_op , [u_1,x_1] )  # verify_grad helper defined in this test module, not utt's

+
+    #######################################################################
     def test_gradManyInputsManyOutputs(self):
-        pass
+        u_1      = theano.tensor.dscalar('u_1')
+        u_2      = theano.tensor.dscalar('u_2')
+        x_1      = theano.tensor.dscalar('x_1')
+        x_2      = theano.tensor.dscalar('x_2')
+        x_1_next = x_1*u_1+x_2  # each next state also depends on the other state
+        x_2_next = x_2*u_2+x_1
+        my_op    = Scan.symbolic( ([u_1,u_2,x_1,x_2],
+                                   [x_1_next,x_2_next]),
+                          2,2)  # n_ins=2, n_outs=2
+        u_1  = [1.,.2,3.]
+        u_2  = [1.5,1.25,.35]
+        x_1  = [.5]
+        x_2  = [.65]

+        verify_grad(my_op, [u_1,u_2,x_1,x_2])  # numeric vs. symbolic gradient
+
+
+    ######################################################################
     def test_gradTimeTaps(self):
-        pass
+        u_1       = theano.tensor.dscalar('u_1')
+        x_1       = theano.tensor.dscalar('x_1')
+        x_1_t_2   = theano.tensor.dscalar('x_1_t_2')  # presumably output 0 two steps back -- TODO confirm

+        x_1_next = x_1_t_2*x_1*u_1
+        my_op    = Scan.symbolic( ([u_1,x_1,x_1_t_2],
+                                   [x_1_next]),
+                        1,1,taps={0:[2]})  # n_ins=1, n_outs=1; output 0 declares tap 2
+        u_1 = [1.,2.,3.,4.]
+        x_1 = [2.,3.]  # initial state holds two values
+
+        verify_grad(my_op, [u_1,x_1])
+
+    #######################################################################
     def test_gradManyInputsManyOutputsTimeTaps(self):
-        pass
+        u_1   = theano.tensor.dscalar('u_1')
+        u_2   = theano.tensor.dscalar('u_2')
+        x_1   = theano.tensor.dscalar('x_1')
+        x_1_2 = theano.tensor.dscalar('x_1_2')
+        x_2   = theano.tensor.dscalar('x_2')
+        x_2_2 = theano.tensor.dscalar('x_2_2')
+        x_1_n = x_1*x_2_2 + u_1*x_1_2  # each update mixes both states
+        x_2_n = x_2*x_1_2 + u_2*x_2_2
+        my_op = Scan.symbolic(([u_1,u_2,x_1,x_1_2,
+                                x_2,x_2_2],[x_1_n,
+                                x_2_n]),2,2,taps=
+                                {0:[2],1:[2]})  # n_ins=2, n_outs=2; both outputs declare tap 2

+        u_1 = [1.,2.,3.,4.]
+        u_2 = [3.,2.,4.,1.]
+        x_1 = [0.1,0.2]  # each initial state holds two values
+        x_2 = [1.5,3.5]

+        verify_grad(my_op, [u_1,u_2,x_1,x_2])

 if __name__ == '__main__':
    unittest.main()