new optimization for scan + new feature for DimShuffle

31904be5 · Razvan Pascanu · 3903e59f · 31904be5 · 31904be5 · 31904be5
--- a/theano/scan.py
+++ b/theano/scan.py
--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -43,6 +43,10 @@ class DimShuffle(Op):
    dimension and a numerical index represents the dimension of the same
    rank in the tensor passed to perform.
+    Note (2.04.2010, RP): added 'f', which inserts a new dimension exactly
+    like 'x' does, except that the inserted dimension is marked as
+    non-broadcastable.
    Examples:
      DimShuffle((False, False, False), ['x', 2, 'x', 0, 1])
@@ -120,10 +124,10 @@ class DimShuffle(Op):
        # transposition of non-broadcastable dimensions
        # This is how the dimensions will be permuted, without accounting for the extra
        # 'x' broadcastable dimensions to insert.
-        self.shuffle = [i2j[x] for x in new_order if x != 'x']
+        self.shuffle = [i2j[x] for x in new_order if x != 'x' and x != 'f']
        # list of dimensions of the output that are broadcastable and were not in the original input
-        self.augment = [i for i, x in enumerate(new_order) if x == 'x']
+        self.augment = [i for i, x in enumerate(new_order) if x == 'x' or x == 'f']
        if self.inplace:
            self.view_map = {0: [0]}
@@ -147,6 +151,8 @@ class DimShuffle(Op):
        for value in self.new_order:
            if value == 'x':
                ob.append(True)
+            elif value == 'f':
+                ob.append(False)
            else:
                ob.append(ib[value])
@@ -235,7 +241,7 @@ class DimShuffle(Op):
        shape_statements = ['npy_intp dimensions[%i]'%nd_out]
        for i, o in enumerate(self.new_order):
-          if o != 'x':
+          if o != 'x' and o != 'f':
            shape_statements += [('dimensions['+str(i)+'] = %(basename)s->dimensions['+str(o)+']')]
          else:
            shape_statements += [('dimensions['+str(i)+'] = 1')]
@@ -250,7 +256,7 @@ class DimShuffle(Op):
        #set the strides of the non-broadcasted dimensions
        for i, o in enumerate(self.new_order):
-          if o != 'x':
+          if o != 'x' and o != 'f':
             strides_statements += [('strides['+str(i)+'] = %(basename)s->strides['+str(o)+']')]
          else:
             strides_statements += [('strides['+str(i)+'] = 0')]
@@ -317,7 +323,7 @@ class DimShuffle(Op):
        gz = as_tensor_variable(gz)
        grad_order = ['x'] * len(x.type.broadcastable)
        for i, v in enumerate(self.new_order):
-            if v != 'x':
+            if v != 'x' and v !='f':
                grad_order[v] = i
        return [DimShuffle(gz.type.broadcastable, grad_order, inplace=True)(Elemwise(scalar.identity)(gz))]

--- a/theano/tests/test_scan.py
+++ b/theano/tests/test_scan.py
@@ -125,7 +125,7 @@ class T_Scan(unittest.TestCase):
        W_in = theano.tensor.dscalar()
        W    = theano.tensor.dscalar()
-        output, updates = theano.scan(f_rnn, u,x0,[W_in,W], n_steps = 0, truncate_gradient =
+        output, updates = theano.scan(f_rnn, u,x0,[W_in,W], n_steps = None, truncate_gradient =
                -1, go_backwards = False)
        f2   = theano.function([u,x0,W_in,W], output, updates = updates)
@@ -146,7 +146,6 @@ class T_Scan(unittest.TestCase):
        assert numpy.allclose(theano_values, v_out)
    # simple rnn, one input, one state, weights for each; input/state
    # are vectors, weights are scalars; using shared variables
    def test_one_sequence_one_output_weights_shared(self):
@@ -159,7 +158,7 @@ class T_Scan(unittest.TestCase):
        def f_rnn_shared(u_t,x_tm1, tmp_W_in, tmp_W):
            return u_t*tmp_W_in+x_tm1*tmp_W
-        output, updates = theano.scan(f_rnn_shared, u,x0,[W_in, W], n_steps =0,
+        output, updates = theano.scan(f_rnn_shared, u,x0,[W_in, W], n_steps =None,
                truncate_gradient= -1, go_backwards = False)
        f3    = theano.function([u,x0], output, updates = updates)
        # get random initial values
@@ -176,7 +175,6 @@ class T_Scan(unittest.TestCase):
        assert  numpy.allclose(theano_values, v_out)
    # some rnn with multiple outputs and multiple inputs; other
    # dimension instead of scalars/vectors
    def test_multiple_inputs_multiple_outputs(self):
@@ -203,7 +201,7 @@ class T_Scan(unittest.TestCase):
            return [theano.dot(u1_t,W_in1) + u2_t* W_in2 + \
                    theano.dot(x_tm1, W), theano.dot(x_tm1, W_out)]
-        outputs, updates = theano.scan(f_rnn_cmpl,[u1,u2],[x0,y0],W_in1, n_steps = 0,
+        outputs, updates = theano.scan(f_rnn_cmpl,[u1,u2],[x0,y0],W_in1, n_steps = None,
                truncate_gradient = -1, go_backwards = False)
        f4     = theano.function([u1,u2,x0,y0,W_in1], outputs, updates = updates)
@@ -222,7 +220,6 @@ class T_Scan(unittest.TestCase):
        assert numpy.allclose(theano_y , v_y)
    # simple rnn, one input, one state, weights for each; input/state are 
    # vectors, weights are scalars; using shared variables and past 
    # taps (sequences and outputs)
@@ -242,7 +239,7 @@ class T_Scan(unittest.TestCase):
            return u_tm2*W_in+x_tm1*W+x_tm2
        outputs, updates = theano.scan(f_rnn_shared, dict(input=u, taps=-2), 
-                dict(initial = x0, taps = [-1,-2]), [], n_steps = 0, truncate_gradient = -1, 
+                dict(initial = x0, taps = [-1,-2]), [], n_steps = None, truncate_gradient = -1, 
                go_backwards = False)
        f7   = theano.function([u,x0], outputs, updates = updates)
@@ -282,7 +279,7 @@ class T_Scan(unittest.TestCase):
            return (u_tm2+u_tp2)*W_in+x_tm1*W+x_tm2
        output,updates = theano.scan(f_rnn_shared, dict( input = u, taps=[-2,2]),\
-                dict(initial = x0, taps = [-1,-2]), [], n_steps =0, truncate_gradient =-1,
+                dict(initial = x0, taps = [-1,-2]), [], n_steps = None, truncate_gradient =-1,
                go_backwards = False)
        f8   = theano.function([u,x0], output, updates = updates)
@@ -324,7 +321,7 @@ class T_Scan(unittest.TestCase):
        outputs, updates = theano.scan(f_rnn_shared, [u0,u1,u2], 
                [dict( initial = x0, inplace =u2), dict(initial = x1, inplace = u1)],
-                [], n_steps = 0, truncate_gradient = -1, go_backwards = False, mode=mode )
+                [], n_steps = None, truncate_gradient = -1, go_backwards = False, mode=mode )
        f9   = theano.function([mu0,mu1,mu2,x0,x1], outputs , updates = updates, mode = mode)
       # compute output in numpy
@@ -374,7 +371,7 @@ class T_Scan(unittest.TestCase):
        outputs, updates = theano.scan(f_rnn_shared, 
                [u0,dict(input = u1, taps = [0,1]),dict( input = u2, taps= [-1,0,+1])], 
                [dict( initial = x0, inplace =u2), dict(initial = x1, inplace = u1)],
-                [], n_steps = 0, truncate_gradient = 01, go_backwards = False, mode=mode )
+                [], n_steps = None, truncate_gradient = 01, go_backwards = False, mode=mode )
        f9   = theano.function([mu0,mu1,mu2,x0,x1], outputs , updates = updates, mode = mode)
       # compute output in numpy
@@ -429,7 +426,7 @@ class T_Scan(unittest.TestCase):
        y0 = theano.tensor.matrix('y0')
        outputs,updates = theano.scan(f, [u1,u2], [ dict(initial = y0, taps = [-3,-2,-1]),y1,
-            None], [], n_steps = 0, go_backwards = False, truncate_gradient = -1)
+            None], [], n_steps = None, go_backwards = False, truncate_gradient = -1)
        f10 = theano.function([u2,y0], outputs, updates = updates)
        theano_y0,theano_y1,theano_y2 = f10(vu2, vy0)
@@ -545,7 +542,7 @@ class T_Scan(unittest.TestCase):
        u    = theano.tensor.dvector()
-        outputs, updates = theano.scan(f_rnn, u,[],[], n_steps =0 , truncate_gradient = -1,
+        outputs, updates = theano.scan(f_rnn, u,[],[], n_steps =None , truncate_gradient = -1,
                go_backwards = False)
        f2    = theano.function([u], outputs, updates = updates)
@@ -578,7 +575,7 @@ class T_Scan(unittest.TestCase):
        W_in = theano.tensor.dscalar()
        W    = theano.tensor.dscalar()
-        output, updates = theano.scan(f_rnn, u,x0,[W_in,W], n_steps = 0, truncate_gradient =
+        output, updates = theano.scan(f_rnn, u,x0,[W_in,W], n_steps = None, truncate_gradient =
                -1, go_backwards = True)
        f2   = theano.function([u,x0,W_in,W], output, updates = updates)
@@ -607,9 +604,7 @@ class T_Scan(unittest.TestCase):
        rng = numpy.random.RandomState(utt.fetch_seed())
        v_v = rng.uniform( size = (5,), low = -5., high = 5.)
        print f(v_v,0.)
-        assert ( numpy.sum(v_v) == f(v_v, 0.) ) 
+        assert abs(numpy.sum(v_v) - f(v_v, 0.)) < 1e-3