提交 1ddb1eda authored 作者: Razvan Pascanu's avatar Razvan Pascanu

Merge pull request #203 from nouiz/max_and_argmax_grad

Implement the grad for inner dimensions in MaxAndArgMax. The pull request is fine. There is one question about the gradient of max that I'm not sure how to tackle, but the changes introduced in this code do not change the behaviour of Theano in this respect. The problem is: how should the grad of max look if there are several instances of the max value? Should all of them get the full gradient (as happens now), or should only one of them receive the gradient?
......@@ -1941,32 +1941,36 @@ class MaxAndArgmax(Op):
return [eval_points[0][arange(eval_points[0].shape[0]),
max_pos], None]
def grad(self, inp, grads):
# @warning: This only works if axis is 0, else the max is
# broadcasted wrong in the call to eq.
# @note: This function should work correctly for L{vector}s.
# (x, y), (gz, gw)
# gz*dz/dx + gw*dw/dx, gz*dz/dy + gw*dw/dy
# gMax * dMax/dx + gArgMax * dArgMax/dx, gMax * dMax/daxis + gArgMax * dArgMax/daxis
# g_max has one less dimension than x, so you need to complete g_max to x's shape
# when axis=0 the broadcasting mechanism does it automatically
# gMax * dMax/dx + gArgMax * dArgMax/dx,
# gMax * dMax/daxis + gArgMax * dArgMax/daxis
# g_max has one less dimension than x, so you need to complete
# g_max to x's shape when axis=0 the broadcasting mechanism
# does it automatically
x, axis = inp
g_max, g_max_idx = grads
if not ( axis.data == 0 or axis.data == x.ndim-1):
raise NotImplementedError('MaxAndArgmax gradient with axis corresponding to internal dimension')
if axis.data==0:
g_max_pad = shape_padleft(g_max)
else:
g_max_pad = shape_padright(g_max)
xmax = max(x, axis)
if axis.data==0:
xmax_pad = shape_padleft(xmax)
else:
xmax_pad = shape_padright(xmax)
# Raise the g_max and xmax to the same number of dim as the input.
pattern = []
out_dim = 0
for i in range(inp[0].ndim):
if i == axis.data:
pattern.append('x')
else:
pattern.append(out_dim)
out_dim += 1
g_max_pad = DimShuffle(g_max.broadcastable, pattern)(g_max)
xmax_pad = DimShuffle(xmax.broadcastable, pattern)(xmax)
# Set the grad to the correct position.
g_x = eq(xmax_pad, x) * g_max_pad
return g_x, None
def __str__(self):
return self.__class__.__name__
......
......@@ -1561,13 +1561,16 @@ class T_max_and_argmax(unittest.TestCase):
n = as_tensor_variable(data)
def check_grad_max(data, max_grad_data, axis=None):
"""
Why is this needed? Isn't verify_grad enough?
"""
#This work only for axis in [0,None]
assert axis in [0,None]
z = numpy.zeros_like(data)
z = z.flatten()
argmax=numpy.argmax(data,axis=axis)
if argmax.ndim==0:
z[numpy.argmax(data,axis=axis)]+=1
z[argmax]+=1
else:
for id,v in enumerate(argmax):
z[v*numpy.prod(data.shape[data.ndim-1:axis:-1])+id]+=1
......@@ -1592,6 +1595,14 @@ class T_max_and_argmax(unittest.TestCase):
utt.verify_grad(lambda v: max_and_argmax(v.flatten())[1], [data])
check_grad_max(data,eval_outputs(grad(max_and_argmax(n.flatten())[0],n)))
# Test 4d inner dimensions
data = numpy.random.rand(2, 3, 4, 5)
n = as_tensor_variable(data)
for i in [0, 1, 2, 3]:
utt.verify_grad(lambda v: max_and_argmax(v, axis=[i])[0], [data])
utt.verify_grad(lambda v: max_and_argmax(v, axis=[i])[1], [data])
class T_argmin_argmax(unittest.TestCase):
def setUp(self):
utt.seed_rng()
......
......@@ -3,7 +3,12 @@
Tests for the R operator / L operator
For the list of ops with the R op defined, with or without a missing test,
see this file: doc/library/tensor/basic.txt
For functions to automatically test your Rop implementation, look at
the docstrings of the functions: check_mat_rop_lop, check_rop_lop,
and check_nondiff_rop.
"""
......@@ -41,7 +46,9 @@ class BreakRop(Op):
break_op = BreakRop()
class test_RopLop(unittest.TestCase):
class RopLop_checker(unittest.TestCase):
""" Don't perform any tests, but provide the functions to test the
Rop to classes that inherit from it."""
def setUp(self):
# Using vectors make things a lot simpler for generating the same
......@@ -56,6 +63,8 @@ class test_RopLop(unittest.TestCase):
5+self.rng.randint(30))
def check_nondiff_rop(self, y):
""" If your op is not differentiable (so you can't define Rop),
test that an error is raised."""
raised = False
try:
tmp = tensor.Rop(y, self.x, self.v)
......@@ -67,6 +76,24 @@ class test_RopLop(unittest.TestCase):
' is not differentiable'))
def check_mat_rop_lop(self, y, out_shape):
""" Test the Rop/Lop when the input is a matrix and the output is a vector
:param y: the output variable of the op applied to self.mx
:param out_shape: Used to generate a random tensor
corresponding to the evaluation point of the Rop
(i.e. the tensor with which you multiply the
Jacobian). It should be a tuple of ints.
If the Op has more than 1 input, one of them must be mx; the
others must be shared variables/constants. We will test only
against the input self.mx, so you must call
check_mat_rop_lop/check_rop_lop for the other inputs.
We expect all inputs/outputs have dtype floatX.
If you want to test an out with an output matrix, add a sum
after the Op you want to test.
"""
vx = numpy.asarray(self.rng.uniform(size=self.mat_in_shape), theano.config.floatX)
vv = numpy.asarray(self.rng.uniform(size=self.mat_in_shape), theano.config.floatX)
yv = tensor.Rop(y, self.mx, self.mv)
......@@ -97,9 +124,12 @@ class test_RopLop(unittest.TestCase):
v2 = scan_f(vx,vv)
assert numpy.allclose(v1,v2), ('LOP mismatch: %s %s' % (v1, v2))
def check_rop_lop(self, y, out_shape):
"""
As check_mat_rop_lop, except the input is self.x, which is a
vector. The output is still a vector.
"""
# TEST ROP
vx = numpy.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX)
vv = numpy.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX)
......@@ -138,6 +168,7 @@ class test_RopLop(unittest.TestCase):
assert numpy.allclose(v1,v2), ('LOP mismatch: %s %s' % (v1, v2))
class test_RopLop(RopLop_checker):
def test_shape(self):
self.check_nondiff_rop( self.x.shape[0])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论