提交 28a095d6 authored 作者: nouiz's avatar nouiz

Merge pull request #730 from larseeri/shape_tensor_nnet

Better infer_shape test for Softmax, SoftmaxWithBias, SoftmaxGrad, CrossentropySoftmaxArgmax1HotWithBias, ConvOp, Conv3D, ConvTransp3D, ConvGrad3D Added and tested infer_shape for CrossentropySoftmax1HotWithBiasDx,Prepend_scalar_constant_to_each_row and Prepend_scalar_to_each_row, CrossentropyCategorical1HotGrad Added disabled CrossentropyCategorical1Hot.infer_shape, see gh-788
......@@ -857,6 +857,9 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
dx[i, y_idx[i]] -= dy[i] # scalar decrement
output_storage[0][0] = dx
def infer_shape(self, node, shapes):
return [shapes[1]]
def grad(self, inp, grads):
dy, sm, y_idx = inp
g_dx, = grads
......@@ -1031,8 +1034,11 @@ class CrossentropyCategorical1HotGrad(gof.Op):
for i in xrange(len(g_y)):
g_coding[i, true_one_of_n[i]] = -g_y[i] / coding_dist[i,
true_one_of_n[i]]
g_coding_strg[0] = g_coding
def infer_shape(self, node, in_shapes):
return [in_shapes[1]]
crossentropy_categorical_1hot_grad = CrossentropyCategorical1HotGrad()
......@@ -1091,6 +1097,17 @@ class CrossentropyCategorical1Hot(gof.Op):
y[i] = -numpy.log(coding[i, one_of_n[i]])
y_out[0] = y
#Enabling this infer_shape method make 2 tests fail:
#theano/tensor/nnet/tests/test_nnet.py:T_CrossentropyCategorical1Hot.
# {test_softmax_grad_optimizations,test_softmax_grad_optimizations_vector}
# This is caused by the local_fill_to_alloc that call broadcast_like
# that look into the shape feature and return a Rebroadcast instead of an alloc.
# I disable this infer_shape until we fix the optimizations or determine that
# this is not needed anymore and we update the tests.
# see issue gh-788
# def infer_shape(self, node, in_shapes):
# return [(in_shapes[0][0],)]
def grad(self, inp, grads):
coding, one_of_n = inp
g_y, = grads
......@@ -1121,7 +1138,7 @@ def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph):
new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(x, b,
one_of_n)
fgraph.replace_all_validate([(nll, new_nll), (sm, new_sm)],
reason="crossentropy_to_crossentropy_with_softmax")
reason="crossentropy_to_crossentropy_with_softmax_with_bias")
return True
return False
......@@ -1645,6 +1662,11 @@ class Prepend_scalar_constant_to_each_row(gof.Op):
out[:, 0].fill(self.val.data)
out[:, 1:] = mat
def infer_shape(self, node, in_shapes):
shp = (in_shapes[0][0], in_shapes[0][1] + 1)
return [shp]
def grad(self, inp, grads):
mat, = inp
goutput, = grads
......@@ -1694,6 +1716,10 @@ class Prepend_scalar_to_each_row(gof.Op):
out[:, 0].fill(val)
out[:, 1:] = mat
def infer_shape(self, node, in_shapes):
shp = (in_shapes[1][0], in_shapes[1][1] + 1)
return [shp]
def grad(self, inp, grads):
val, mat = inp
goutput, = grads
......
......@@ -13,10 +13,10 @@ from theano.tensor.nnet import conv
from theano.tensor.basic import _allclose
class TestConv2D(unittest.TestCase):
class TestConv2D(utt.InferShapeTester):
def setUp(self):
utt.seed_rng()
super (TestConv2D, self).setUp()
self.input = T.dtensor4('input')
self.filters = T.dtensor4('filters')
......@@ -368,8 +368,7 @@ class TestConv2D(unittest.TestCase):
gcc bug. So it should not crash anymore
"""
self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid',
verify_grad=False)
self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid', verify_grad=False)
verify_grad=False)
def speed(self):
n_calls = 20000
......@@ -407,3 +406,100 @@ class TestConv2D(unittest.TestCase):
t2 = time.time()
print t2 - t1,
print
def test_infer_shape(self):
# Note: infer_shape is incomplete and thus input and filter shapes
# must be provided explicitly
def rand(*shape):
r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
return r * 2 - 1
adtens = T.dtensor4()
bdtens = T.dtensor4()
aivec_val = [2, 2, 3, 3]
bivec_val = [2, 2, 2, 2]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
self._compile_and_check([adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val,
border_mode='valid')], [adtens_val, bdtens_val], conv.ConvOp)
aivec_val = [2, 2, 3, 3]
bivec_val = [2, 2, 2, 2]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
self._compile_and_check([adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val,
border_mode='full')], [adtens_val, bdtens_val], conv.ConvOp)
aivec_val = [3, 2, 8, 8]
bivec_val = [4, 2, 5, 5]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
self._compile_and_check([adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val,
border_mode='valid')], [adtens_val, bdtens_val], conv.ConvOp)
aivec_val = [3, 2, 8, 8]
bivec_val = [4, 2, 5, 5]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
self._compile_and_check([adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val,
border_mode='full')], [adtens_val, bdtens_val], conv.ConvOp)
aivec_val = [3, 2, 7, 5]
bivec_val = [5, 2, 3, 2]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
self._compile_and_check([adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val,
border_mode='valid')], [adtens_val, bdtens_val], conv.ConvOp)
aivec_val = [3, 2, 7, 5]
bivec_val = [5, 2, 3, 2]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
self._compile_and_check([adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val,
border_mode='full')], [adtens_val, bdtens_val], conv.ConvOp)
aivec_val = [3, 2, 7, 5]
bivec_val = [5, 2, 2, 3]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
self._compile_and_check([adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val,
border_mode='valid')], [adtens_val, bdtens_val], conv.ConvOp)
aivec_val = [3, 2, 7, 5]
bivec_val = [5, 2, 2, 3]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
self._compile_and_check([adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val,
border_mode='full')], [adtens_val, bdtens_val], conv.ConvOp)
aivec_val = [3, 2, 3, 3]
bivec_val = [4, 2, 3, 3]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
self._compile_and_check([adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val,
border_mode='valid')], [adtens_val, bdtens_val], conv.ConvOp)
aivec_val = [3, 2, 3, 3]
bivec_val = [4, 2, 3, 3]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
self._compile_and_check([adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val,
border_mode='full')], [adtens_val, bdtens_val], conv.ConvOp)
if __name__ == '__main__':
t = TestConv2D('setUp')
t.setUp()
t.test_infer_shape()
......@@ -3,9 +3,9 @@ import theano
import theano.tensor as T
from theano import function, shared
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.ConvTransp3D import convTransp3D
from theano.tensor.nnet.ConvGrad3D import convGrad3D
from theano.tensor.nnet.Conv3D import conv3D
from theano.tensor.nnet.ConvTransp3D import convTransp3D, ConvTransp3D
from theano.tensor.nnet.ConvGrad3D import convGrad3D, ConvGrad3D
from theano.tensor.nnet.Conv3D import conv3D, Conv3D
import numpy as N
import copy
import theano.sparse
......@@ -20,7 +20,9 @@ floatX = theano.config.floatX
# a subset of the tests they will do different things than if you
# run all of them
class DummyConv3D:
"""A dummy version of Conv3D passed to verify_grad
Stores a fixed stride, since stride is not differentiable
Exposes only one scalar argument, which is used as the position
......@@ -30,149 +32,174 @@ class DummyConv3D:
verify_grad will not need to test hundreds of variables. Disadvantage
is we can't be certain that all of them are correct, advantange is that
this random projection lets us test lots of variables very quickly """
def __init__(self, rng, VWbVals, d):
"""
param: rng Random number generator used to pick direction of the line
param: rng Random number generator used to pick direction of the
line
param: VWbVals tuple containing values to test V,W,b around
param: d shared variable for d, the stride
"""
self.V, self.W, self.b = VWbVals
self.dV = shared(rng.uniform(-1,1,self.V.get_value(borrow=True).shape))
self.dW = shared(rng.uniform(-1,1,self.W.get_value(borrow=True).shape))
self.db = shared(rng.uniform(-1,1,self.b.get_value(borrow=True).shape))
self.dV = shared(rng.uniform(-1, 1,
self.V.get_value(borrow=True).shape))
self.dW = shared(rng.uniform(-1, 1,
self.W.get_value(borrow=True).shape))
self.db = shared(rng.uniform(-1, 1,
self.b.get_value(borrow=True).shape))
self.d = d
def __call__(self, t):
output = conv3D(self.V+t*self.dV,self.W+t*self.dW,self.b+t*self.db,self.d)
output = conv3D(self.V + t * self.dV, self.W + t * self.dW,
self.b + t * self.db, self.d)
return output
class DummyConvGrad3D:
def __init__(self, rng, VdHvals, d, WShape):
"""
param: rng Random number generator used to pick direction of the line
param: rng Random number generator used to pick direction of the
line
param: VWbVals tuple containing values to test V,W,b around
param: d shared variable for d, the stride
"""
self.V, self.dCdH = VdHvals
self.dV = shared(rng.uniform(-1,1,self.V.get_value(borrow=True).shape))
self.ddCdH = shared(rng.uniform(-1,1,self.dCdH.get_value(borrow=True).shape))
self.dV = shared(rng.uniform(-1, 1,
self.V.get_value(borrow=True).shape))
self.ddCdH = shared(rng.uniform(-1, 1,
self.dCdH.get_value(borrow=True).shape))
self.d = d
self.WShape = WShape
def __call__(self, t):
output = convGrad3D(self.V+t*self.dV,self.d,self.WShape,self.dCdH + t * self.ddCdH)
output = convGrad3D(self.V + t * self.dV, self.d, self.WShape,
self.dCdH + t * self.ddCdH)
return output
class DummyConvTransp3D:
def __init__(self, rng, WbHvals, d, RShape):
"""
param: rng Random number generator used to pick direction of the line
param: rng Random number generator used to pick direction of the
line
param: VWbVals tuple containing values to test V,W,b around
param: d shared variable for d, the stride
"""
self.W, self.b, self.H = WbHvals
self.dW = rng.uniform(-1,1,self.W.get_value(borrow=True).shape)
self.db = rng.uniform(-1,1,self.b.get_value(borrow=True).shape)
self.dH = rng.uniform(-1,1,self.H.get_value(borrow=True).shape)
self.dW, self.db, self.dH = shared(self.dW), shared(self.db), shared(self.dH)
self.dW = rng.uniform(-1, 1, self.W.get_value(borrow=True).shape)
self.db = rng.uniform(-1, 1, self.b.get_value(borrow=True).shape)
self.dH = rng.uniform(-1, 1, self.H.get_value(borrow=True).shape)
self.dW, self.db, self.dH = shared(self.dW), shared(self.db),
shared(self.dH)
self.d = d
self.RShape = RShape
def __call__(self, t):
output = convTransp3D(self.W+t*self.dW,self.b+t*self.db,self.d,self.H+t*self.dH, self.RShape)
output = convTransp3D(self.W + t * self.dW, self.b + t * self.db,
self.d, self.H + t * self.dH, self.RShape)
return output
class TestConv3D(unittest.TestCase):
def setUp(self):
class TestConv3D(utt.InferShapeTester):
def setUp(self):
super(TestConv3D, self).setUp()
utt.seed_rng()
self.rng = N.random.RandomState(utt.fetch_seed())
mode = copy.copy(theano.compile.mode.get_default_mode())
mode.check_py_code = False
self.W = shared(N.ndarray(shape=(1,1,1,1,1), dtype=floatX))
self.b = shared(N.zeros(1,dtype=floatX))
self.rb = shared(N.zeros(1,dtype=floatX))
self.V = shared(N.ndarray(shape=(1,1,1,1,1), dtype=floatX))
self.d = shared(N.ndarray(shape=(3,),dtype=int))
self.W = shared(N.ndarray(shape=(1, 1, 1, 1, 1), dtype=floatX))
self.b = shared(N.zeros(1, dtype=floatX))
self.rb = shared(N.zeros(1, dtype=floatX))
self.V = shared(N.ndarray(shape=(1, 1, 1, 1, 1), dtype=floatX))
self.d = shared(N.ndarray(shape=(3, ), dtype=int))
self.H = conv3D(self.V, self.W, self.b, self.d)
self.H_func = function([], self.H, mode = mode)
self.H_shape_func = function( [], self.H.shape, mode = mode)
self.H_func = function([], self.H, mode=mode)
self.H_shape_func = function([], self.H.shape, mode=mode)
self.RShape = T.vector(dtype='int64')
self.otherH = T.TensorType(floatX,(False,False,False,False,False))(name='otherH')
self.transp = convTransp3D(self.W, self.rb, self.d, self.otherH, self.RShape)
self.transp_func = function([self.otherH,self.RShape],self.transp, mode=mode)
self.otherH = T.TensorType(floatX,
(False, False, False, False, False))(name='otherH')
self.transp = convTransp3D(self.W, self.rb, self.d,
self.otherH, self.RShape)
self.transp_func = function([self.otherH, self.RShape],
self.transp, mode=mode)
self.R = convTransp3D(self.W, self.rb, self.d, self.H, self.RShape)
self.R_func = function([self.RShape], self.R, mode = mode)
self.R_func = function([self.RShape], self.R, mode=mode)
self.R_shape_func = function([self.RShape], self.R.shape)
self.reconsObj = T.sum(T.sqr(self.V-self.R))
self.reconsObj = T.sum(T.sqr(self.V - self.R))
self.reconsObjFunc = function([self.RShape], self.reconsObj, mode=mode)
self.gradientsFunc = function([self.RShape], [ T.grad(self.reconsObj, self.W), T.grad(self.reconsObj, self.H), T.grad(self.reconsObj, self.V), T.grad(self.reconsObj,self.b) ] , mode=mode)
self.check_c_against_python = function([self.RShape], [ T.grad(self.reconsObj, self.W), T.grad(self.reconsObj, self.H), T.grad(self.reconsObj, self.V), T.grad(self.reconsObj,self.b) ] , mode='DEBUG_MODE')
self.gradientsFunc = function([self.RShape],
[T.grad(self.reconsObj, self.W), T.grad(self.reconsObj,
self.H), T.grad(self.reconsObj, self.V),
T.grad(self.reconsObj, self.b)], mode=mode)
self.dCdW_shape_func = function([self.RShape], T.grad(self.reconsObj, self.W).shape, mode=mode)
self.check_c_against_python = function([self.RShape],
[T.grad(self.reconsObj, self.W), T.grad(self.reconsObj,
self.H), T.grad(self.reconsObj, self.V),
T.grad(self.reconsObj, self.b)], mode='DEBUG_MODE')
self.dCdW_shape_func = function([self.RShape],
T.grad(self.reconsObj, self.W).shape, mode=mode)
def random_tensor(self,*dims):
return N.asarray(self.rng.uniform(-.05,.05,dims),dtype=floatX)
def random_tensor(self, *dims):
return N.asarray(self.rng.uniform(-.05, .05, dims), dtype=floatX)
def randomize(self):
batchSize = self.rng.randint(1,4)
videoDur = self.rng.randint(8,30)
filterWidth = self.rng.randint(1,8)
filterHeight = self.rng.randint(1,8)
filterDur = self.rng.randint(1,8)
tsteps = self.rng.randint(1,4)
rsteps = self.rng.randint(1,4)
csteps = self.rng.randint(1,4)
videoDur = tsteps * filterDur + self.rng.randint(0,3)
videoWidth = csteps * filterWidth + self.rng.randint(0,3)
videoHeight = rsteps * filterHeight + self.rng.randint(0,3)
numFilters = self.rng.randint(1,3)
inputChannels = self.rng.randint(1,3)
self.d.get_value(borrow=True, return_internal_type=True)[0] = self.rng.randint(1,15)
self.d.get_value(borrow=True, return_internal_type=True)[1] = self.rng.randint(1,15)
self.d.get_value(borrow=True, return_internal_type=True)[2] = self.rng.randint(1,15)
outputHeight = int( (videoHeight - filterHeight) / self.d.get_value(borrow=True)[0] )+1
outputWidth = int( (videoWidth - filterWidth) / self.d.get_value(borrow=True)[1] )+1
outputDur = int( (videoDur - filterDur) / self.d.get_value(borrow=True)[2] ) +1
self.W.set_value(
self.random_tensor(numFilters,filterHeight,filterWidth,filterDur,inputChannels),
borrow=True)
batchSize = self.rng.randint(1, 4)
videoDur = self.rng.randint(8, 30)
filterWidth = self.rng.randint(1, 8)
filterHeight = self.rng.randint(1, 8)
filterDur = self.rng.randint(1, 8)
tsteps = self.rng.randint(1, 4)
rsteps = self.rng.randint(1, 4)
csteps = self.rng.randint(1, 4)
videoDur = tsteps * filterDur + self.rng.randint(0, 3)
videoWidth = csteps * filterWidth + self.rng.randint(0, 3)
videoHeight = rsteps * filterHeight + self.rng.randint(0, 3)
numFilters = self.rng.randint(1, 3)
inputChannels = self.rng.randint(1, 3)
self.d.get_value(borrow=True, return_internal_type=True)[0] = \
self.rng.randint(1, 15)
self.d.get_value(borrow=True, return_internal_type=True)[1] = \
self.rng.randint(1, 15)
self.d.get_value(borrow=True, return_internal_type=True)[2] = \
self.rng.randint(1, 15)
outputHeight = int((videoHeight - filterHeight) /
self.d.get_value(borrow=True)[0]) + 1
outputWidth = int((videoWidth - filterWidth) /
self.d.get_value(borrow=True)[1]) + 1
outputDur = int((videoDur - filterDur) /
self.d.get_value(borrow=True)[2]) + 1
self.W.set_value(self.random_tensor(numFilters, filterHeight,
filterWidth, filterDur, inputChannels), borrow=True)
self.b.set_value(self.random_tensor(numFilters), borrow=True)
self.rb.set_value(self.random_tensor(inputChannels), borrow=True)
self.V.set_value(
self.random_tensor(batchSize,videoHeight,videoWidth,videoDur,inputChannels),
borrow=True)
self.V.set_value(self.random_tensor(batchSize, videoHeight,
videoWidth, videoDur, inputChannels), borrow=True)
self.rb.set_value(self.random_tensor(inputChannels), borrow=True)
def test_c_against_python(self):
......@@ -180,37 +207,38 @@ class TestConv3D(unittest.TestCase):
self.check_c_against_python(self.V.get_value(borrow=True).shape[1:4])
def test_c_against_mat_mul(self):
#Use a filter of the same size as the image, so the convolution is just a dense matrix multiply
#Check that dense matrix multiplication gives the same result as convolution
batchSize = self.rng.randint(1,10)
videoDur = self.rng.randint(3,10)
videoWidth = self.rng.randint(1,5)
videoHeight = self.rng.randint(1,5)
filterWidth = videoWidth
filterHeight = videoHeight
filterDur = videoDur
numFilters = self.rng.randint(1,3)
inputChannels = self.rng.randint(1,4)
# Use a filter of the same size as the image, so the convolution is
# just a dense matrix multiply.
# Check that dense matrix multiplication gives the same result as
# convolution.
self.d.get_value(borrow=True, return_internal_type=True)[0] = self.rng.randint(1,15)
self.d.get_value(borrow=True, return_internal_type=True)[1] = self.rng.randint(1,15)
self.d.get_value(borrow=True, return_internal_type=True)[2] = self.rng.randint(1,15)
batchSize = self.rng.randint(1, 10)
videoDur = self.rng.randint(3, 10)
videoWidth = self.rng.randint(1, 5)
videoHeight = self.rng.randint(1, 5)
filterWidth = videoWidth
filterHeight = videoHeight
filterDur = videoDur
numFilters = self.rng.randint(1, 3)
inputChannels = self.rng.randint(1, 4)
self.W.set_value(
self.random_tensor(numFilters,filterHeight,filterWidth,filterDur,inputChannels),
borrow=True)
self.d.get_value(borrow=True, return_internal_type=True)[0] = \
self.rng.randint(1, 15)
self.d.get_value(borrow=True, return_internal_type=True)[1] = \
self.rng.randint(1, 15)
self.d.get_value(borrow=True, return_internal_type=True)[2] = \
self.rng.randint(1, 15)
self.W.set_value(
self.W.get_value(borrow=True) * (self.W.get_value(borrow=True) < 1e-5),
borrow=True)
self.W.set_value(self.random_tensor(numFilters, filterHeight,
filterWidth, filterDur, inputChannels), borrow=True)
self.W.set_value(self.W.get_value(borrow=True) *
(self.W.get_value(borrow=True) < 1e-5), borrow=True)
self.b.set_value(self.random_tensor(numFilters), borrow=True)
self.V.set_value(
self.random_tensor(batchSize,videoHeight,videoWidth,videoDur,inputChannels),
borrow=True)
self.V.set_value(self.random_tensor(batchSize, videoHeight,
videoWidth, videoDur, inputChannels), borrow=True)
Hv = self.H_func()
......@@ -220,156 +248,163 @@ class TestConv3D(unittest.TestCase):
n = inputChannels * videoHeight * videoWidth * videoDur
W_mat = N.zeros((n, numFilters))
V_mat = N.zeros((batchSize,n))
V_mat = N.zeros((batchSize, n))
Hv_mat = N.zeros((batchSize, numFilters))
for qi in xrange(0,numFilters):
W_mat[:,qi] = self.W.get_value(borrow=True)[qi,:,:,:,:].reshape((n))
Hv_mat[:,qi] = Hv[:,0,0,0,qi]
for qi in xrange(0,batchSize):
V_mat[qi,:] = self.V.get_value(borrow=True)[qi,:,:,:,:].reshape((n))
for qi in xrange(0, numFilters):
W_mat[:, qi] = \
self.W.get_value(borrow=True)[qi, :, :, :, :].reshape((n))
Hv_mat[:, qi] = Hv[:, 0, 0, 0, qi]
for qi in xrange(0, batchSize):
V_mat[qi, :] = \
self.V.get_value(borrow=True)[qi, :, :, :, :].reshape((n))
H_mat = N.dot(V_mat,W_mat) + self.b.get_value(borrow=True)
H_mat = N.dot(V_mat, W_mat) + self.b.get_value(borrow=True)
tol = 1e-5
if floatX == 'float32':
tol = 1e-4
if N.abs(H_mat-Hv_mat).max() > tol and not N.allclose(H_mat,Hv_mat):
if N.abs(H_mat - Hv_mat).max() > tol and not N.allclose(H_mat, Hv_mat):
print H_mat
print Hv_mat
print 'max error: '+str(N.abs(H_mat-Hv_mat).max())
print 'max error: ' + str(N.abs(H_mat - Hv_mat).max())
W.get_value(borrow=True)[W.get_value(borrow=True) != 0] += 1.0
print 'min non-zero kernel mag: '+str(N.abs(W.get_value(borrow=True)).min())
print 'min non-zero kernel mag: ' + \
str(N.abs(W.get_value(borrow=True)).min())
assert False
def test_c_against_mat_transp_mul(self):
#Use a filter of the same size as the image, so the convolution is just a dense matrix multiply
#Check that dense matrix multiplication by the transpose of the matrix gives the same result as ConvTransp
batchSize = self.rng.randint(1,10)
videoDur = self.rng.randint(3,15)
videoWidth = self.rng.randint(3,15)
videoHeight = self.rng.randint(3,15)
# Use a filter of the same size as the image, so the convolution is just a
# dense matrix multiply.
# Check that dense matrix multiplication by the transpose of the matrix
# gives the same result as ConvTransp.
batchSize = self.rng.randint(1, 10)
videoDur = self.rng.randint(3, 15)
videoWidth = self.rng.randint(3, 15)
videoHeight = self.rng.randint(3, 15)
filterWidth = videoWidth
filterHeight = videoHeight
filterDur = videoDur
numFilters = self.rng.randint(1,15)
inputChannels = self.rng.randint(1,15)
self.d.get_value(borrow=True, return_internal_type=True)[0] = self.rng.randint(1,15)
self.d.get_value(borrow=True, return_internal_type=True)[1] = self.rng.randint(1,15)
self.d.get_value(borrow=True, return_internal_type=True)[2] = self.rng.randint(1,15)
self.W.set_value(
self.random_tensor(numFilters,filterHeight,filterWidth,filterDur,inputChannels),
borrow=True)
numFilters = self.rng.randint(1, 15)
inputChannels = self.rng.randint(1, 15)
self.d.get_value(borrow=True, return_internal_type=True)[0] = \
self.rng.randint(1, 15)
self.d.get_value(borrow=True, return_internal_type=True)[1] = \
self.rng.randint(1, 15)
self.d.get_value(borrow=True, return_internal_type=True)[2] = \
self.rng.randint(1, 15)
self.W.set_value(self.random_tensor(numFilters, filterHeight,
filterWidth, filterDur, inputChannels), borrow=True)
self.b.set_value(self.random_tensor(numFilters), borrow=True)
self.V.set_value(
self.random_tensor(batchSize,videoHeight,videoWidth,videoDur,inputChannels),
borrow=True)
self.V.set_value(self.random_tensor(batchSize, videoHeight,
videoWidth, videoDur, inputChannels), borrow=True)
self.rb.set_value(self.random_tensor(inputChannels), borrow=True)
H_shape = self.H_shape_func()
assert H_shape[1] == 1
assert H_shape[1] == 1
assert H_shape[2] == 1
assert H_shape[3] == 1
Hv = self.random_tensor( * H_shape )
Hv = self.random_tensor( * H_shape)
Vv = self.transp_func(Hv,[videoHeight,videoWidth,videoDur])
Vv = self.transp_func(Hv, [videoHeight, videoWidth, videoDur])
n = inputChannels * videoHeight * videoWidth * videoDur
rbim = N.zeros((videoHeight,videoWidth,videoDur,inputChannels))
for qi in xrange(0,inputChannels):
rbim[:,:,:,qi] = self.rb.get_value(borrow=True)[qi]
rbim = N.zeros((videoHeight, videoWidth, videoDur, inputChannels))
for qi in xrange(0, inputChannels):
rbim[:, :, :, qi] = self.rb.get_value(borrow=True)[qi]
rbv = rbim.reshape((n))
W_mat = N.zeros((numFilters, n))
Vv_mat = N.zeros((n, batchSize))
Hv_mat = N.zeros((numFilters,batchSize))
for qi in xrange(0,numFilters):
W_mat[qi,:] = self.W.get_value(borrow=True)[qi,:,:,:,:].reshape((n))
Hv_mat[qi,:] = Hv[:,0,0,0,qi]
for qi in xrange(0,batchSize):
Vv_mat[:,qi] = Vv[qi,:,:,:,:].reshape((n))
V_mat = (N.dot(W_mat.transpose(),Hv_mat).transpose() + rbv).transpose()
if N.abs(V_mat-Vv_mat).max() > 1e-5:
Hv_mat = N.zeros((numFilters, batchSize))
for qi in xrange(0, numFilters):
W_mat[qi, :] = \
self.W.get_value(borrow=True)[qi, :, :, :, :].reshape((n))
Hv_mat[qi, :] = Hv[:, 0, 0, 0, qi]
for qi in xrange(0, batchSize):
Vv_mat[:, qi] = Vv[qi, :, :, :, :].reshape((n))
V_mat = (N.dot(W_mat.transpose(), Hv_mat).transpose() + \
rbv).transpose()
if N.abs(V_mat - Vv_mat).max() > 1e-5:
print V_mat
print Vv_mat
for qq in xrange(V_mat.shape[0]):
for qqq in xrange(Vv_mat.shape[1]):
if abs(V_mat[qq,qqq]-Vv_mat[qq,qqq]) > 1e-5:
print 'wrong at '+str((qq,qqq))+': '+str((V_mat[qq,qqq],Vv_mat[qq,qqq]))
if abs(V_mat[qq, qqq] - Vv_mat[qq, qqq]) > 1e-5:
print ('wrong at ' + str((qq, qqq)) + ': ' +
str(V_mat[qq, qqq], Vv_mat[qq, qqq]))
assert False
def test_c_against_sparse_mat_transp_mul(self):
#like test_c_against_mat_transp_mul but using a sparse matrix and a kernel that is smaller than the image
# like test_c_against_mat_transp_mul but using a sparse matrix and a kernel
# that is smaller than the image
if not theano.sparse.enable_sparse:
raise SkipTest('Optional package sparse disabled')
batchSize = self.rng.randint(1,3)
filterWidth = self.rng.randint(1,8)
filterHeight = self.rng.randint(1,8)
filterDur = self.rng.randint(1,8)
batchSize = self.rng.randint(1, 3)
filterWidth = self.rng.randint(1, 8)
filterHeight = self.rng.randint(1, 8)
filterDur = self.rng.randint(1, 8)
self.d.get_value(borrow=True, return_internal_type=True)[0] = self.rng.randint(1,15)
self.d.get_value(borrow=True, return_internal_type=True)[1] = self.rng.randint(1,15)
self.d.get_value(borrow=True, return_internal_type=True)[2] = self.rng.randint(1,15)
self.d.get_value(borrow=True, return_internal_type=True)[0] = \
self.rng.randint(1, 15)
self.d.get_value(borrow=True, return_internal_type=True)[1] = \
self.rng.randint(1, 15)
self.d.get_value(borrow=True, return_internal_type=True)[2] = \
self.rng.randint(1, 15)
dr = self.d.get_value(borrow=True)[0]
dc = self.d.get_value(borrow=True)[1]
dt = self.d.get_value(borrow=True)[2]
numFilters = self.rng.randint(1,3)
row_steps = self.rng.randint(1,4)
col_steps = self.rng.randint(1,4)
time_steps = self.rng.randint(1,4)
numFilters = self.rng.randint(1, 3)
row_steps = self.rng.randint(1, 4)
col_steps = self.rng.randint(1, 4)
time_steps = self.rng.randint(1, 4)
#print (row_steps,col_steps,time_steps)
videoDur = (time_steps-1)*dt+filterDur + self.rng.randint(0,3)
videoWidth = (col_steps-1)*dc+filterWidth + self.rng.randint(0,3)
videoHeight = (row_steps-1)*dr+filterHeight + self.rng.randint(0,3)
videoDur = (time_steps - 1) * dt + filterDur + \
self.rng.randint(0, 3)
videoWidth = (col_steps - 1) * dc + filterWidth + \
self.rng.randint(0, 3)
videoHeight = (row_steps - 1) * dr + filterHeight + \
self.rng.randint(0, 3)
inputChannels = self.rng.randint(1, 15)
inputChannels = self.rng.randint(1,15)
self.W.set_value(
self.random_tensor(numFilters,filterHeight,filterWidth,filterDur,inputChannels),
borrow=True)
self.W.set_value(self.random_tensor(numFilters, filterHeight,
filterWidth, filterDur, inputChannels), borrow=True)
self.b.set_value(self.random_tensor(numFilters), borrow=True)
#just needed so H_shape works
self.V.set_value(
self.random_tensor(batchSize,videoHeight,videoWidth,videoDur,inputChannels),
borrow=True)
self.V.set_value(self.random_tensor(batchSize, videoHeight, videoWidth,
videoDur, inputChannels), borrow=True)
self.rb.set_value(self.random_tensor(inputChannels), borrow=True)
H_shape = self.H_shape_func()
#make index maps
h = N.zeros( H_shape[1:])
r = N.zeros( H_shape[1:])
c = N.zeros( H_shape[1:])
t = N.zeros( H_shape[1:])
for qi in xrange(0,H_shape[4]):
h[:,:,:,qi] = qi
for qi in xrange(0,H_shape[1]):
r[qi,:,:,:] = qi
for qi in xrange(0,H_shape[2]):
c[:,qi,:,:] = qi
for qi in xrange(0,H_shape[3]):
t[:,:,qi,:] = qi
h = N.zeros(H_shape[1:])
r = N.zeros(H_shape[1:])
c = N.zeros(H_shape[1:])
t = N.zeros(H_shape[1:])
for qi in xrange(0, H_shape[4]):
h[:, :, :, qi] = qi
for qi in xrange(0, H_shape[1]):
r[qi, :, :, :] = qi
for qi in xrange(0, H_shape[2]):
c[:, qi, :, :] = qi
for qi in xrange(0, H_shape[3]):
t[:, :, qi, :] = qi
hn = H_shape[1] * H_shape[2] * H_shape[3] * H_shape[4]
......@@ -378,21 +413,20 @@ class TestConv3D(unittest.TestCase):
c = c.reshape((hn))
t = t.reshape((hn))
Hv = self.random_tensor(*H_shape)
Hv = self.random_tensor( * H_shape )
Vv = self.transp_func(Hv,[videoHeight,videoWidth,videoDur])
Vv = self.transp_func(Hv, [videoHeight, videoWidth, videoDur])
n = inputChannels * videoHeight * videoWidth * videoDur
rbim = N.zeros((videoHeight,videoWidth,videoDur,inputChannels))
for qi in xrange(0,inputChannels):
rbim[:,:,:,qi] = self.rb.get_value(borrow=True)[qi]
rbim = N.zeros((videoHeight, videoWidth, videoDur, inputChannels))
for qi in xrange(0, inputChannels):
rbim[:, :, :, qi] = self.rb.get_value(borrow=True)[qi]
rbv = rbim.reshape((n))
W_mat = N.zeros((hn,n))
W_mat = N.zeros((hn, n))
Vv_mat = N.zeros((n, batchSize))
Hv_mat = N.zeros((hn,batchSize))
for qi in xrange(0,hn):
Hv_mat = N.zeros((hn, batchSize))
for qi in xrange(0, hn):
hi = h[qi]
ri = r[qi]
ci = c[qi]
......@@ -401,57 +435,66 @@ class TestConv3D(unittest.TestCase):
placed_filter = N.zeros(self.V.get_value(borrow=True).shape[1:])
placed_filter[
ri*dr:ri*dr+self.W.get_value(borrow=True).shape[1],
ci*dc:ci*dc+self.W.get_value(borrow=True).shape[2],
ti*dt:ti*dt+self.W.get_value(borrow=True).shape[3],
:] = self.W.get_value(borrow=True)[hi,:,:,:,:]
W_mat[qi,:] = placed_filter.reshape((n))
Hv_mat[qi,:] = Hv[:,ri,ci,ti,hi]
for qi in xrange(0,batchSize):
Vv_mat[:,qi] = Vv[qi,:,:,:,:].reshape((n))
ri * dr:ri * dr + self.W.get_value(borrow=True).shape[1],
ci * dc:ci * dc + self.W.get_value(borrow=True).shape[2],
ti * dt:ti * dt + self.W.get_value(borrow=True).shape[3],
:] = self.W.get_value(borrow=True)[hi, :, :, :, :]
W_mat[qi, :] = placed_filter.reshape((n))
Hv_mat[qi, :] = Hv[:, ri, ci, ti, hi]
for qi in xrange(0, batchSize):
Vv_mat[:, qi] = Vv[qi, :, :, :, :].reshape((n))
W_mat_T = sparse.csr_matrix(W_mat.transpose())
temp = W_mat_T * Hv_mat
V_mat = (temp.transpose() + rbv).transpose()
if N.abs(V_mat-Vv_mat).max() > 1e-5:
if N.abs(V_mat - Vv_mat).max() > 1e-5:
print 'mul'
print V_mat
print 'conv'
print Vv_mat
for i in xrange(0,n):
for j in xrange(0,batchSize):
if abs(V_mat[i,j] - Vv_mat[i,j]) > 1e-5:
print 'wrong at %d,%d: %f mul versus %f conv' % (i,j,V_mat[i,j],Vv_mat[i,j])
for i in xrange(0, n):
for j in xrange(0, batchSize):
if abs(V_mat[i, j] - Vv_mat[i, j]) > 1e-5:
print ('wrong at %d,%d: %f mul versus %f conv'
% (i, j, V_mat[i, j], Vv_mat[i, j]))
assert False
def test_infer_shape(self):
self.randomize()
Hv = self.H_func()
H_shape = self.H_shape_func()
assert N.all(Hv.shape == H_shape)
gradients = self.gradientsFunc(self.V.get_value(borrow=True).shape[1:4])
dCdWv = gradients[0]
dCdW_shape = self.dCdW_shape_func(self.V.get_value(borrow=True).shape[1:4])
# Conv3D
self._compile_and_check([], [self.H], [], Conv3D)
assert N.all(dCdWv.shape == dCdW_shape)
Rv = self.R_func(self.V.get_value(borrow=True).shape[1:4])
R_shape = self.R_shape_func(self.V.get_value(borrow=True).shape[1:4])
assert N.all(Rv.shape == R_shape)
# ConvTransp3D
self._compile_and_check([self.RShape], [self.R],
[self.V.get_value(borrow=True).shape[1:4]], ConvTransp3D)
# ConvGrad3D
self._compile_and_check([self.RShape], [T.grad(self.reconsObj, self.W),
T.grad(self.reconsObj, self.H),
T.grad(self.reconsObj, self.V),
T.grad(self.reconsObj, self.b)],
[self.V.get_value(borrow=True).shape[1:4]], ConvGrad3D)
def test_gradient(self):
self.randomize()
rng, V,W,b,d,rb = self.rng, self.V, self.W, self.b, self.d, self.rb
dCdH = shared(self.random_tensor( *self.H_shape_func() ))
rng, V, W, b, d, rb = self.rng, self.V, self.W, self.b, self.d, self.rb
dCdH = shared(self.random_tensor(*self.H_shape_func()))
testsPerDir = 2
theano.tests.unittest_tools.verify_grad(DummyConv3D(rng, (V,W,b), d), [0.0], n_tests=testsPerDir)
theano.tests.unittest_tools.verify_grad(DummyConvTransp3D(rng, (W,rb,dCdH), d,V.get_value(borrow=True).shape[1:4]), [0.0], n_tests=testsPerDir)
theano.tests.unittest_tools.verify_grad(DummyConvGrad3D(rng, (V,dCdH), d, W.get_value(borrow=True).shape), [0.0], n_tests=testsPerDir)
theano.tests.unittest_tools.verify_grad(DummyConv3D(rng, (V, W, b), d),
[0.0], n_tests=testsPerDir)
theano.tests.unittest_tools.verify_grad(DummyConvTransp3D(rng,
(W, rb, dCdH), d, V.get_value(borrow=True).shape[1:4]),
[0.0], n_tests=testsPerDir)
theano.tests.unittest_tools.verify_grad(DummyConvGrad3D(rng, (V,dCdH),
d, W.get_value(borrow=True).shape),
[0.0], n_tests=testsPerDir)
if __name__ == '__main__':
t = TestConv3D('setUp')
t.setUp()
t.test_infer_shape()
......@@ -17,47 +17,64 @@ from theano.tensor.nnet import (categorical_crossentropy,
crossentropy_softmax_1hot_with_bias,
crossentropy_softmax_1hot_with_bias_dx,
crossentropy_softmax_argmax_1hot_with_bias,
CrossentropySoftmax1HotWithBiasDx,
CrossentropySoftmaxArgmax1HotWithBias,
CrossentropyCategorical1Hot,
CrossentropyCategorical1HotGrad,
sigmoid, softplus,
Softmax, softmax, SoftmaxWithBias, softmax_grad,
softmax_with_bias,
Softmax, softmax, SoftmaxWithBias,
softmax_grad,
softmax_with_bias, SoftmaxGrad,
Prepend_scalar_constant_to_each_row,
Prepend_scalar_to_each_row)
from theano.tensor import dmatrix, dvector, lvector, dscalar
class T_sigmoid(unittest.TestCase):
    """Gradient check for the elementwise sigmoid op."""

    def setUp(self):
        utt.seed_rng()

    def test_elemwise(self):
        # Fix: drop the duplicated pre-formatting verify_grad line left
        # over from the diff; one gradient check on a 3x4 input suffices.
        utt.verify_grad(sigmoid, [numpy.random.rand(3, 4)])
class T_softplus(unittest.TestCase):
    """Gradient check for the elementwise softplus op."""

    def setUp(self):
        utt.seed_rng()

    def test_elemwise(self):
        # Fix: drop the duplicated pre-formatting verify_grad line left
        # over from the diff; one gradient check on a 3x4 input suffices.
        utt.verify_grad(softplus, [numpy.random.rand(3, 4)])
class T_Softmax(utt.InferShapeTester):
class T_Softmax(unittest.TestCase):
def setUp(self):
utt.seed_rng()
def test0(self):
def f(a):
return softmax(a)[:,0]
utt.verify_grad(f, [numpy.random.rand(3,4)])
return softmax(a)[:, 0]
utt.verify_grad(f, [numpy.random.rand(3, 4)])
def test1(self):
def f(a):
return softmax(a)[:,1]
utt.verify_grad(f, [numpy.random.rand(3,4)])
return softmax(a)[:, 1]
utt.verify_grad(f, [numpy.random.rand(3, 4)])
def test2(self):
def f(a):
return softmax(a)[:,2]
utt.verify_grad(f, [numpy.random.rand(3,4)])
return softmax(a)[:, 2]
utt.verify_grad(f, [numpy.random.rand(3, 4)])
def test3(self):
def f(a):
return softmax(a)[:,3]
utt.verify_grad(f, [numpy.random.rand(3,4)])
return softmax(a)[:, 3]
utt.verify_grad(f, [numpy.random.rand(3, 4)])
def test_infer_shape(self):
f=theano.function([],softmax(numpy.random.rand(3,4)).shape)
assert all(f()==[3,4])
admat = dmatrix()
admat_val = numpy.random.rand(3, 4)
self._compile_and_check([admat], [Softmax()(admat)],
[admat_val], Softmax)
def test_vector(self):
x = T.vector()
......@@ -65,109 +82,134 @@ class T_Softmax(unittest.TestCase):
xv = numpy.random.randn(6).astype(config.floatX)
assert numpy.allclose(f(xv), numpy.exp(xv) / numpy.exp(xv).sum())
def test_vector_grad(self):
def f(a):
return softmax(a)
utt.verify_grad(f, [numpy.random.rand(4)])
class T_SoftmaxWithBias(utt.InferShapeTester):
    """Tests for softmax_with_bias: per-column gradients, a broadcasting
    regression, and infer_shape.

    Fix: the diff residue left two class headers (old unittest.TestCase
    version plus its setUp) and pre-formatting duplicates of nearly every
    statement; reconstruct the post-merge class.
    """

    def test0(self):
        def f(a, b):
            return softmax_with_bias(a, b)[:, 0]
        utt.verify_grad(f, [numpy.random.rand(3, 4),
                            numpy.random.rand(4)])

    def test1(self):
        def f(a, b):
            return softmax_with_bias(a, b)[:, 1]
        utt.verify_grad(f, [numpy.random.rand(3, 4),
                            numpy.random.rand(4)])

    def test2(self):
        def f(a, b):
            return softmax_with_bias(a, b)[:, 2]
        utt.verify_grad(f, [numpy.random.rand(3, 4),
                            numpy.random.rand(4)])

    def test3(self):
        def f(a, b):
            return softmax_with_bias(a, b)[:, 3]
        utt.verify_grad(f, [numpy.random.rand(3, 4),
                            numpy.random.rand(4)])

    def test_broadcast(self):
        # test that we don't raise an error during optimization for no good
        # reason as softmax_with_bias don't support correctly some/all
        # broadcasted inputs pattern
        initial_W = numpy.asarray([[0.1, 0.1, 0.1],
                                   [0.1, 0.1, 0.1],
                                   [0.1, 0.1, 0.1]],
                                  dtype=theano.config.floatX)
        W = theano.shared(value=initial_W, name='W')
        vbias = theano.shared(value=0.1, name='vbias')  # 0.01
        hid = T.vector('hid')
        f = theano.function([hid],
                            T.nnet.softmax(T.dot(hid, W.T) + vbias))
        ops = [node.op for node in f.maker.fgraph.toposort()]
        # The optimizer must keep plain softmax here, not substitute
        # softmax_with_bias (which mishandles these broadcast patterns).
        assert softmax_with_bias not in ops
        assert softmax in ops
        f([0, 1, 0])
        #print f.maker.fgraph.toposort()

    def test_infer_shape(self):
        """Check SoftmaxWithBias.infer_shape with InferShapeTester."""
        admat = dmatrix()
        advec = dvector()
        admat_val = numpy.random.rand(3, 4)
        advec_val = numpy.random.rand(4)
        self._compile_and_check([admat, advec],
                                [SoftmaxWithBias()(admat, advec)],
                                [admat_val, advec_val], SoftmaxWithBias)
class T_SoftmaxGrad(utt.InferShapeTester):
    """infer_shape test for the SoftmaxGrad op.

    Fix: the old unittest.TestCase header and the old assert-based
    implementation were left interleaved with the new InferShapeTester
    version; keep only the post-merge code.
    """

    def test_infer_shape(self):
        admat = dmatrix()
        bdmat = dmatrix()
        admat_val = numpy.random.rand(3, 4)
        bdmat_val = numpy.random.rand(3, 4)
        self._compile_and_check([admat, bdmat], [SoftmaxGrad()(admat, bdmat)],
                                [admat_val, bdmat_val], SoftmaxGrad)
class T_CrossentropySoftmax1Hot(unittest.TestCase):
    """Gradient checks for crossentropy_softmax_1hot(_with_bias).

    Fix: remove the pre-formatting duplicates (y_idx assignments,
    return statements and verify_grad calls) left by the diff.
    """

    def setUp(self):
        utt.seed_rng()

    def test0(self):
        y_idx = [0, 1, 3]

        def f(a, b):
            return crossentropy_softmax_1hot_with_bias(a, b, y_idx)[0]
        utt.verify_grad(f, [numpy.random.rand(3, 4),
                            numpy.random.rand(4)])

    def test1(self):
        y_idx = [0, 1, 3]

        def f(a):
            return crossentropy_softmax_1hot(a, y_idx)[0]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test_vector(self):
        y_idx = [3]

        def f(a):
            # shape_padleft turns the vector into a 1-row matrix.
            return crossentropy_softmax_1hot(T.shape_padleft(a), y_idx)[0]
        utt.verify_grad(f, [numpy.random.rand(4)])

    def test_vectors(self):
        y_idx = [3]

        def f(a, b):
            return crossentropy_softmax_1hot(T.shape_padleft(a) + b, y_idx)[0]
        utt.verify_grad(f, [numpy.random.rand(4), numpy.random.rand(4)])
class T_CrossentropySoftmax1HotWithBiasDx(unittest.TestCase):
def setUp(self):
utt.seed_rng()
class T_CrossentropySoftmax1HotWithBiasDx(utt.InferShapeTester):
def test0(self):
def f(sm):
return (theano.tensor.nnet.crossentropy_softmax_1hot_with_bias_dx(
numpy.random.rand(10), # Gradient w.r.t. NLL.
sm, # Softmax output.
numpy.random.randint(low=0, high=5, size=10))) # Class indices.
numpy.random.randint(low=0,
high=5, size=10))) # Class indices.
# Build a random softmax output whose rows sum to 1.
softmax_output = numpy.random.rand(10, 5)
softmax_output /= softmax_output.sum(axis=1).reshape(10, 1)
utt.verify_grad(f, [softmax_output])
def test1(self):
rng = numpy.random.RandomState(utt.fetch_seed())
softmax_output = rng.rand(10, 5)
softmax_output /= softmax_output.sum(axis=1).reshape(10, 1)
def f(dy):
return (theano.tensor.nnet.crossentropy_softmax_1hot_with_bias_dx(
dy,
......@@ -175,19 +217,38 @@ class T_CrossentropySoftmax1HotWithBiasDx(unittest.TestCase):
rng.randint(low=0, high=5, size=10)))
utt.verify_grad(f, [rng.rand(10)])
class T_CrossentropySoftmaxArgmax1HotWithBias(unittest.TestCase):
def test_infer_shape(self):
admat = dmatrix()
advec = dvector()
alvec = lvector()
rng = numpy.random.RandomState(utt.fetch_seed())
admat_val = rng.rand(10, 5)
admat_val /= admat_val.sum(axis=1).reshape(10, 1)
advec_val = rng.rand(10)
alvec_val = rng.randint(low=0, high=5, size=10)
self._compile_and_check([advec, admat, alvec],
[CrossentropySoftmax1HotWithBiasDx()(advec, admat, alvec)],
[advec_val, admat_val, alvec_val],
CrossentropySoftmax1HotWithBiasDx)
class T_CrossentropySoftmaxArgmax1HotWithBias(utt.InferShapeTester):
def setUp(self):
utt.seed_rng()
super(T_CrossentropySoftmaxArgmax1HotWithBias, self).setUp()
self.op = theano.tensor.nnet.crossentropy_softmax_argmax_1hot_with_bias
def test0(self):
n_classes = 5
n_samples = 3
# First test gradient when getting a gradient on the NLL output.
def grad_on_nll(x, b):
return self.op(x, b, y_idx=numpy.random.randint(
low=0, high=n_classes, size=n_samples))[0]
utt.verify_grad(grad_on_nll, [numpy.random.rand(n_samples, n_classes),
numpy.random.rand(n_classes)])
# Then test gradient when getting a gradient on the softmax output.
def grad_on_softmax(x, b):
return self.op(x, b, y_idx=numpy.random.randint(
......@@ -197,68 +258,107 @@ class T_CrossentropySoftmaxArgmax1HotWithBias(unittest.TestCase):
numpy.random.rand(n_classes)])
def test_infer_shape(self):
var = self.op(numpy.random.rand(3,5),numpy.random.rand(5), y_idx=numpy.random.randint(
low=0, high=5, size=3))
assert theano.function([],var[0].shape)() == [3]
assert all(theano.function([],var[1].shape)() == [3,5])
assert theano.function([],var[2].shape)() == [3]
admat = dmatrix()
advec = dvector()
alvec = lvector()
rng = numpy.random.RandomState(utt.fetch_seed())
admat_val = rng.rand(3, 5)
advec_val = rng.rand(5)
alvec_val = rng.randint(low=0, high=5, size=3)
self._compile_and_check([admat, advec, alvec],
CrossentropySoftmaxArgmax1HotWithBias()(admat, advec, alvec),
[admat_val, advec_val, alvec_val],
CrossentropySoftmaxArgmax1HotWithBias)
class T_prepend(utt.InferShapeTester):
    """Tests for Prepend_scalar_constant_to_each_row and
    Prepend_scalar_to_each_row.

    Fix: the diff residue left two shadowing T_prepend classes plus the
    pre-formatting duplicates of most statements; reconstruct the single
    post-merge class (test0, test1, test_infer_shape).
    """

    def setUp(self):
        utt.seed_rng()

    def test0(self):
        """basic functionality"""
        x = tensor.matrix('x')
        y = Prepend_scalar_constant_to_each_row(4.)(x)
        f = theano.function([x], [y])
        m = numpy.random.rand(3, 5)
        my = f(m)
        # Prepending one column: (3, 5) -> (3, 6), first column == 4.0.
        self.assertTrue(my.shape == (3, 6), my.shape)
        self.assertTrue(numpy.all(my[:, 0] == 4.0))

    def test1(self):
        "basic functionality"
        x = tensor.matrix('x')
        y = Prepend_scalar_to_each_row()(5., x)
        f = theano.function([x], y)
        m = numpy.ones((3, 5), dtype="float32")
        my = f(m)
        self.assertTrue(my.shape == (3, 6))
        self.assertTrue(numpy.all(my[:, 0] == 5.0))

    def test_infer_shape(self):
        """Check infer_shape of both Prepend ops."""
        admat = dmatrix()
        adscal = dscalar()
        rng = numpy.random.RandomState(utt.fetch_seed())
        admat_val = rng.rand(3, 5)
        adscal_val = rng.rand()
        self._compile_and_check([admat],
            [Prepend_scalar_constant_to_each_row(adscal_val)(admat)],
            [admat_val],
            Prepend_scalar_constant_to_each_row)
        self._compile_and_check([adscal, admat],
            [Prepend_scalar_to_each_row()(adscal, admat)],
            [adscal_val, admat_val],
            Prepend_scalar_to_each_row)
class T_CrossentropyCategorical1Hot(unittest.TestCase):
def setUp(self):
utt.seed_rng()
class T_CrossentropyCategorical1HotGrad(utt.InferShapeTester):
    """infer_shape test for the CrossentropyCategorical1HotGrad op."""

    def test_infer_shape(self):
        g_y = dvector()
        coding = dmatrix()
        one_of_n = lvector()
        rng = numpy.random.RandomState(utt.fetch_seed())
        # Draw the values in the same order as before: vector then matrix.
        g_y_val = rng.rand(3)
        coding_val = rng.rand(3, 2)
        one_of_n_val = [0, 1, 0]
        self._compile_and_check(
            [g_y, coding, one_of_n],
            [CrossentropyCategorical1HotGrad()(g_y, coding, one_of_n)],
            [g_y_val, coding_val, one_of_n_val],
            CrossentropyCategorical1HotGrad)
class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
def test_grad(self):
x = tensor.matrix('x')
one_of_n = tensor.lvector('one_of_n')
op = crossentropy_categorical_1hot
xe = op(x, one_of_n)
f = theano.function([x, one_of_n], xe)
x_val = numpy.asarray([[.4, .6, .0], [.1, .8, .1]],
dtype=config.floatX)
xe_val = f(x_val, [0,1])
xe_val = f(x_val, [0, 1])
assert numpy.allclose(xe_val, -numpy.log([.4, .8]))
def oplike(x):
return op(x, [0,1])
return op(x, [0, 1])
tensor.verify_grad(oplike, [x_val], rng=numpy.random)
# see issue gh-788
def est_infer_shape(self):
admat = dmatrix()
alvec = lvector()
rng = numpy.random.RandomState(utt.fetch_seed())
admat_val = rng.rand(3, 2)
alvec_val = [0, 1, 0]
self._compile_and_check([admat, alvec],
[CrossentropyCategorical1Hot()(admat, alvec)],
[admat_val, alvec_val],
CrossentropyCategorical1Hot)
def test_softmax_optimizations(self):
x = tensor.matrix('x')
one_of_n = tensor.lvector('one_of_n')
op = crossentropy_categorical_1hot
xe = op(x, one_of_n)
fgraph = gof.FunctionGraph(
......@@ -270,7 +370,8 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
theano.compile.mode.OPT_FAST_RUN).optimize(fgraph)
assert str(fgraph.outputs[0].owner.op) == 'OutputGuard'
assert fgraph.outputs[0].owner.inputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias
assert (fgraph.outputs[0].owner.inputs[0].owner.op ==
crossentropy_softmax_argmax_1hot_with_bias)
def test_softmax_optimizations_vector(self):
x = tensor.vector('x')
......@@ -284,19 +385,19 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
theano.compile.mode.optdb.query(
theano.compile.mode.OPT_FAST_RUN).optimize(fgraph)
assert str(fgraph.outputs[0].owner.op) == 'OutputGuard'
assert fgraph.outputs[0].owner.inputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias
assert (fgraph.outputs[0].owner.inputs[0].owner.op ==
crossentropy_softmax_argmax_1hot_with_bias)
def test_softmax_optimizations_w_bias(self):
x = tensor.matrix('x')
b = tensor.vector('b')
one_of_n = tensor.lvector('one_of_n')
op = crossentropy_categorical_1hot
xe = op(x, one_of_n)
fgraph = gof.FunctionGraph(
[x, b, one_of_n],
[op(softmax(x+b), one_of_n)])
[op(softmax(x + b), one_of_n)])
assert fgraph.outputs[0].owner.op == op
#print 'BEFORE'
......@@ -316,7 +417,8 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
assert len(fgraph.toposort()) == 2
assert str(fgraph.outputs[0].owner.op) == 'OutputGuard'
assert fgraph.outputs[0].owner.inputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias
assert (fgraph.outputs[0].owner.inputs[0].owner.op ==
crossentropy_softmax_argmax_1hot_with_bias)
def test_softmax_optimizations_w_bias2(self):
x = tensor.matrix('x')
......@@ -327,7 +429,7 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
fgraph = gof.FunctionGraph(
[x, b, c, one_of_n],
[op(softmax(T.add(x,b,c)), one_of_n)])
[op(softmax(T.add(x, b, c)), one_of_n)])
assert fgraph.outputs[0].owner.op == op
#print 'BEFORE'
......@@ -345,7 +447,8 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
assert len(fgraph.toposort()) == 3
assert str(fgraph.outputs[0].owner.op) == 'OutputGuard'
assert fgraph.outputs[0].owner.inputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias
assert (fgraph.outputs[0].owner.inputs[0].owner.op ==
crossentropy_softmax_argmax_1hot_with_bias)
def test_softmax_optimizations_w_bias_vector(self):
x = tensor.vector('x')
......@@ -354,7 +457,7 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
op = crossentropy_categorical_1hot
fgraph = gof.FunctionGraph(
[x, b, one_of_n],
[op(softmax(x+b), one_of_n)])
[op(softmax(x + b), one_of_n)])
assert fgraph.outputs[0].owner.op == op
#print 'BEFORE'
#for node in fgraph.toposort():
......@@ -370,15 +473,14 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
#print '===='
assert len(fgraph.toposort()) == 3
assert str(fgraph.outputs[0].owner.op) == 'OutputGuard'
assert fgraph.outputs[0].owner.inputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias
assert (fgraph.outputs[0].owner.inputs[0].owner.op ==
crossentropy_softmax_argmax_1hot_with_bias)
def test_softmax_grad_optimizations(self):
x = tensor.matrix('x')
one_of_n = tensor.lvector('one_of_n')
op = crossentropy_categorical_1hot
xe = op(softmax(x), one_of_n)
sum_xe = tensor.sum(xe)
g_x = tensor.grad(sum_xe, x)
fgraph = gof.FunctionGraph(
......@@ -396,8 +498,8 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
#for node in fgraph.toposort():
# print node.op, node.inputs
# the function has 9 ops because the dimshuffle and elemwise{second} aren't getting
# cleaned up as well as we'd like.
# the function has 9 ops because the dimshuffle and elemwise{second}
# aren't getting cleaned up as well as we'd like.
has_cx1hot = False
has_cx1hotdx = False
has_softmax = False
......@@ -405,13 +507,12 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
for node in fgraph.toposort():
if node.op == crossentropy_softmax_argmax_1hot_with_bias:
has_cx1hot = True
if node.op == crossentropy_softmax_1hot_with_bias_dx :
if node.op == crossentropy_softmax_1hot_with_bias_dx:
has_cx1hotdx = True
if node.op == softmax:
has_softmax = True
if node.op == softmax_grad:
has_softmaxdx = True
assert has_cx1hot
assert has_cx1hotdx
assert not has_softmax
......@@ -439,8 +540,8 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
#for node in fgraph.toposort():
# print node.op, node.inputs
# the function has 9 ops because the dimshuffle and elemwise{second} aren't getting
# cleaned up as well as we'd like.
# the function has 9 ops because the dimshuffle and elemwise{second}
# aren't getting cleaned up as well as we'd like.
has_cx1hot = False
has_cx1hotdx = False
has_softmax = False
......@@ -448,13 +549,12 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
for node in fgraph.toposort():
if node.op == crossentropy_softmax_argmax_1hot_with_bias:
has_cx1hot = True
if node.op == crossentropy_softmax_1hot_with_bias_dx :
if node.op == crossentropy_softmax_1hot_with_bias_dx:
has_cx1hotdx = True
if node.op == softmax:
has_softmax = True
if node.op == softmax_grad:
has_softmaxdx = True
assert has_cx1hot
assert has_cx1hotdx
assert not has_softmax
......@@ -469,13 +569,10 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
mode = theano.compile.mode.get_default_mode()
if mode == theano.compile.mode.get_mode('FAST_COMPILE'):
mode = 'FAST_RUN'
rng = numpy.random.RandomState(utt.fetch_seed())
x_val = rng.randn(3,5)
x_val = rng.randn(3, 5)
b_val = rng.randn(5)
y_val = numpy.asarray([2,4,1])
y_val = numpy.asarray([2, 4, 1])
x = T.dmatrix('x')
b = T.dvector('b')
y = T.lvector('y')
......@@ -487,10 +584,10 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
-T.sum(T.log(softmax(x))[T.arange(y.shape[0]), y]),
T.sum(-T.log(softmax(x))[T.arange(y.shape[0]), y])
]
for expr in expressions:
# Verify the optimizer worked on the expressions
f = theano.function([x,y], expr, mode=mode)
f = theano.function([x, y], expr, mode=mode)
if verbose:
theano.printing.debugprint(f)
try:
......@@ -501,7 +598,7 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
raise
# Also verify the gradient wrt x
g = theano.function([x,y], T.grad(expr, x), mode=mode)
g = theano.function([x, y], T.grad(expr, x), mode=mode)
if verbose:
theano.printing.debugprint(g)
try:
......@@ -513,23 +610,22 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
## Test that a biased softmax is optimized correctly
bias_expressions = [
T.sum(-T.log(softmax(x+b)[T.arange(y.shape[0]), y])),
-T.sum(T.log(softmax(b+x)[T.arange(y.shape[0]), y])),
-T.sum(T.log(softmax(x+b))[T.arange(y.shape[0]), y]),
T.sum(-T.log(softmax(b+x))[T.arange(y.shape[0]), y])]
T.sum(-T.log(softmax(x + b)[T.arange(y.shape[0]), y])),
-T.sum(T.log(softmax(b + x)[T.arange(y.shape[0]), y])),
-T.sum(T.log(softmax(x + b))[T.arange(y.shape[0]), y]),
T.sum(-T.log(softmax(b + x))[T.arange(y.shape[0]), y])]
for expr in bias_expressions:
f = theano.function([x,b,y], expr, mode=mode)
f = theano.function([x, b, y], expr, mode=mode)
if verbose:
theano.printing.debugprint(f)
try:
assert len(f.maker.fgraph.toposort()) == 2 # [big_op, sum]
assert len(f.maker.fgraph.toposort()) == 2 # [big_op, sum]
f(x_val, b_val, y_val)
except Exception:
theano.printing.debugprint(f)
raise
g = theano.function([x,b,y], T.grad(expr, x), mode=mode)
g = theano.function([x, b, y], T.grad(expr, x), mode=mode)
if verbose:
theano.printing.debugprint(g)
try:
......@@ -547,7 +643,7 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
T.mean(-T.log(softmax(x))[T.arange(y.shape[0]), y])]
for expr in mean_expressions:
f = theano.function([x,y], expr, mode=mode)
f = theano.function([x, y], expr, mode=mode)
if verbose:
theano.printing.debugprint(f)
try:
......@@ -557,11 +653,12 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
theano.printing.debugprint(f)
raise
g = theano.function([x,y], T.grad(expr, x), mode=mode)
g = theano.function([x, y], T.grad(expr, x), mode=mode)
if verbose:
theano.printing.debugprint(g)
try:
assert len(g.maker.fgraph.toposort()) in (6,7) #there's an extra dimshuffle in there
assert len(g.maker.fgraph.toposort()) in (6, 7)
#there's an extra dimshuffle in there
# but I can't think of a good rule to get rid of it
g(x_val, y_val)
except Exception:
......@@ -569,13 +666,13 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
raise
mean_bias_expressions = [
T.mean(-T.log(softmax(x+b)[T.arange(y.shape[0]), y])),
-T.mean(T.log(softmax(b+x)[T.arange(y.shape[0]), y])),
-T.mean(T.log(softmax(x+b))[T.arange(y.shape[0]), y]),
T.mean(-T.log(softmax(b+x))[T.arange(y.shape[0]), y])]
T.mean(-T.log(softmax(x + b)[T.arange(y.shape[0]), y])),
-T.mean(T.log(softmax(b + x)[T.arange(y.shape[0]), y])),
-T.mean(T.log(softmax(x + b))[T.arange(y.shape[0]), y]),
T.mean(-T.log(softmax(b + x))[T.arange(y.shape[0]), y])]
for expr in mean_bias_expressions:
f = theano.function([x,b,y], expr, mode=mode)
f = theano.function([x, b, y], expr, mode=mode)
if verbose:
theano.printing.debugprint(f)
try:
......@@ -583,12 +680,11 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
except Exception:
theano.printing.debugprint(f)
raise
g = theano.function([x,b,y], T.grad(expr, x), mode=mode)
g = theano.function([x, b, y], T.grad(expr, x), mode=mode)
if verbose:
theano.printing.debugprint(g)
try:
assert len(g.maker.fgraph.toposort()) in (6,7)
assert len(g.maker.fgraph.toposort()) in (6, 7)
g(x_val, b_val, y_val)
except Exception:
theano.printing.debugprint(g)
......@@ -600,15 +696,13 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
if mode == theano.compile.mode.get_mode('FAST_COMPILE'):
mode = 'FAST_RUN'
rng = numpy.random.RandomState(utt.fetch_seed())
x_val = rng.randn(3,5)
x_val = rng.randn(3, 5)
b_val = rng.randn(5)
y_val = numpy.asarray([2,4,1], dtype='int64')
y_val = numpy.asarray([2, 4, 1], dtype='int64')
x = T.dmatrix('x')
b = T.dvector('b')
y = T.lvector('y')
yi = T.cast(y, 'int32')
expressions = [
T.sum(-T.log(softmax(x)[T.arange(yi.shape[0]), yi])),
-T.sum(T.log(softmax(x)[T.arange(yi.shape[0]), yi])),
......@@ -618,7 +712,7 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
for expr in expressions:
# Verify the optimizer worked on the expressions
f = theano.function([x,y], expr, mode=mode)
f = theano.function([x, y], expr, mode=mode)
if verbose:
theano.printing.debugprint(f)
try:
......@@ -629,7 +723,7 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
raise
# Also verify the gradient wrt x
g = theano.function([x,y], T.grad(expr, x), mode=mode)
g = theano.function([x, y], T.grad(expr, x), mode=mode)
if verbose:
theano.printing.debugprint(g)
try:
......@@ -639,7 +733,6 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
theano.printing.debugprint(g)
raise
def test_optimize_xent_vector(self):
verbose = 0
mode = theano.compile.mode.get_default_mode()
......@@ -665,8 +758,9 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
-T.sum(T.log(softmax(x)[T.arange(y.shape[0]), y]))]
for expr in bias_expressions:
f = theano.function([x,y], expr, mode=mode)
if verbose: print_graph(f)
f = theano.function([x, y], expr, mode=mode)
if verbose:
print_graph(f)
try:
prev, last = f.maker.fgraph.toposort()[-2:]
assert len(f.maker.fgraph.toposort()) == 5
......@@ -674,8 +768,7 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
except Exception:
theano.printing.debugprint(f)
raise
g = theano.function([x,y], T.grad(expr, x), mode=mode)
g = theano.function([x, y], T.grad(expr, x), mode=mode)
print_graph(g)
try:
ops = [node.op for node in g.maker.fgraph.toposort()]
......@@ -711,17 +804,19 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
## Test that a biased softmax is optimized correctly
bias_expressions = [
T.sum(-T.log(softmax(x+b)[T.arange(y.shape[0]), y])),
-T.sum(T.log(softmax(b+x)[T.arange(y.shape[0]), y])),
-T.sum(T.log(softmax(x+b))[T.arange(y.shape[0]), y]),
T.sum(-T.log(softmax(b+x))[T.arange(y.shape[0]), y])]
T.sum(-T.log(softmax(x + b)[T.arange(y.shape[0]), y])),
-T.sum(T.log(softmax(b + x)[T.arange(y.shape[0]), y])),
-T.sum(T.log(softmax(x + b))[T.arange(y.shape[0]), y]),
T.sum(-T.log(softmax(b + x))[T.arange(y.shape[0]), y])]
for expr in bias_expressions:
f = theano.function([x,b,y], expr, mode=mode)
if verbose: print_graph(f)
f = theano.function([x, b, y], expr, mode=mode)
if verbose:
print_graph(f)
try:
prev, last = f.maker.fgraph.toposort()[-2:]
assert len(f.maker.fgraph.toposort()) == 3 # [big_op, sum, dim_shuffle]
assert len(f.maker.fgraph.toposort()) == 3
# [big_op, sum, dim_shuffle]
f(x_val, b_val, y_val)
except Exception:
theano.printing.debugprint(f)
......@@ -730,7 +825,7 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
backup = config.warn.sum_div_dimshuffle_bug
config.warn.sum_div_dimshuffle_bug = False
try:
g = theano.function([x,b,y], T.grad(expr, x), mode=mode)
g = theano.function([x, b, y], T.grad(expr, x), mode=mode)
finally:
config.warn.sum_div_dimshuffle_bug = backup
......@@ -752,13 +847,10 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
mode = theano.compile.mode.get_default_mode()
if mode == theano.compile.mode.get_mode('FAST_COMPILE'):
mode = 'FAST_RUN'
rng = numpy.random.RandomState(utt.fetch_seed())
x_val = rng.randn(3,5)
x_val = rng.randn(3, 5)
b_val = rng.randn(5)
y_val = numpy.asarray([2,4,1])
y_val = numpy.asarray([2, 4, 1])
x = T.dmatrix('x')
b = T.dvector('b')
y = T.lvector('y')
......@@ -800,7 +892,6 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
assert has_softmax
assert not has_softmaxdx
## Cases to test
expressions = [
a * T.sum(-T.log(softmax(x)[T.arange(y.shape[0]), y])),
......@@ -826,7 +917,7 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
for expr in expressions:
# Verify the optimizer worked on the expressions
f = theano.function([x,y,a], expr, mode=mode)
f = theano.function([x, y, a], expr, mode=mode)
try:
assert 5 <= len(f.maker.fgraph.toposort()) <= 10
validate_fn_graph(f)
......@@ -836,7 +927,7 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
raise
# Verify the gradient wrt x
g = theano.function([x,y,a], T.grad(expr, x), mode=mode)
g = theano.function([x, y, a], T.grad(expr, x), mode=mode)
try:
assert 5 <= len(g.maker.fgraph.toposort()) <= 12
validate_grad_graph(g)
......@@ -846,7 +937,8 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
raise
# Verify the gradient when providing output gradient
h = theano.function([x,y,a], T.grad(expr, x, g_cost=a*x.sum()), mode=mode)
h = theano.function([x, y, a],
T.grad(expr, x, g_cost=a * x.sum()), mode=mode)
try:
assert 8 <= len(h.maker.fgraph.toposort()) <= 17
validate_grad_graph(h)
......@@ -866,14 +958,13 @@ def test_argmax_pushdown():
fgraph = gof.FunctionGraph(
[x],
[out])
theano.compile.mode.optdb.query(
theano.compile.mode.OPT_FAST_RUN).optimize(fgraph)
#print 'AFTER'
#for node in fgraph.toposort():
#print node.op
assert len(fgraph.toposort()) == 2 # an output_guard is second
assert len(fgraph.toposort()) == 2 # an output_guard is second
assert fgraph.toposort()[0].op == tensor.basic._max_and_argmax
assert str(fgraph.toposort()[1].op) == 'OutputGuard'
x = tensor.dmatrix()
......@@ -910,7 +1001,7 @@ def test_argmax_pushdown_bias():
out = tensor.argmax(softmax_with_bias(x, b), axis=-1)
fgraph = gof.FunctionGraph(
[x,b],
[x, b],
[out])
theano.compile.mode.optdb.query(
......@@ -927,10 +1018,9 @@ def test_argmax_pushdown_bias():
x = tensor.dmatrix()
b = tensor.dvector()
out = tensor.max_and_argmax(softmax_with_bias(x, b), axis=-1)[0]
fgraph = gof.FunctionGraph(
[x,b],
[x, b],
[out])
backup = config.warn.argmax_pushdown_bug
......@@ -950,13 +1040,15 @@ def test_argmax_pushdown_bias():
assert isinstance(fgraph.toposort()[1].op.scalar_op, theano.scalar.Maximum)
assert str(fgraph.toposort()[2].op) == 'OutputGuard'
def test_asymptotic_32():
"""
This test makes sure that our functions behave sensibly when huge values are present
This test makes sure that our functions behave sensibly when
huge values are present
"""
#TODO: consider adding the optimization of crossentropy into the current mode for the
# purpose of running this test
#TODO: consider adding the optimization of crossentropy into the current
# mode for the purpose of running this test
for dtype in 'float32', 'float64':
if dtype == 'float32':
......@@ -967,20 +1059,20 @@ def test_asymptotic_32():
x2 = tensor.dvector()
y = tensor.lvector()
c = categorical_crossentropy(softmax(x+x2), y)
f = theano.function([x,y,x2], [c.sum(), tensor.grad(c.sum(), x)], mode='FAST_RUN')
c = categorical_crossentropy(softmax(x + x2), y)
f = theano.function([x, y, x2], [c.sum(),
tensor.grad(c.sum(), x)], mode='FAST_RUN')
if 0:
for i, n in enumerate( f.maker.fgraph.toposort()):
for i, n in enumerate(f.maker.fgraph.toposort()):
print i, n
xval = numpy.zeros((5, 5), dtype=dtype)
x2val = numpy.zeros(5, dtype=xval.dtype)
for i in xrange(100):
cval, gxval = f(xval, numpy.arange(5), x2val)
cval, gxval = f(xval, numpy.arange(5), x2val)
xval -= 100.3 * gxval
#print cval, gxval
assert cval == 0 # no problem going to zero error
assert cval == 0 # no problem going to zero error
#what about when x gets really big?
......@@ -988,56 +1080,55 @@ def test_asymptotic_32():
x2val = numpy.zeros(5, dtype=xval.dtype)
for i in xrange(100):
cval, gxval = f(xval, numpy.arange(5), x2val)
cval, gxval = f(xval, numpy.arange(5), x2val)
xval += 100000.3 * gxval
#print cval, gxval
assert cval > 61750000
assert gxval[0,0] == -1.0
assert gxval[0,1] == 0.25
assert gxval[0, 0] == -1.0
assert gxval[0, 1] == 0.25
class Test_softmax_opt:
# Test that expressions of softmax in terms of exponentiated things divided by row sums
# are replaced by softmax expressions.
# Test that expressions of softmax in terms of exponentiated things
# divided by row sums are replaced by softmax expressions.
#
# Softmax_grad isn't that interesting as an Op, but it's the signature we look for when
# trying to insert CrossEntropySoftmax... grad. So for now, we add softmax_grad to graphs.
# In future, we may modify the CrossEntropySoftmax...grad to look for the more basic
# pattern.
# Softmax_grad isn't that interesting as an Op, but it has the signature
# we look for when trying to insert CrossEntropySoftmax... grad. So, for
# now, we add softmax_grad to graphs. In the future, we may modify the
# CrossEntropySoftmax...grad to look for the more basic pattern.
#
def setUp(self):
utt.seed_rng()
self.rng = numpy.random.RandomState(utt.fetch_seed())
self.mode=theano.compile.mode.get_default_mode()
self.mode=self.mode.including('canonicalize')
self.mode = theano.compile.mode.get_default_mode()
self.mode = self.mode.including('canonicalize')
def test_basic(self):
c = T.matrix()
p_y = T.exp(c) / T.exp(c).sum(axis=1).dimshuffle(0,'x')
p_y = T.exp(c) / T.exp(c).sum(axis=1).dimshuffle(0, 'x')
# test that function contains softmax and no div.
f = theano.function([c],p_y, mode=self.mode)
f = theano.function([c], p_y, mode=self.mode)
f_ops = [n.op for n in f.maker.fgraph.toposort()]
#print '--- f ='
#printing.debugprint(f)
#print '==='
assert len(f_ops) == 1
assert softmax in f_ops
f(self.rng.rand(3,4).astype(config.floatX))
f(self.rng.rand(3, 4).astype(config.floatX))
def test_grad(self):
c = T.matrix()
p_y = T.exp(c) / T.exp(c).sum(axis=1).dimshuffle(0,'x')
p_y = T.exp(c) / T.exp(c).sum(axis=1).dimshuffle(0, 'x')
# test that function contains softmax and softmaxgrad
w = T.matrix()
backup = config.warn.sum_div_dimshuffle_bug
config.warn.sum_div_dimshuffle_bug = False
try:
g = theano.function([c,w],T.grad((p_y*w).sum(), c))
g = theano.function([c, w], T.grad((p_y * w).sum(), c))
finally:
config.warn.sum_div_dimshuffle_bug = backup
g_ops = [n.op for n in g.maker.fgraph.toposort()]
......@@ -1049,7 +1140,7 @@ class Test_softmax_opt:
assert len(g_ops) == 2
assert softmax in g_ops
assert softmax_grad in g_ops
g(self.rng.rand(3,4), self.rng.uniform(.5, 1, (3,4)))
g(self.rng.rand(3, 4), self.rng.uniform(.5, 1, (3, 4)))
def test_transpose_basic(self):
# this should be a transposed softmax
......@@ -1057,14 +1148,14 @@ class Test_softmax_opt:
p_y = T.exp(c) / T.exp(c).sum(axis=0)
# test that function contains softmax and no div.
f = theano.function([c],p_y)
f = theano.function([c], p_y)
#printing.debugprint(f)
# test that function contains softmax and no div.
backup = config.warn.sum_div_dimshuffle_bug
config.warn.sum_div_dimshuffle_bug = False
try:
g = theano.function([c],T.grad(p_y.sum(), c))
g = theano.function([c], T.grad(p_y.sum(), c))
finally:
config.warn.sum_div_dimshuffle_bug = backup
#printing.debugprint(g)
......@@ -1089,7 +1180,10 @@ class Test_softmax_opt:
#printing.debugprint(g)
raise SkipTest('Optimization not enabled for the moment')
# REPEAT 3 CASES in presence of log(softmax) with the advanced indexing etc.
# REPEAT 3 CASES in presence of log(softmax) with the advanced indexing
# etc.
if __name__ == '__main__':
unittest.main()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论