Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
07deef6b
提交
07deef6b
authored
2月 20, 2012
作者:
Frederic
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
pep8
上级
b7bc0916
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
457 行增加
和
345 行删除
+457
-345
test_basic_ops.py
theano/sandbox/cuda/tests/test_basic_ops.py
+457
-345
没有找到文件。
theano/sandbox/cuda/tests/test_basic_ops.py
浏览文件 @
07deef6b
...
...
@@ -19,23 +19,28 @@ import theano.sandbox.cuda.basic_ops as B
from
theano.tensor.basic
import
_allclose
from
theano.tests
import
unittest_tools
as
utt
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpu'
)
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpu'
)
def
rand_cuda_ndarray
(
shape
):
return
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
return
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
#intentionally disabled
def
tes_use
():
tcn
.
use
()
def
test_sum
():
"""
test sum pattern 1, 11, 10, 01, 001, 010, 100, 110, 011, 111, 0011, 0101, 0111, 1011, 1111
test sum pattern 1, 11, 10, 01, 001, 010, 100, 110, 011, 111,
0011, 0101, 0111, 1011, 1111
test sum pattern implemented with reshape:
1000, 0100, 0010, 0001, 11111
...
...
@@ -91,18 +96,18 @@ def test_sum():
((
1100
,
2
,
3
,
4
,
5
),[
0
,
1
,
2
,
3
,
4
]),((
2
,
1100
,
3
,
4
,
5
),[
0
,
1
,
2
,
3
,
4
]),((
2
,
3
,
1100
,
4
,
5
),[
0
,
1
,
2
,
3
,
4
]),((
2
,
3
,
4
,
1100
,
5
),[
0
,
1
,
2
,
3
,
4
]),((
2
,
3
,
4
,
5
,
1100
),[
0
,
1
,
2
,
3
,
4
]),
#11111
]:
a
=
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
len
(
shape
))()
a
=
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
len
(
shape
))()
b
=
T
.
Sum
(
pattern
)(
a
)
val
=
numpy
.
random
.
rand
(
numpy
.
prod
(
shape
))
.
reshape
(
shape
)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val
=
theano
.
_asarray
(
val
,
dtype
=
'float32'
)
f
=
theano
.
function
([
a
],
b
,
mode
=
mode_with_gpu
)
f2
=
theano
.
function
([
a
],
b
,
mode
=
mode_without_gpu
)
val
=
theano
.
_asarray
(
val
,
dtype
=
'float32'
)
f
=
theano
.
function
([
a
],
b
,
mode
=
mode_with_gpu
)
f2
=
theano
.
function
([
a
],
b
,
mode
=
mode_without_gpu
)
assert
tcn
.
GpuSum
in
[
x
.
op
.
__class__
for
x
in
f
.
maker
.
env
.
toposort
()]
assert
T
.
Sum
in
[
x
.
op
.
__class__
for
x
in
f2
.
maker
.
env
.
toposort
()]
if
val
.
size
==
0
:
assert
f2
(
val
)
==
f
(
val
),
(
'shape'
,
shape
,
'pattern'
,
pattern
)
if
val
.
size
==
0
:
assert
f2
(
val
)
==
f
(
val
),
(
'shape'
,
shape
,
'pattern'
,
pattern
)
else
:
try
:
#We raise the error threashold as we sum big matrix
...
...
@@ -110,7 +115,9 @@ def test_sum():
#example in debug mode with unittests.rseed=9275
orig_rtol
=
theano
.
tensor
.
basic
.
float32_rtol
theano
.
tensor
.
basic
.
float32_rtol
=
2e-5
assert
_allclose
(
f2
(
val
),
f
(
val
)),
(
'shape'
,
shape
,
'pattern'
,
pattern
,
sum
([
shape
[
i
]
for
i
in
pattern
]))
assert
_allclose
(
f2
(
val
),
f
(
val
)),
(
'shape'
,
shape
,
'pattern'
,
pattern
,
sum
([
shape
[
i
]
for
i
in
pattern
]))
finally
:
theano
.
tensor
.
basic
.
float32_rtol
=
orig_rtol
...
...
@@ -121,21 +128,23 @@ def test_sum():
((
5
,
4
),[
0
,
1
]),((
5
,
4
),[
0
]),
((
5
,
4
,
3
),[
0
]),((
5
,
4
,
3
),[
0
,
1
]),((
5
,
4
,
3
),[
2
]),((
5
,
4
,
3
),[
0
,
1
,
2
]),
((
5
,
4
,
3
,
2
),[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
2
),[
0
,
2
,
3
])]:
a
=
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
len
(
shape
))()
a
=
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
len
(
shape
))()
dim_pattern
=
range
(
len
(
shape
))
dim_pattern
[
0
]
=
1
dim_pattern
[
1
]
=
0
dim_pattern
[
0
]
=
1
dim_pattern
[
1
]
=
0
a
=
a
.
dimshuffle
(
dim_pattern
)
b
=
T
.
Sum
(
pattern
)(
a
)
val
=
numpy
.
random
.
rand
(
numpy
.
prod
(
shape
))
.
reshape
(
shape
)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val
=
theano
.
_asarray
(
val
,
dtype
=
'float32'
)
f
=
theano
.
function
([
a
],
b
,
mode
=
mode_with_gpu
)
f2
=
theano
.
function
([
a
],
b
,
mode
=
mode_without_gpu
)
val
=
theano
.
_asarray
(
val
,
dtype
=
'float32'
)
f
=
theano
.
function
([
a
],
b
,
mode
=
mode_with_gpu
)
f2
=
theano
.
function
([
a
],
b
,
mode
=
mode_without_gpu
)
assert
tcn
.
GpuSum
in
[
x
.
op
.
__class__
for
x
in
f
.
maker
.
env
.
toposort
()]
assert
T
.
Sum
in
[
x
.
op
.
__class__
for
x
in
f2
.
maker
.
env
.
toposort
()]
assert
_allclose
(
f2
(
val
),
f
(
val
)),
(
'shape'
,
shape
,
'pattern'
,
pattern
,
sum
([
shape
[
i
]
for
i
in
pattern
]))
assert
_allclose
(
f2
(
val
),
f
(
val
)),
(
'shape'
,
shape
,
'pattern'
,
pattern
,
sum
([
shape
[
i
]
for
i
in
pattern
]))
#test with broadcast
...
...
@@ -143,69 +152,77 @@ def test_sum():
((
5
,
4
),[
0
,
1
]),((
5
,
4
),[
0
]),
((
5
,
4
,
3
),[
0
]),((
5
,
4
,
3
),[
0
,
1
]),((
5
,
4
,
3
),[
2
]),((
5
,
4
,
3
),[
0
,
1
,
2
]),
((
5
,
4
,
3
,
2
),[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
2
),[
0
,
2
,
3
])]:
shape
=
numpy
.
asarray
(
shape
)
*
2
a
=
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
len
(
shape
))()
a2
=
tcn
.
CudaNdarrayType
((
False
,)
*
len
(
shape
))()
shape
=
numpy
.
asarray
(
shape
)
*
2
a
=
tensor
.
TensorType
(
'float32'
,
(
False
,)
*
len
(
shape
))()
a2
=
tcn
.
CudaNdarrayType
((
False
,)
*
len
(
shape
))()
b
=
T
.
Sum
(
pattern
)(
a
)
b2
=
T
.
Sum
(
pattern
)(
a2
)
val
=
numpy
.
random
.
rand
(
numpy
.
prod
(
shape
))
.
reshape
(
shape
)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val
=
theano
.
_asarray
(
val
,
dtype
=
'float32'
)
val
=
theano
.
_asarray
(
val
,
dtype
=
'float32'
)
val2
=
cuda
.
CudaNdarray
(
val
)
if
len
(
shape
)
==
1
:
if
len
(
shape
)
==
1
:
val
=
val
[::
2
]
val2
=
val2
[::
2
]
elif
len
(
shape
)
==
2
:
val
=
val
[::
2
,::
2
]
val2
=
val2
[::
2
,::
2
]
elif
len
(
shape
)
==
3
:
val
=
val
[::
2
,
::
2
,
::
2
]
val2
=
val2
[::
2
,
::
2
,
::
2
]
elif
len
(
shape
)
==
4
:
val
=
val
[::
2
,
::
2
,::
2
,
::
2
]
val2
=
val2
[::
2
,
::
2
,::
2
,
::
2
]
f
=
theano
.
function
([
a
],
b
,
mode
=
mode_without_gpu
)
f2
=
theano
.
function
([
a2
],
b2
,
mode
=
mode_with_gpu
)
elif
len
(
shape
)
==
2
:
val
=
val
[::
2
,
::
2
]
val2
=
val2
[::
2
,
::
2
]
elif
len
(
shape
)
==
3
:
val
=
val
[::
2
,
::
2
,
::
2
]
val2
=
val2
[::
2
,
::
2
,
::
2
]
elif
len
(
shape
)
==
4
:
val
=
val
[::
2
,
::
2
,
::
2
,
::
2
]
val2
=
val2
[::
2
,
::
2
,
::
2
,
::
2
]
f
=
theano
.
function
([
a
],
b
,
mode
=
mode_without_gpu
)
f2
=
theano
.
function
([
a2
],
b2
,
mode
=
mode_with_gpu
)
assert
tcn
.
GpuSum
in
[
x
.
op
.
__class__
for
x
in
f2
.
maker
.
env
.
toposort
()]
assert
T
.
Sum
in
[
x
.
op
.
__class__
for
x
in
f
.
maker
.
env
.
toposort
()]
assert
_allclose
(
f2
(
val2
),
f
(
val
)),
(
'shape'
,
shape
,
'pattern'
,
pattern
,
sum
([
shape
[
i
]
for
i
in
pattern
]))
assert
_allclose
(
f2
(
val2
),
f
(
val
)),
(
'shape'
,
shape
,
'pattern'
,
pattern
,
sum
([
shape
[
i
]
for
i
in
pattern
]))
def
test_flatten
():
x
=
cuda
.
fmatrix
(
'x'
)
f
=
theano
.
function
([
x
],
x
.
flatten
())
assert
len
(
f
(
[[
0.
,
0.
],[
0.
,
0.
]]
)
.
shape
)
==
1
assert
len
(
f
([[
0.
,
0.
],
[
0.
,
0.
]])
.
shape
)
==
1
def
test_reshape
():
a
=
tcn
.
CudaNdarrayType
((
False
,))()
b
=
tcn
.
CudaNdarrayType
((
False
,
False
))()
c
=
T
.
reshape
(
a
,
[
2
,
3
])
b
=
tcn
.
CudaNdarrayType
((
False
,
False
))()
c
=
T
.
reshape
(
a
,
[
2
,
3
])
#basic
f
=
theano
.
function
([
a
],
c
,
mode
=
mode_with_gpu
)
fv
=
f
(
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
([
0
,
1
,
2
,
3
,
4
,
5
],
dtype
=
'float32'
)))
fv
=
f
(
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
([
0
,
1
,
2
,
3
,
4
,
5
],
dtype
=
'float32'
)))
topo
=
f
.
maker
.
env
.
toposort
()
assert
any
([
isinstance
(
node
.
op
,
B
.
GpuReshape
)
for
node
in
topo
])
assert
numpy
.
all
(
fv
==
numpy
.
asarray
([[
0
,
1
,
2
],
[
3
,
4
,
5
]]))
assert
numpy
.
all
(
fv
==
numpy
.
asarray
([[
0
,
1
,
2
],
[
3
,
4
,
5
]]))
#test that it works without inplace operations
a_val
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
([
0
,
1
,
2
,
3
,
4
,
5
],
dtype
=
'float32'
))
a_val_copy
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
([
0
,
1
,
2
,
3
,
4
,
5
],
dtype
=
'float32'
))
b_val
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
([[
0
,
1
,
2
],[
3
,
4
,
5
]],
dtype
=
'float32'
))
f_sub
=
theano
.
function
([
a
,
b
],
c
-
b
,
mode
=
mode_with_gpu
)
a_val
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
([
0
,
1
,
2
,
3
,
4
,
5
],
dtype
=
'float32'
))
a_val_copy
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
([
0
,
1
,
2
,
3
,
4
,
5
],
dtype
=
'float32'
))
b_val
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
([[
0
,
1
,
2
],
[
3
,
4
,
5
]],
dtype
=
'float32'
))
f_sub
=
theano
.
function
([
a
,
b
],
c
-
b
,
mode
=
mode_with_gpu
)
topo
=
f_sub
.
maker
.
env
.
toposort
()
assert
any
([
isinstance
(
node
.
op
,
B
.
GpuReshape
)
for
node
in
topo
])
assert
numpy
.
all
(
f_sub
(
a_val
,
b_val
)
==
0.0
)
assert
numpy
.
all
(
numpy
.
asarray
(
a_val
)
==
numpy
.
asarray
(
a_val_copy
))
#test that it works with inplace operations
a_val
=
theano
.
_asarray
([
0
,
1
,
2
,
3
,
4
,
5
],
dtype
=
'float32'
)
a_val_copy
=
theano
.
_asarray
([
0
,
1
,
2
,
3
,
4
,
5
],
dtype
=
'float32'
)
b_val
=
theano
.
_asarray
([[
0
,
1
,
2
],[
3
,
4
,
5
]],
dtype
=
'float32'
)
a_val
=
theano
.
_asarray
([
0
,
1
,
2
,
3
,
4
,
5
],
dtype
=
'float32'
)
a_val_copy
=
theano
.
_asarray
([
0
,
1
,
2
,
3
,
4
,
5
],
dtype
=
'float32'
)
b_val
=
theano
.
_asarray
([[
0
,
1
,
2
],
[
3
,
4
,
5
]],
dtype
=
'float32'
)
f_sub
=
theano
.
function
([
a
,
b
],
c
-
b
,
mode
=
mode_with_gpu
)
f_sub
=
theano
.
function
([
a
,
b
],
c
-
b
,
mode
=
mode_with_gpu
)
topo
=
f_sub
.
maker
.
env
.
toposort
()
assert
any
([
isinstance
(
node
.
op
,
B
.
GpuReshape
)
for
node
in
topo
])
assert
numpy
.
all
(
f_sub
(
a_val
,
b_val
)
==
0.0
)
...
...
@@ -213,52 +230,57 @@ def test_reshape():
# verify gradient
def
just_vals
(
v
):
return
T
.
Reshape
(
2
)(
v
,
theano
.
_asarray
([
2
,
3
],
dtype
=
'int32'
))
return
T
.
Reshape
(
2
)(
v
,
theano
.
_asarray
([
2
,
3
],
dtype
=
'int32'
))
utt
.
verify_grad
(
just_vals
,
[
a_val
])
def
test_elemwise_empty
():
#test with 0 element
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
0
,
0
),
dtype
=
'float32'
),
'a'
)
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
0
,
0
),
dtype
=
'float32'
),
'a'
)
b
=
tensor
.
fmatrix
()
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
)],
mode
=
mode_with_gpu
)
f2
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
)],
mode
=
mode_without_gpu
)
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
)],
mode
=
mode_with_gpu
)
f2
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
)],
mode
=
mode_without_gpu
)
a0
=
a
.
get_value
()
*
1.0
f
(
numpy
.
ones
((
0
,
0
),
dtype
=
'float32'
))
f
(
numpy
.
ones
((
0
,
0
),
dtype
=
'float32'
))
assert
numpy
.
all
(
a0
+
1.0
==
a
.
get_value
())
def
test_elemwise0
():
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
4
,
4
),
dtype
=
'float32'
),
'a'
)
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
4
,
4
),
dtype
=
'float32'
),
'a'
)
b
=
tensor
.
fmatrix
()
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
)],
mode
=
mode_with_gpu
)
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
)],
mode
=
mode_with_gpu
)
#check that we work inplace.
assert
f
.
maker
.
env
.
toposort
()[
1
]
.
op
.
destroy_map
.
items
()
==
[(
0
,
[
0
])]
assert
f
.
maker
.
env
.
toposort
()[
1
]
.
op
.
destroy_map
.
items
()
==
[(
0
,
[
0
])]
a0
=
a
.
get_value
()
*
1.0
print
'BEFORE ADD'
,
a
.
get_value
()
for
i
,
node
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
i
,
node
f
(
numpy
.
ones
((
4
,
4
),
dtype
=
'float32'
))
f
(
numpy
.
ones
((
4
,
4
),
dtype
=
'float32'
))
print
'AFTER ADD'
,
a
.
get_value
()
assert
numpy
.
all
(
a0
+
1.0
==
a
.
get_value
())
def
test_elemwise_bad_broadcast
():
x
=
cuda
.
fmatrix
(
'x'
)
y
=
cuda
.
fmatrix
(
'y'
)
f
=
theano
.
function
([
x
,
y
],
x
*
y
,
mode
=
mode_with_gpu
)
print
f
.
maker
.
env
.
toposort
()
assert
len
(
f
.
maker
.
env
.
toposort
())
==
2
assert
len
(
f
.
maker
.
env
.
toposort
())
==
2
assert
isinstance
(
f
.
maker
.
env
.
toposort
()[
0
]
.
op
,
cuda
.
GpuElemwise
)
assert
f
.
maker
.
env
.
toposort
()[
1
]
.
op
==
cuda
.
host_from_gpu
assert
f
.
maker
.
env
.
toposort
()[
1
]
.
op
==
cuda
.
host_from_gpu
try
:
f
(
rand_cuda_ndarray
((
10
,
3
)),
rand_cuda_ndarray
((
10
,
1
)))
...
...
@@ -267,41 +289,48 @@ def test_elemwise_bad_broadcast():
else
:
raise
Exception
(
"Theano should have raised an error"
)
def
test_elemwise1
():
""" Several kinds of elemwise expressions with no broadcasting, non power-of-two shape """
""" Several kinds of elemwise expressions with no broadcasting,
non power-of-two shape """
shape
=
(
3
,
4
)
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
+
0.5
,
'a'
)
shape
=
(
3
,
4
)
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
+
0.5
,
'a'
)
b
=
tensor
.
fmatrix
()
#let debugmode catch any mistakes
print
>>
sys
.
stdout
,
"STARTING FUNCTION 1"
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
b
**
a
)],
mode
=
mode_with_gpu
)
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
b
**
a
)],
mode
=
mode_with_gpu
)
for
i
,
node
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
i
,
node
f
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
+
0.3
)
f
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
+
0.3
)
print
>>
sys
.
stdout
,
"STARTING FUNCTION 2"
#let debugmode catch any mistakes
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
tensor
.
exp
(
b
**
a
))],
mode
=
mode_with_gpu
)
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
tensor
.
exp
(
b
**
a
))],
mode
=
mode_with_gpu
)
for
i
,
node
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
i
,
node
f
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
+
0.3
)
f
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
+
0.3
)
print
>>
sys
.
stdout
,
"STARTING FUNCTION 3"
#let debugmode catch any mistakes
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
*
tensor
.
exp
(
b
**
a
))],
mode
=
mode_with_gpu
)
f
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
+
0.3
)
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
a
+
b
*
tensor
.
exp
(
b
**
a
))],
mode
=
mode_with_gpu
)
f
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
+
0.3
)
def
test_elemwise2
():
""" Several kinds of elemwise expressions with dimension permutations """
rng
=
numpy
.
random
.
RandomState
(
int
(
time
.
time
()))
print
'random?'
,
rng
.
rand
(
3
)
shape
=
(
3
,
5
)
for
pattern
in
[(
0
,
1
),
(
1
,
0
)]:
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
rng
.
rand
(
*
shape
),
dtype
=
'float32'
),
name
=
None
)
b
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
[
0
]
*
len
(
shape
))()
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
(
a
+
b
)
.
dimshuffle
(
pattern
))],
mode
=
mode_with_gpu
)
shape
=
(
3
,
5
)
for
pattern
in
[(
0
,
1
),
(
1
,
0
)]:
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
rng
.
rand
(
*
shape
),
dtype
=
'float32'
),
name
=
None
)
b
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
[
0
]
*
len
(
shape
))()
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
(
a
+
b
)
.
dimshuffle
(
pattern
))],
mode
=
mode_with_gpu
)
has_elemwise
=
False
for
i
,
node
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
>>
sys
.
stdout
,
i
,
node
...
...
@@ -309,34 +338,39 @@ def test_elemwise2():
assert
not
has_elemwise
#let debugmode catch errors
print
>>
sys
.
stdout
,
'pattern'
,
pattern
f
(
theano
.
_asarray
(
rng
.
rand
(
*
shape
),
dtype
=
'float32'
)
*.
3
)
shape
=
(
3
,
4
,
5
,
6
)
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
rng
.
rand
(
*
shape
),
dtype
=
'float32'
),
'a'
)
b
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
[
0
]
*
len
(
shape
))()
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
(
a
+
b
)
.
dimshuffle
([
2
,
0
,
3
,
1
])
*
tensor
.
exp
(
b
**
a
)
.
dimshuffle
([
2
,
0
,
3
,
1
]))],
mode
=
mode_with_gpu
)
f
(
theano
.
_asarray
(
rng
.
rand
(
*
shape
),
dtype
=
'float32'
)
*
.
3
)
shape
=
(
3
,
4
,
5
,
6
)
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
rng
.
rand
(
*
shape
),
dtype
=
'float32'
),
'a'
)
b
=
tensor
.
Tensor
(
dtype
=
'float32'
,
broadcastable
=
[
0
]
*
len
(
shape
))()
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
(
a
+
b
)
.
dimshuffle
([
2
,
0
,
3
,
1
])
*
tensor
.
exp
(
b
**
a
)
.
dimshuffle
([
2
,
0
,
3
,
1
]))],
mode
=
mode_with_gpu
)
has_elemwise
=
False
for
i
,
node
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
i
,
node
has_elemwise
=
has_elemwise
or
isinstance
(
node
.
op
,
tensor
.
Elemwise
)
assert
not
has_elemwise
#let debugmode catch errors
f
(
theano
.
_asarray
(
rng
.
rand
(
*
shape
),
dtype
=
'float32'
))
f
(
theano
.
_asarray
(
rng
.
rand
(
*
shape
),
dtype
=
'float32'
))
def
test_elemwise3
():
""" Several kinds of elemwise expressions with dimension permutations and broadcasting"""
""" Several kinds of elemwise expressions with dimension
permutations and broadcasting"""
shape
=
(
3
,
4
,
5
,
6
)
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
),
'a'
)
shape
=
(
3
,
4
,
5
,
6
)
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
),
'a'
)
b
=
tensor
.
fvector
()
print
b
.
type
print
tensor
.
constant
(
1
)
.
type
print
(
1
+
b
)
.
type
print
(
1
+
b
**
a
)
.
type
print
tensor
.
exp
((
1
+
b
**
a
))
.
type
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
(
a
+
b
)
.
dimshuffle
([
2
,
0
,
3
,
1
])
*
tensor
.
exp
(
1
+
b
**
a
)
.
dimshuffle
([
2
,
0
,
3
,
1
]))],
mode
=
mode_with_gpu
)
print
(
1
+
b
**
a
)
.
type
print
tensor
.
exp
((
1
+
b
**
a
))
.
type
new_val
=
(
a
+
b
)
.
dimshuffle
([
2
,
0
,
3
,
1
])
new_val
*=
tensor
.
exp
(
1
+
b
**
a
)
.
dimshuffle
([
2
,
0
,
3
,
1
])
f
=
pfunc
([
b
],
[],
updates
=
[(
a
,
new_val
)],
mode
=
mode_with_gpu
)
has_elemwise
=
False
for
i
,
node
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
>>
sys
.
stdout
,
i
,
node
...
...
@@ -345,75 +379,86 @@ def test_elemwise3():
#let debugmode catch errors
f
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
6
),
dtype
=
'float32'
))
def
test_elemwise4
():
""" Test that two vectors can be broadcast to form an outer product (by performing rank-1 matrix update"""
""" Test that two vectors can be broadcast to form an outer
product (by performing rank-1 matrix update"""
shape
=
(
3
,
4
)
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
),
'a'
)
shape
=
(
3
,
4
)
a
=
tcn
.
shared_constructor
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
),
'a'
)
b
=
tensor
.
fvector
()
c
=
tensor
.
fvector
()
f
=
pfunc
([
b
,
c
],
[],
updates
=
[(
a
,
(
a
+
b
.
dimshuffle
(
'x'
,
0
)
*
c
.
dimshuffle
(
0
,
'x'
)))],
mode
=
mode_with_gpu
)
f
=
pfunc
([
b
,
c
],
[],
updates
=
[(
a
,
(
a
+
b
.
dimshuffle
(
'x'
,
0
)
*
c
.
dimshuffle
(
0
,
'x'
)))],
mode
=
mode_with_gpu
)
has_elemwise
=
False
for
i
,
node
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
>>
sys
.
stdout
,
i
,
node
has_elemwise
=
has_elemwise
or
isinstance
(
node
.
op
,
tensor
.
Elemwise
)
assert
not
has_elemwise
#let debugmode catch errors
f
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
4
),
dtype
=
'float32'
),
theano
.
_asarray
(
numpy
.
random
.
rand
(
3
),
dtype
=
'float32'
))
f
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
4
),
dtype
=
'float32'
),
theano
.
_asarray
(
numpy
.
random
.
rand
(
3
),
dtype
=
'float32'
))
def
test_elemwise_comparaison_cast
():
"""
test if an elemwise comparaison followed by a cast to float32 are pushed to gpu.
test if an elemwise comparaison followed by a cast to float32 are
pushed to gpu.
"""
a
=
tensor
.
fmatrix
()
b
=
tensor
.
fmatrix
()
av
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
4
,
4
),
dtype
=
'float32'
)
bv
=
numpy
.
ones
((
4
,
4
),
dtype
=
'float32'
)
av
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
4
,
4
),
dtype
=
'float32'
)
bv
=
numpy
.
ones
((
4
,
4
),
dtype
=
'float32'
)
for
g
,
ans
in
[(
tensor
.
lt
,
av
<
bv
),
(
tensor
.
gt
,
av
>
bv
),
(
tensor
.
le
,
av
<=
bv
),
(
tensor
.
ge
,
av
>=
bv
)]:
for
g
,
ans
in
[(
tensor
.
lt
,
av
<
bv
),
(
tensor
.
gt
,
av
>
bv
),
(
tensor
.
le
,
av
<=
bv
),
(
tensor
.
ge
,
av
>=
bv
)]:
f
=
pfunc
([
a
,
b
],
tensor
.
cast
(
g
(
a
,
b
),
'float32'
),
mode
=
mode_with_gpu
)
f
=
pfunc
([
a
,
b
],
tensor
.
cast
(
g
(
a
,
b
),
'float32'
),
mode
=
mode_with_gpu
)
#theano.printing.debugprint(f)
out
=
f
(
av
,
bv
)
out
=
f
(
av
,
bv
)
assert
numpy
.
all
(
out
==
ans
)
assert
any
([
isinstance
(
node
.
op
,
cuda
.
GpuElemwise
)
for
node
in
f
.
maker
.
env
.
toposort
()])
#assert any([isinstance(node.op, tensor.Elemwise) for node in f.maker.env.toposort()])
assert
any
([
isinstance
(
node
.
op
,
cuda
.
GpuElemwise
)
for
node
in
f
.
maker
.
env
.
toposort
()])
def
test_elemwise_composite_float64
():
# test that we don't fuse composite elemwise with float64 somewhere inside
# nvcc by default downcast them to float32. We would need to tell him not
to
# do so, but that possible only on some device.
# nvcc by default downcast them to float32. We would need to tell him not
#
to
do so, but that possible only on some device.
a
=
tensor
.
fmatrix
()
b
=
tensor
.
fmatrix
()
av
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
4
,
4
),
dtype
=
'float32'
)
bv
=
numpy
.
ones
((
4
,
4
),
dtype
=
'float32'
)
av
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
4
,
4
),
dtype
=
'float32'
)
bv
=
numpy
.
ones
((
4
,
4
),
dtype
=
'float32'
)
def
get_all_basic_scalar
(
composite_op
):
l
=
[]
l
=
[]
for
i
in
composite_op
.
env
.
toposort
():
if
isinstance
(
i
,
theano
.
scalar
.
Composite
):
l
+=
get_all_basic_scalar
(
i
)
else
:
l
.
append
(
i
)
return
l
for
mode
in
[
mode_with_gpu
,
mode_with_gpu
.
excluding
(
'gpu_after_fusion'
),
mode_with_gpu
.
excluding
(
'elemwise_fusion'
)]:
f
=
pfunc
([
a
,
b
],
tensor
.
cast
(
tensor
.
lt
(
tensor
.
cast
(
a
,
'float64'
)
**
2
,
#*numpy.asarray(2, 'float32'),
for
mode
in
[
mode_with_gpu
,
mode_with_gpu
.
excluding
(
'gpu_after_fusion'
),
mode_with_gpu
.
excluding
(
'elemwise_fusion'
)]:
f
=
pfunc
([
a
,
b
],
tensor
.
cast
(
tensor
.
lt
(
tensor
.
cast
(
a
,
'float64'
)
**
2
,
b
),
'float32'
),
mode
=
mode
)
#theano.printing.debugprint(f, print_type=True)
out
=
f
(
av
,
bv
)
assert
numpy
.
all
(
out
==
((
av
**
2
)
<
bv
))
out
=
f
(
av
,
bv
)
assert
numpy
.
all
(
out
==
((
av
**
2
)
<
bv
))
for
node
in
f
.
maker
.
env
.
toposort
():
if
isinstance
(
node
.
op
,
cuda
.
GpuElemwise
):
if
isinstance
(
node
.
op
.
scalar_op
,
theano
.
scalar
.
Composite
):
scals
=
get_all_basic_scalar
(
node
.
op
.
scalar_op
)
for
s
in
scals
:
assert
not
any
([
i
.
type
.
dtype
==
'float64'
for
i
in
s
.
inputs
+
s
.
outputs
])
assert
not
any
([
i
.
type
.
dtype
==
'float64'
for
i
in
s
.
inputs
+
s
.
outputs
])
def
test_elemwise_composite_support_code
():
...
...
@@ -449,205 +494,226 @@ def test_elemwise_composite_support_code():
def
speed_elemwise_collapse
():
""" used to time if the collapse of ccontiguous dims are useful """
shape
=
(
30
,
40
,
50
,
600
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
shape
=
(
30
,
40
,
50
,
600
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a2
=
tcn
.
shared_constructor
(
a
,
'a'
)
a3
=
a2
[:,
::
2
,:,
:]
a3
=
a2
[:,
::
2
,
:,
:]
b
=
tcn
.
CudaNdarrayType
((
False
,
False
,
False
,
False
))()
c
=
a3
+
b
*
tensor
.
exp
(
1
+
b
**
a3
)
c
=
a3
+
b
*
tensor
.
exp
(
1
+
b
**
a3
)
f
=
pfunc
([
b
],
[
c
],
mode
=
mode_with_gpu
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
v
=
v
[:,::
2
,:,:]
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
v
=
v
[:,
::
2
,
:,
:]
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
id
,
n
t1
=
time
.
time
()
t1
=
time
.
time
()
for
i
in
range
(
100
):
#let debugmode catch errors
f
(
v
)
t2
=
time
.
time
()
t2
=
time
.
time
()
def
speed_elemwise_collapse2
():
""" used to test the speed up of the generalised collapse of ccontiguous dims"""
""" used to test the speed up of the generalised collapse of
ccontiguous dims"""
shape
=
(
30
,
40
,
50
,
600
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
shape
=
(
30
,
40
,
50
,
600
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a2
=
tcn
.
shared_constructor
(
a
,
'a'
)
a3
=
a2
[:,
:,:,
::
2
]
a3
=
a2
[:,
:,
:,
::
2
]
b
=
tcn
.
CudaNdarrayType
((
False
,
False
,
False
,
False
))()
c
=
a3
+
b
*
tensor
.
exp
(
1
+
b
**
a3
)
c
=
a3
+
b
*
tensor
.
exp
(
1
+
b
**
a3
)
f
=
pfunc
([
b
],
[
c
],
mode
=
mode_with_gpu
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
v
=
v
[:,:,:,::
2
]
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
v
=
v
[:,
:,
:,
::
2
]
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
id
,
n
t1
=
time
.
time
()
t1
=
time
.
time
()
for
i
in
range
(
100
):
#let debugmode catch errors
f
(
v
)
t2
=
time
.
time
()
t2
=
time
.
time
()
def
test_elemwise_collapse
():
""" Test when all inputs have one(and the same) broadcastable dimension """
shape
=
(
4
,
5
,
60
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
shape
=
(
4
,
5
,
60
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a2
=
tcn
.
shared_constructor
(
a
,
'a'
)
a3
=
a2
.
dimshuffle
(
0
,
'x'
,
1
,
2
)
a3
=
a2
.
dimshuffle
(
0
,
'x'
,
1
,
2
)
b
=
tcn
.
CudaNdarrayType
((
False
,
True
,
False
,
False
))()
c
=
a3
+
b
c
=
a3
+
b
f
=
pfunc
([
b
],
[
c
],
mode
=
mode_with_gpu
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
shape
[
0
],
1
,
*
shape
[
1
:]),
dtype
=
'float32'
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
shape
[
0
],
1
,
*
shape
[
1
:]),
dtype
=
'float32'
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
if
False
:
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
id
,
n
#let debugmode catch errors
out
=
f
(
v
)[
0
]
assert
numpy
.
allclose
(
out
,
a
.
reshape
(
shape
[
0
],
1
,
*
shape
[
1
:])
+
v
)
out
=
f
(
v
)[
0
]
assert
numpy
.
allclose
(
out
,
a
.
reshape
(
shape
[
0
],
1
,
*
shape
[
1
:])
+
v
)
print
"Expected collapse of all dimensions"
def
test_elemwise_collapse2
():
""" Test when only one inputs have one broadcastable dimension """
shape
=
(
4
,
5
,
9
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
shape
=
(
4
,
5
,
9
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a2
=
tcn
.
shared_constructor
(
a
,
'a'
)
a3
=
a2
.
dimshuffle
(
0
,
'x'
,
1
,
2
)
a3
=
a2
.
dimshuffle
(
0
,
'x'
,
1
,
2
)
b
=
tcn
.
CudaNdarrayType
((
False
,
False
,
False
,
False
))()
c
=
a3
+
b
c
=
a3
+
b
f
=
pfunc
([
b
],
[
c
],
mode
=
mode_with_gpu
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
shape
[
0
],
5
,
*
shape
[
1
:]),
dtype
=
'float32'
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
shape
[
0
],
5
,
*
shape
[
1
:]),
dtype
=
'float32'
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
if
False
:
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
id
,
n
#let debugmode catch errors
out
=
f
(
v
)[
0
]
assert
numpy
.
allclose
(
out
,
a
.
reshape
(
shape
[
0
],
1
,
*
shape
[
1
:])
+
v
)
out
=
f
(
v
)[
0
]
assert
numpy
.
allclose
(
out
,
a
.
reshape
(
shape
[
0
],
1
,
*
shape
[
1
:])
+
v
)
print
"Expected collapse to 3 dimensions"
def
test_elemwise_collapse3
():
""" Test when only one inputs have two broadcastable dimension at each ends """
shape
=
(
4
,
5
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
shape
=
(
4
,
5
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a2
=
tcn
.
shared_constructor
(
a
,
'a'
)
a3
=
a2
.
dimshuffle
(
'x'
,
0
,
1
,
'x'
)
a3
=
a2
.
dimshuffle
(
'x'
,
0
,
1
,
'x'
)
b
=
tcn
.
CudaNdarrayType
((
False
,
False
,
False
,
False
))()
c
=
(
a3
+
b
)
c
=
(
a3
+
b
)
f
=
pfunc
([
b
],
[
c
],
mode
=
mode_with_gpu
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
5
,
shape
[
0
],
shape
[
1
],
4
),
dtype
=
'float32'
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
5
,
shape
[
0
],
shape
[
1
],
4
),
dtype
=
'float32'
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
if
False
:
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
id
,
n
#let debugmode catch errors
out
=
f
(
v
)[
0
]
assert
numpy
.
allclose
(
out
,
a
.
reshape
(
1
,
shape
[
0
],
shape
[
1
],
1
)
+
v
)
out
=
f
(
v
)[
0
]
assert
numpy
.
allclose
(
out
,
a
.
reshape
(
1
,
shape
[
0
],
shape
[
1
],
1
)
+
v
)
print
"Expected collapse to 3 dimensions"
def
test_elemwise_collapse4
():
""" Test when only one inputs have two broadcastable dimension at each ends and we add a scalar"""
""" Test when only one inputs have two broadcastable dimension at
each ends and we add a scalar"""
shape
=
(
4
,
5
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
shape
=
(
4
,
5
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a2
=
tcn
.
shared_constructor
(
a
,
'a'
)
a3
=
a2
.
dimshuffle
(
'x'
,
0
,
1
,
'x'
)
a3
=
a2
.
dimshuffle
(
'x'
,
0
,
1
,
'x'
)
b
=
tcn
.
CudaNdarrayType
((
False
,
False
,
False
,
False
))()
c
=
(
a3
+
b
+
2
)
c
=
(
a3
+
b
+
2
)
f
=
pfunc
([
b
],
[
c
],
mode
=
mode_with_gpu
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
5
,
shape
[
0
],
shape
[
1
],
4
),
dtype
=
'float32'
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
5
,
shape
[
0
],
shape
[
1
],
4
),
dtype
=
'float32'
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
if
False
:
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
id
,
n
#let debugmode catch errors
out
=
f
(
v
)[
0
]
assert
numpy
.
allclose
(
out
,
a
.
reshape
(
1
,
shape
[
0
],
shape
[
1
],
1
)
+
v
+
2
)
out
=
f
(
v
)[
0
]
assert
numpy
.
allclose
(
out
,
a
.
reshape
(
1
,
shape
[
0
],
shape
[
1
],
1
)
+
v
+
2
)
print
"Expected collapse to 3 dimensions"
def
test_elemwise_collapse5
():
""" Test when only one inputs have two broadcastable dimension at the beginning and we add a scalar"""
""" Test when only one inputs have two broadcastable dimension at
the beginning and we add a scalar"""
shape
=
(
4
,
5
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
shape
=
(
4
,
5
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a2
=
tcn
.
shared_constructor
(
a
,
'a'
)
a3
=
a2
.
dimshuffle
(
'x'
,
'x'
,
0
,
1
)
a3
=
a2
.
dimshuffle
(
'x'
,
'x'
,
0
,
1
)
b
=
tcn
.
CudaNdarrayType
((
False
,
False
,
False
,
False
))()
c
=
(
a3
+
b
+
2
)
c
=
(
a3
+
b
+
2
)
f
=
pfunc
([
b
],
[
c
],
mode
=
mode_with_gpu
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
5
,
4
,
shape
[
0
],
shape
[
1
]),
dtype
=
'float32'
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
5
,
4
,
shape
[
0
],
shape
[
1
]),
dtype
=
'float32'
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
if
False
:
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
id
,
n
#let debugmode catch errors
out
=
f
(
v
)[
0
]
assert
numpy
.
allclose
(
out
,
a
.
reshape
(
1
,
1
,
shape
[
0
],
shape
[
1
])
+
v
+
2
)
out
=
f
(
v
)[
0
]
assert
numpy
.
allclose
(
out
,
a
.
reshape
(
1
,
1
,
shape
[
0
],
shape
[
1
])
+
v
+
2
)
print
"Expected collapse to 2 dimensions"
def
test_elemwise_collapse6
():
""" Test when all inputs have two broadcastable dimension at the beginning"""
""" Test when all inputs have two broadcastable dimension at the
beginning"""
shape
=
(
4
,
5
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
shape
=
(
4
,
5
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a2
=
tcn
.
shared_constructor
(
a
,
'a'
)
a3
=
a2
.
dimshuffle
(
'x'
,
'x'
,
0
,
1
)
a3
=
a2
.
dimshuffle
(
'x'
,
'x'
,
0
,
1
)
b
=
tcn
.
CudaNdarrayType
((
True
,
True
,
False
,
False
))()
f
=
pfunc
([
b
],
[
a3
+
b
],
mode
=
mode_with_gpu
)
f
=
pfunc
([
b
],
[
a3
+
b
],
mode
=
mode_with_gpu
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
1
,
1
,
shape
[
0
],
shape
[
1
]),
dtype
=
'float32'
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
v
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
1
,
1
,
shape
[
0
],
shape
[
1
]),
dtype
=
'float32'
)
v
=
cuda_ndarray
.
CudaNdarray
(
v
)
if
False
:
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
id
,
n
#let debugmode catch errors
out
=
f
(
v
)[
0
]
assert
numpy
.
allclose
(
out
,
a
.
reshape
(
1
,
1
,
shape
[
0
],
shape
[
1
])
+
v
)
out
=
f
(
v
)[
0
]
assert
numpy
.
allclose
(
out
,
a
.
reshape
(
1
,
1
,
shape
[
0
],
shape
[
1
])
+
v
)
print
"Expected collapse to c contiguous"
def
test_elemwise_collapse7
(
atol
=
1e-6
):
""" Test when one input have one broadcastable dimension and the other is a scalar"""
""" Test when one input have one broadcastable dimension and the
other is a scalar"""
shape
=
(
5
,
4
,
1
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
shape
=
(
5
,
4
,
1
)
a
=
cuda_ndarray
.
CudaNdarray
(
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
))
a
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
shape
),
dtype
=
'float32'
)
a2
=
tcn
.
shared_constructor
(
a
.
copy
(),
'a'
)
a3
=
a2
.
dimshuffle
(
0
,
'x'
,
1
,
2
)
f
=
pfunc
([],
[
a3
+
2
],
mode
=
mode_with_gpu
)
f
=
pfunc
([],
[
a3
+
2
],
mode
=
mode_with_gpu
)
if
False
:
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
for
id
,
n
in
enumerate
(
f
.
maker
.
env
.
toposort
()):
print
id
,
n
#let debugmode catch errors
out
=
f
()[
0
]
ans
=
(
a
+
2
)
.
reshape
(
shape
[
0
],
1
,
shape
[
1
],
shape
[
2
])
assert
numpy
.
allclose
(
out
,
ans
,
atol
=
atol
)
out
=
f
()[
0
]
ans
=
(
a
+
2
)
.
reshape
(
shape
[
0
],
1
,
shape
[
1
],
shape
[
2
])
assert
numpy
.
allclose
(
out
,
ans
,
atol
=
atol
)
print
"Expected collapse to c contiguous"
...
...
def test_hostfromgpu_shape_i():
    """Check that ``.shape`` of a ``gpu_from_host``/``host_from_gpu``
    output is computed symbolically (lifted to ``Shape_i`` +
    ``MakeVector`` nodes) instead of executing the transfer.

    NOTE(review): the original docstring content is not visible in this
    capture; the summary above is inferred from the assertions below —
    confirm against upstream history.
    """
    m = mode_with_gpu.including('local_dot_to_dot22',
                                'local_dot22_to_dot22scalar',
                                'specialize')
    a = T.fmatrix('a')
    ca = theano.sandbox.cuda.var.CudaNdarrayType((False, False))()

    av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
    cv = cuda.CudaNdarray(numpy.asarray(numpy.random.rand(5, 4),
                                        dtype='float32'))

    # host -> gpu transfer: still present when the value is requested...
    f = theano.function([a], cuda.basic_ops.gpu_from_host(a), mode=m)
    assert cuda.basic_ops.gpu_from_host in [x.op
                                            for x in
                                            f.maker.env.toposort()]
    # ...but the shape alone is computed without any transfer op.
    f = theano.function([a], cuda.basic_ops.gpu_from_host(a).shape, mode=m)
    topo = f.maker.env.toposort()
    assert isinstance(topo[0].op, T.opt.Shape_i)
    assert isinstance(topo[1].op, T.opt.Shape_i)
    assert isinstance(topo[2].op, T.opt.MakeVector)
    assert tuple(f(av)) == (5, 4)

    # gpu -> host transfer: same check in the other direction.
    f = theano.function([ca], cuda.basic_ops.host_from_gpu(ca), mode=m)
    assert cuda.basic_ops.host_from_gpu in [x.op
                                            for x in
                                            f.maker.env.toposort()]
    f = theano.function([ca], cuda.basic_ops.host_from_gpu(ca).shape, mode=m)
    topo = f.maker.env.toposort()
    assert isinstance(topo[0].op, T.opt.Shape_i)
    assert isinstance(topo[1].op, T.opt.Shape_i)
    assert isinstance(topo[2].op, T.opt.MakeVector)
    assert tuple(f(cv)) == (5, 4)
# -----------------------------------------------------------------------
import
theano.sandbox.cuda
as
cuda_ndarray
def
test_gpujoin_assert_cndas
():
# this will end up being an ndarray, as it's float64
_a
=
numpy
.
asarray
([[
1
,
2
],[
3
,
4
]],
dtype
=
'float64'
)
_a
=
numpy
.
asarray
([[
1
,
2
],
[
3
,
4
]],
dtype
=
'float64'
)
a
=
theano
.
shared
(
_a
)
try
:
...
...
@@ -703,64 +774,80 @@ def test_gpujoin_assert_cndas():
assert
False
def test_gpujoin_no_rebroadcast():
    """Check that joining a shared CudaNdarray does not insert any
    ``Rebroadcast`` node into the compiled graph."""
    _a = numpy.asarray([[1, 2], [3, 4]], dtype='float32')
    a = tcn.shared_constructor(_a)
    f = theano.function([], T.join(1, a))
    topo = f.maker.env.toposort()
    assert not any([isinstance(x.op, T.Rebroadcast) for x in topo])
def test_gpualloc_input_on_gpu():
    """Check that ``ones_like`` of a GPU shared variable compiles to a
    single ``gpu_alloc`` (vs a single ``alloc`` on CPU) and that both
    compiled functions agree numerically."""
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    a = tcn.shared_constructor(a_val)

    b = T.fscalar()
    f = theano.function([b], T.ones_like(a) + b, mode=mode_without_gpu)
    f_gpu = theano.function([b], T.ones_like(a) + b, mode=mode_with_gpu)

    # Exactly one alloc op in each graph, of the right flavour.
    assert sum([node.op == T.alloc
                for node in f.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu.maker.env.toposort()]) == 1

    assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape) + 9,
                          f_gpu(9))
    assert numpy.allclose(f(5), f_gpu(5))
def test_gpujoin_gpualloc():
    """Check that ``join`` over ``zeros_like``/``ones_like`` compiles to
    ``gpu_join`` + ``gpu_alloc`` on GPU (``join`` + ``alloc`` on CPU)
    and that CPU and GPU results agree."""
    a = T.fmatrix('a')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    b = T.fmatrix('b')
    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')

    f = theano.function([a, b],
                        T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
                        mode=mode_without_gpu)
    f_gpu = theano.function([a, b],
                            T.join(0, T.zeros_like(a), T.ones_like(b)),
                            mode=mode_with_gpu)
    f_gpu2 = theano.function([a, b],
                             T.join(0, T.zeros_like(a),
                                    T.ones_like(b)) + 4,
                             mode=mode_with_gpu)

    # Two allocs (zeros + ones) and one join, in the right flavour.
    assert sum([node.op == T.alloc
                for node in f.maker.env.toposort()]) == 2
    assert sum([node.op == T.join
                for node in f.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu.maker.env.toposort()]) == 2
    assert sum([node.op == B.gpu_join
                for node in f_gpu.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu2.maker.env.toposort()]) == 2
    assert sum([node.op == B.gpu_join
                for node in f_gpu2.maker.env.toposort()]) == 1

    assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
def
test_gpualloc_output_to_gpu
():
a_val
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
4
,
5
),
dtype
=
'float32'
)
a_val
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
4
,
5
),
dtype
=
'float32'
)
a
=
tcn
.
shared_constructor
(
a_val
)
b
=
T
.
fscalar
()
f
=
theano
.
function
([
b
],
T
.
ones_like
(
a
)
+
b
,
mode
=
mode_without_gpu
)
f_gpu
=
theano
.
function
([
b
],
B
.
gpu_from_host
(
T
.
ones_like
(
a
))
+
b
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
b
],
T
.
ones_like
(
a
)
+
b
,
mode
=
mode_without_gpu
)
f_gpu
=
theano
.
function
([
b
],
B
.
gpu_from_host
(
T
.
ones_like
(
a
))
+
b
,
mode
=
mode_with_gpu
)
print
f
.
maker
.
env
.
toposort
()
print
f_gpu
.
maker
.
env
.
toposort
()
print
f
(
2
)
print
f_gpu
(
2
)
assert
sum
([
node
.
op
==
T
.
alloc
for
node
in
f
.
maker
.
env
.
toposort
()])
==
1
assert
sum
([
node
.
op
==
B
.
gpu_alloc
for
node
in
f_gpu
.
maker
.
env
.
toposort
()])
==
1
assert
sum
([
node
.
op
==
T
.
alloc
for
node
in
f
.
maker
.
env
.
toposort
()])
==
1
assert
sum
([
node
.
op
==
B
.
gpu_alloc
for
node
in
f_gpu
.
maker
.
env
.
toposort
()])
==
1
assert
numpy
.
allclose
(
numpy
.
ones
(
a
.
get_value
(
borrow
=
True
)
.
shape
)
+
9
,
f_gpu
(
9
))
assert
numpy
.
allclose
(
f
(
5
),
f_gpu
(
5
))
assert
numpy
.
allclose
(
numpy
.
ones
(
a
.
get_value
(
borrow
=
True
)
.
shape
)
+
9
,
f_gpu
(
9
))
assert
numpy
.
allclose
(
f
(
5
),
f_gpu
(
5
))
import
theano.tensor.tests.test_basic
...
...
@@ -772,6 +859,7 @@ class TestAlloc(theano.tensor.tests.test_basic.TestAlloc):
shared
=
staticmethod
(
cuda
.
shared_constructor
)
allocs
=
[
B
.
GpuAlloc
,
B
.
GpuAlloc
,
tensor
.
Alloc
]
class
T_Join_and_Split
(
theano
.
tensor
.
tests
.
test_basic
.
T_Join_and_Split
):
def
setUp
(
self
):
utt
.
seed_rng
()
...
...
@@ -789,128 +877,152 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
# This is to avoid duplicating tests.
class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
    """Run the generic subtensor test suite against the GPU ops by
    overriding the op/mode/dtype hooks of the base class."""
    shared = staticmethod(cuda.shared_constructor)
    sub = cuda.GpuSubtensor
    inc_sub = cuda.GpuIncSubtensor
    adv_sub1 = cuda.GpuAdvancedSubtensor1
    adv_incsub1 = cuda.GpuAdvancedIncSubtensor1
    mode = mode_with_gpu
    dtype = 'float32'
    # Transfer ops are expected in the graph and must not fail the
    # base class's topology checks.
    ignore_topo = (B.HostFromGpu, B.GpuFromHost)
    fast_compile = theano.config.mode == 'FAST_COMPILE'

    def __init__(self, name):
        return super(theano.tensor.tests.test_basic.T_subtensor,
                     self).__init__(name)
def test_advinc_subtensor1():
    """Test the second case in the opt local_gpu_advanced_incsubtensor1:
    the graph must contain exactly one GpuAdvancedIncSubtensor1 node and
    increment rows 0 and 2 of ``x`` by ``y``."""
    shared = cuda.shared_constructor
    #shared = tensor.shared
    xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         dtype='float32')
    yval = numpy.asarray([[10, 10, 10], [10, 10, 10]],
                         dtype='float32')
    x = shared(xval, name='x')
    y = T.fmatrices('y')
    expr = T.advanced_inc_subtensor1(x, y, [0, 2])
    f = theano.function([y], expr, mode=mode_with_gpu)
    assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
                for node in f.maker.env.toposort()]) == 1
    assert numpy.allclose(f(yval),
                          [[11., 12., 13.],
                           [4., 5., 6.],
                           [17., 18., 19.]])
def
test_inc_subtensor
():
shared
=
cuda
.
shared_constructor
#shared = tensor.shared
x
,
y
=
T
.
fmatrices
(
'x'
,
'y'
)
xval
=
numpy
.
asarray
([[
1
,
2
,
3
],
[
4
,
5
,
6
],
[
7
,
8
,
9
]],
x
,
y
=
T
.
fmatrices
(
'x'
,
'y'
)
xval
=
numpy
.
asarray
([[
1
,
2
,
3
],
[
4
,
5
,
6
],
[
7
,
8
,
9
]],
dtype
=
'float32'
)
yval
=
numpy
.
asarray
([[
10
,
10
,
10
],
[
10
,
10
,
10
],
[
10
,
10
,
10
]],
yval
=
numpy
.
asarray
([[
10
,
10
,
10
],
[
10
,
10
,
10
],
[
10
,
10
,
10
]],
dtype
=
'float32'
)
expr
=
T
.
inc_subtensor
(
x
[:,
1
:
3
],
y
[:,
1
:
3
])
f
=
theano
.
function
([
x
,
y
],
expr
,
mode
=
mode_with_gpu
)
expr
=
T
.
inc_subtensor
(
x
[:,
1
:
3
],
y
[:,
1
:
3
])
f
=
theano
.
function
([
x
,
y
],
expr
,
mode
=
mode_with_gpu
)
print
f
.
maker
.
env
.
toposort
()
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuSubtensor
)
for
node
in
f
.
maker
.
env
.
toposort
()
])
==
1
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuIncSubtensor
)
and
node
.
op
.
set_instead_of_inc
==
False
for
node
in
f
.
maker
.
env
.
toposort
()
])
==
1
assert
numpy
.
allclose
(
f
(
xval
,
yval
),[[
1.
,
12.
,
13.
],
[
4.
,
15.
,
16.
],
[
7.
,
18.
,
19.
]])
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuSubtensor
)
for
node
in
f
.
maker
.
env
.
toposort
()])
==
1
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuIncSubtensor
)
and
node
.
op
.
set_instead_of_inc
==
False
for
node
in
f
.
maker
.
env
.
toposort
()])
==
1
assert
numpy
.
allclose
(
f
(
xval
,
yval
),
[[
1.
,
12.
,
13.
],
[
4.
,
15.
,
16.
],
[
7.
,
18.
,
19.
]])
def
test_set_subtensor
():
shared
=
cuda
.
shared_constructor
#shared = tensor.shared
x
,
y
=
T
.
fmatrices
(
'x'
,
'y'
)
xval
=
numpy
.
asarray
([[
1
,
2
,
3
],
[
4
,
5
,
6
],
[
7
,
8
,
9
]],
x
,
y
=
T
.
fmatrices
(
'x'
,
'y'
)
xval
=
numpy
.
asarray
([[
1
,
2
,
3
],
[
4
,
5
,
6
],
[
7
,
8
,
9
]],
dtype
=
'float32'
)
yval
=
numpy
.
asarray
([[
10
,
10
,
10
],
[
10
,
10
,
10
],
[
10
,
10
,
10
]],
yval
=
numpy
.
asarray
([[
10
,
10
,
10
],
[
10
,
10
,
10
],
[
10
,
10
,
10
]],
dtype
=
'float32'
)
expr
=
T
.
set_subtensor
(
x
[:,
1
:
3
],
y
[:,
1
:
3
])
f
=
theano
.
function
([
x
,
y
],
expr
,
mode
=
mode_with_gpu
)
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuSubtensor
)
for
node
in
f
.
maker
.
env
.
toposort
()
])
==
1
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuIncSubtensor
)
and
node
.
op
.
set_instead_of_inc
==
True
for
node
in
f
.
maker
.
env
.
toposort
()
])
==
1
print
f
(
xval
,
yval
)
expr
=
T
.
set_subtensor
(
x
[:,
1
:
3
],
y
[:,
1
:
3
])
f
=
theano
.
function
([
x
,
y
],
expr
,
mode
=
mode_with_gpu
)
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuSubtensor
)
for
node
in
f
.
maker
.
env
.
toposort
()])
==
1
assert
sum
([
isinstance
(
node
.
op
,
cuda
.
GpuIncSubtensor
)
and
node
.
op
.
set_instead_of_inc
==
True
for
node
in
f
.
maker
.
env
.
toposort
()])
==
1
print
f
(
xval
,
yval
)
def test_many_arg_elemwise():
    """Check that the + and * elemwise ops can handle extremely large
    numbers of arguments on GPU, i.e. a test of the optimization
    theano/sandbox/cuda/opt.py:local_gpu_huge_add_or_mul.

    For each op, rank and argument count, the graph is compiled both
    with and without the GPU and the results compared.
    """
    # Fixed seed so the generated shapes/values are reproducible.
    rng = numpy.random.RandomState([1, 2, 3])

    for num_args in [25]:
        for op_to_test in [theano.tensor.add, theano.tensor.mul]:
            for nb_dim in [2, 3, 4, 5]:
                shapes = [rng.randint(1, 5) for i in range(nb_dim)]
                args = [numpy.cast['float32'](rng.randn(*shapes))
                        for arg in xrange(0, num_args)]

                symb_args = [theano.tensor.TensorType(
                                 'float32', (False,) * nb_dim)()
                             for arg in xrange(0, num_args)]

                outputs = []
                for mode in [mode_with_gpu, mode_without_gpu]:
                    # test the optimization local_gpu_elemwise_0
                    f = theano.function(
                        symb_args, op_to_test(*symb_args),
                        mode=mode.excluding("local_gpu_elemwise_1"))
                    outputs.append(f(*args))
                    # assert that the test was done on the gpu.
                    if mode is mode_with_gpu:
                        assert any([isinstance(node.op, cuda.GpuElemwise)
                                    for node in f.maker.env.nodes])

                    # test the optimization local_gpu_elemwise_1
                    f = theano.function(
                        symb_args,
                        cuda.gpu_from_host(op_to_test(*symb_args)),
                        mode=mode.excluding("local_gpu_elemwise_0"))
                    out = f(*args)
                    # assert that the test was done on the gpu.
                    if mode is mode_with_gpu:
                        assert any([isinstance(node.op, cuda.GpuElemwise)
                                    for node in f.maker.env.nodes])
                    # Both optimizations must agree within a mode.
                    assert numpy.allclose(out, outputs[-1])

                results_gpu, results_cpu = outputs
                assert numpy.allclose(results_gpu, results_cpu)
def test_duplicate_arg_elemwise():
    """Check that an elemwise op receiving the same variable twice
    (``A + A``) compiles and computes correctly on the GPU."""
    A = theano.tensor.fmatrix()
    # Renamed from ``B`` to avoid shadowing the module-level
    # ``basic_ops as B`` import alias.
    out = A + A

    f = theano.function([A], out, mode=mode_with_gpu)
    Aval = numpy.random.RandomState([1, 2, 3]).randn(5, 5).astype('float32')
    Bval = Aval + Aval
    assert numpy.allclose(Bval, f(Aval))
def test_shared_float32():
    """Test use of cuda.shared_constructor through theano.shared."""
    # Register cuda.shared_constructor in theano.shared
    theano.shared.constructors.append(cuda.shared_constructor)
    try:
        a = theano.shared(numpy.ones((2, 3), dtype='float32'))
        assert isinstance(a.type, tcn.CudaNdarrayType)
    finally:
        # Unregister even if the assertion fails, so later tests are
        # not run with the constructor still registered.
        del theano.shared.constructors[-1]
def test_shared_cudandarray():
    """Test that we can create a CudaNdarraySharedVariable from a
    CudaNdarray."""
    a = cuda.shared_constructor(cuda.CudaNdarray.zeros((2, 3)))
    assert isinstance(a.type, tcn.CudaNdarrayType)
...
...
@@ -993,38 +1105,38 @@ class test_size(unittest.TestCase):
import
theano.tensor.tests.test_sharedvar
# This tests the case when the shared constructor is given a CudaNdarray
# as input.
test_shared_options = theano.tensor.tests.test_sharedvar.makeSharedTester(
    shared_constructor_=tcn.shared_constructor,
    dtype_='float32',
    get_value_borrow_true_alias_=True,
    shared_borrow_true_alias_=True,
    # True when the original value is already a CudaNdarray!
    set_value_borrow_true_alias_=True,
    set_value_inplace_=True,
    set_cast_value_inplace_=False,
    shared_constructor_accept_ndarray_=True,
    internal_type_=cuda_ndarray.CudaNdarray,
    test_internal_type_=lambda a: isinstance(a, cuda_ndarray.CudaNdarray),
    theano_fct_=theano.tensor.exp,
    ref_fct_=numpy.exp,
    cast_value_=cuda.as_cuda_array,
    op_by_matrix_=True,
    name='test_shared_options')
# This tests the case when the shared constructor is given an ndarray as
# input.
test_shared_options2 = theano.tensor.tests.test_sharedvar.makeSharedTester(
    shared_constructor_=tcn.shared_constructor,
    dtype_='float32',
    get_value_borrow_true_alias_=False,
    shared_borrow_true_alias_=False,
    set_value_borrow_true_alias_=False,
    set_value_inplace_=True,
    set_cast_value_inplace_=True,
    shared_constructor_accept_ndarray_=True,
    internal_type_=cuda_ndarray.CudaNdarray,
    test_internal_type_=lambda a: isinstance(a, cuda_ndarray.CudaNdarray),
    theano_fct_=theano.tensor.exp,
    ref_fct_=numpy.exp,
    cast_value_=numpy.asarray,
    op_by_matrix_=True,
    # BUG FIX: was 'test_shared_options', duplicating the first tester's
    # generated test-class name and shadowing it.
    name='test_shared_options2')
if
__name__
==
'__main__'
:
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论