pep8 fixes

a87abbc9 · orhanf · 8ce2395b · a87abbc9 · a87abbc9
--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
-import operator
+	import operator
-import sys
+	import sys
-import unittest
+	import unittest
-import numpy
+	import numpy
-# Skip test if cuda_ndarray is not available.
+	# Skip test if cuda_ndarray is not available.
-from nose.plugins.skip import SkipTest
+	from nose.plugins.skip import SkipTest
-import theano
+	import theano
-from theano.compile.pfunc import pfunc
+	from theano.compile.pfunc import pfunc
-from theano import config, tensor
+	from theano import config, tensor
-import theano.tensor.tests.test_nlinalg
+	import theano.tensor.tests.test_nlinalg
-import theano.tensor.tests.test_opt as test_opt
+	import theano.tensor.tests.test_opt as test_opt
-from theano.tests import unittest_tools as utt
+	from theano.tests import unittest_tools as utt
-import theano.sandbox.cuda as cuda
+	import theano.sandbox.cuda as cuda
-if not cuda.cuda_available:
+	if not cuda.cuda_available:
-    raise SkipTest('Optional package cuda disabled')
+	    raise SkipTest('Optional package cuda disabled')
-import theano.sandbox.cuda.cula as cula
+	import theano.sandbox.cuda.cula as cula
-from theano.sandbox.cuda import basic_ops
+	from theano.sandbox.cuda import basic_ops
-from theano.sandbox.cuda.type import CudaNdarrayType
+	from theano.sandbox.cuda.type import CudaNdarrayType
-from theano.scalar.basic_scipy import erfinv
+	from theano.scalar.basic_scipy import erfinv
-if theano.config.mode == 'FAST_COMPILE':
+	if theano.config.mode == 'FAST_COMPILE':
-    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
+	    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
-    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
+	    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
-else:
+	else:
-    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
+	    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
-    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
+	    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
-def test_no_shared_var_graph():
+	def test_no_shared_var_graph():
-    """Test that the InputToGpuOptimizer optimizer make graph that don't have shared variable compiled too.
+	    """Test that the InputToGpuOptimizer optimizer makes graphs that don't have shared variables compiled too.
-    """
+	    """
-    a = tensor.fmatrix()
+	    a = tensor.fmatrix()
-    b = tensor.fmatrix()
+	    b = tensor.fmatrix()
-    f = theano.function([a, b], [a + b], mode=mode_with_gpu)
+	    f = theano.function([a, b], [a + b], mode=mode_with_gpu)
-    l = f.maker.fgraph.toposort()
+	    l = f.maker.fgraph.toposort()
-    assert len(l) == 4
+	    assert len(l) == 4
-    assert numpy.any(isinstance(x.op, cuda.GpuElemwise) for x in l)
+	    assert numpy.any(isinstance(x.op, cuda.GpuElemwise) for x in l)
-    assert numpy.any(isinstance(x.op, cuda.GpuFromHost) for x in l)
+	    assert numpy.any(isinstance(x.op, cuda.GpuFromHost) for x in l)
-    assert numpy.any(isinstance(x.op, cuda.HostFromGpu) for x in l)
+	    assert numpy.any(isinstance(x.op, cuda.HostFromGpu) for x in l)
-def test_local_assert():
+	def test_local_assert():
-    x = theano.tensor.fmatrix()
+	    x = theano.tensor.fmatrix()
-    a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any())
+	    a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any())
-    f = theano.function([x], a, mode=mode_with_gpu)
+	    f = theano.function([x], a, mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    a_op = [n for n in topo if isinstance(n.op, theano.tensor.opt.Assert)]
+	    a_op = [n for n in topo if isinstance(n.op, theano.tensor.opt.Assert)]
-    assert len(a_op) == 1
+	    assert len(a_op) == 1
-    assert isinstance(a_op[0].inputs[0].type, CudaNdarrayType)
+	    assert isinstance(a_op[0].inputs[0].type, CudaNdarrayType)
-def test_local_remove_all_assert():
+	def test_local_remove_all_assert():
-    x = theano.tensor.fmatrix()
+            x = theano.tensor.fmatrix()
-    a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any())
+            a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any())
-    f = theano.function([x], a, mode=mode_with_gpu)
+            f = theano.function([x], a, mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+            topo = f.maker.fgraph.toposort()
-    a_op = [n for n in topo if isinstance(n.op, theano.tensor.opt.Assert)]
+            a_op = [n for n in topo if isinstance(n.op, theano.tensor.opt.Assert)]
-    assert len(a_op) == 0
+	    assert len(a_op) == 0
-def test_int_pow():
+	def test_int_pow():
-    a = CudaNdarrayType([False])()
+	    a = CudaNdarrayType([False])()
-    f = theano.function([a], (a*4).sum(), mode=mode_with_gpu)
+	    f = theano.function([a], (a*4).sum(), mode=mode_with_gpu)
-    op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
+	    op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
-    assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']
+	    assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']
-    f = theano.function([a], tensor.pow(a, 4).sum(), mode=mode_with_gpu)
+	    f = theano.function([a], tensor.pow(a, 4).sum(), mode=mode_with_gpu)
-    op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
+	    op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
-    assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu']
+	    assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu']
-def test_gpualloc():
+	def test_gpualloc():
-    '''
+	    '''
-    This tests tries to catch the scenario when, due to infer_shape,
+	    This test tries to catch the scenario when, due to infer_shape,
-    the input of the alloc changes from tensor scalar to a constant
+	    the input of the alloc changes from tensor scalar to a constant
-    1. In this case the original constracted broadcastable pattern will
+	    1. In this case the original constructed broadcastable pattern will
-    have a False for that dimension, but the new broadcastable pattern
+	    have a False for that dimension, but the new broadcastable pattern
-    that will be inserted by gpualloc will have  a True since it knows the
+	    that will be inserted by gpualloc will have a True since it knows the
-    dimension is 1 and therefore broadcastable.
+	    dimension is 1 and therefore broadcastable.
-    '''
+	    '''
-    x = theano.shared(numpy.ones(3, dtype='float32'), 'x')
+	    x = theano.shared(numpy.ones(3, dtype='float32'), 'x')
-    m = (x).dimshuffle(['x', 0])
+	    m = (x).dimshuffle(['x', 0])
-    v = tensor.alloc(1., *m.shape)
+	    v = tensor.alloc(1., *m.shape)
-    f = theano.function([], v + x,
+	    f = theano.function([], v + x,
-                        mode=mode_with_gpu.excluding("local_elemwise_alloc"))
+		                mode=mode_with_gpu.excluding("local_elemwise_alloc"))
-    l = f.maker.fgraph.toposort()
+	    l = f.maker.fgraph.toposort()
-    assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])
+	    assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])
-class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc):
+	class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc):
-    dtype = 'float32'
+	    dtype = 'float32'
-    def setUp(self):
+	    def setUp(self):
-        super(Test_local_elemwise_alloc, self).setUp()
+		super(Test_local_elemwise_alloc, self).setUp()
-        self.fast_run_mode = mode_with_gpu
+		self.fast_run_mode = mode_with_gpu
-        # self.vec = tensor.vector('vec', dtype=dtype)
+		# self.vec = tensor.vector('vec', dtype=dtype)
-        # self.mat = tensor.matrix('mat', dtype=dtype)
+		# self.mat = tensor.matrix('mat', dtype=dtype)
-        # self.tens = tensor.tensor3('tens', dtype=dtype)
+		# self.tens = tensor.tensor3('tens', dtype=dtype)
-        # self.alloc_wo_dep = basic_ops.gpu_alloc(self.vec, 2, 2)
+		# self.alloc_wo_dep = basic_ops.gpu_alloc(self.vec, 2, 2)
-        # self.alloc_w_dep = basic_ops.gpu_alloc(self.vec, *self.mat.shape)
+		# self.alloc_w_dep = basic_ops.gpu_alloc(self.vec, *self.mat.shape)
-        self.alloc_wo_dep = basic_ops.gpu_alloc(self.vec, 2, 2)
+		self.alloc_wo_dep = basic_ops.gpu_alloc(self.vec, 2, 2)
-        self.alloc_w_dep = basic_ops.gpu_alloc(self.vec, *self.mat.shape)
+		self.alloc_w_dep = basic_ops.gpu_alloc(self.vec, *self.mat.shape)
-        self.alloc_w_dep_tens = basic_ops.gpu_alloc(
+		self.alloc_w_dep_tens = basic_ops.gpu_alloc(
-            self.vec,
+		    self.vec,
-            self.tens.shape[0],
+		    self.tens.shape[0],
-            self.tens.shape[1]
+		    self.tens.shape[1]
-        )
+		)
-        self.tv_wo_dep = basic_ops.gpu_alloc(self.vec, 5, 5)
+		self.tv_wo_dep = basic_ops.gpu_alloc(self.vec, 5, 5)
-        self.tm_wo_dep = basic_ops.gpu_alloc(self.mat, 5, 5, 5)
+		self.tm_wo_dep = basic_ops.gpu_alloc(self.mat, 5, 5, 5)
-        self.s = tensor.iscalar('s')
+		self.s = tensor.iscalar('s')
-        self.tv_w_dep = basic_ops.gpu_alloc(self.vec, self.s, self.s)
+		self.tv_w_dep = basic_ops.gpu_alloc(self.vec, self.s, self.s)
-        self.tm_w_dep = basic_ops.gpu_alloc(self.mat, 5, 5, 5)
+		self.tm_w_dep = basic_ops.gpu_alloc(self.mat, 5, 5, 5)
-        self.row = tensor.row(dtype=self.dtype)
+		self.row = tensor.row(dtype=self.dtype)
-        self.o = basic_ops.gpu_alloc(self.row, 5, 5)
+		self.o = basic_ops.gpu_alloc(self.row, 5, 5)
-    def _verify_alloc_count(self, f, count):
+	    def _verify_alloc_count(self, f, count):
-        assert(
+		assert(
-            sum([isinstance(elem.op, basic_ops.GpuAlloc)
+		    sum([isinstance(elem.op, basic_ops.GpuAlloc)
-                 for elem in f.maker.fgraph.toposort()
+		         for elem in f.maker.fgraph.toposort()
-                 if elem.op is not None]) == count
+		         if elem.op is not None]) == count
-        )
+		)
-    def _verify_assert_count(self, f, count):
+	    def _verify_assert_count(self, f, count):
-        assert(
+		assert(
-            sum([isinstance(elem.op, tensor.opt.Assert)
+		    sum([isinstance(elem.op, tensor.opt.Assert)
-                 for elem in f.maker.fgraph.toposort()
+		         for elem in f.maker.fgraph.toposort()
-                 if elem.op is not None]) == count
+		         if elem.op is not None]) == count
-        )
+		)
-def test_alloc_memset_0():
+	def test_alloc_memset_0():
-    i = tensor.iscalar()
+	    i = tensor.iscalar()
-    z = numpy.zeros((1,), dtype='float32')
+	    z = numpy.zeros((1,), dtype='float32')
-    o = numpy.ones((1,), dtype='float32')
+	    o = numpy.ones((1,), dtype='float32')
-    ones = numpy.ones((2,), dtype='float32')
+	    ones = numpy.ones((2,), dtype='float32')
-    # Test with 0
+	    # Test with 0
-    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(z)), i)
+	    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(z)), i)
-    f = theano.function([i], a, mode=mode_with_gpu)
+	    f = theano.function([i], a, mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert len(topo) == 1
+	    assert len(topo) == 1
-    assert isinstance(topo[0].op, basic_ops.GpuAlloc) and topo[0].op.memset_0
+	    assert isinstance(topo[0].op, basic_ops.GpuAlloc) and topo[0].op.memset_0
-    assert (numpy.asarray(f(6)) == 0).all()
+	    assert (numpy.asarray(f(6)) == 0).all()
-    # Test with 1
+	    # Test with 1
-    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(o)), i)
+	    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(o)), i)
-    f = theano.function([i], a, mode=mode_with_gpu)
+	    f = theano.function([i], a, mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert len(topo) == 1
+	    assert len(topo) == 1
-    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
+	    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
-    assert not topo[0].op.memset_0
+	    assert not topo[0].op.memset_0
-    assert (numpy.asarray(f(6)) == 1).all()
+	    assert (numpy.asarray(f(6)) == 1).all()
-    # Test with 1, 1
+	    # Test with 1, 1
-    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(ones)), i)
+	    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(ones)), i)
-    f = theano.function([i], a, mode=mode_with_gpu)
+	    f = theano.function([i], a, mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert len(topo) == 1
+	    assert len(topo) == 1
-    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
+	    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
-    assert not topo[0].op.memset_0
+	    assert not topo[0].op.memset_0
-    assert (numpy.asarray(f(2)) == 1).all()
+	    assert (numpy.asarray(f(2)) == 1).all()
-def test_gpuspecifyshape():
+	def test_gpuspecifyshape():
-    x = cuda.shared_constructor(numpy.ones(3, dtype='float32'), 'x')
+	    x = cuda.shared_constructor(numpy.ones(3, dtype='float32'), 'x')
-    m = theano.tensor.specify_shape(x + numpy.float32(1), (3,))
+	    m = theano.tensor.specify_shape(x + numpy.float32(1), (3,))
-    f = theano.function([], updates=[(x, m * numpy.float32(2))],
+	    f = theano.function([], updates=[(x, m * numpy.float32(2))],
-                        mode=mode_with_gpu)
+		                mode=mode_with_gpu)
-    l = f.maker.fgraph.toposort()
+	    l = f.maker.fgraph.toposort()
-    assert not numpy.any([isinstance(x.op, cuda.HostFromGpu) for x in l])
+	    assert not numpy.any([isinstance(x.op, cuda.HostFromGpu) for x in l])
-def test_softmax():
+	def test_softmax():
-    x = tensor.fmatrix()
+	    x = tensor.fmatrix()
-    f = theano.function([x], tensor.nnet.nnet.Softmax()(x),
+	    f = theano.function([x], tensor.nnet.nnet.Softmax()(x),
-                        mode=mode_with_gpu.excluding('cudnn'))
+		                mode=mode_with_gpu.excluding('cudnn'))
-    f2 = theano.function([x], tensor.nnet.nnet.Softmax()(x),
+	    f2 = theano.function([x], tensor.nnet.nnet.Softmax()(x),
-                         mode=mode_without_gpu)
+		                 mode=mode_without_gpu)
-    assert isinstance(f.maker.fgraph.toposort()[1].op, cuda.nnet.GpuSoftmax)
+	    assert isinstance(f.maker.fgraph.toposort()[1].op, cuda.nnet.GpuSoftmax)
-    xv = numpy.random.rand(7, 8).astype('float32')
+	    xv = numpy.random.rand(7, 8).astype('float32')
-    assert numpy.allclose(f(xv), f2(xv))
+	    assert numpy.allclose(f(xv), f2(xv))
-def test_softmax_with_bias():
+	def test_softmax_with_bias():
-    x = tensor.fmatrix()
+	    x = tensor.fmatrix()
-    b = tensor.fvector()
+	    b = tensor.fvector()
-    f = theano.function([x, b], tensor.nnet.nnet.SoftmaxWithBias()(x, b),
+	    f = theano.function([x, b], tensor.nnet.nnet.SoftmaxWithBias()(x, b),
-                        mode=mode_with_gpu)
+		                mode=mode_with_gpu)
-    f2 = theano.function([x, b], tensor.nnet.nnet.SoftmaxWithBias()(x, b),
+	    f2 = theano.function([x, b], tensor.nnet.nnet.SoftmaxWithBias()(x, b),
-                         mode=mode_without_gpu)
+		                 mode=mode_without_gpu)
-    assert isinstance(f.maker.fgraph.toposort()[2].op,
+	    assert isinstance(f.maker.fgraph.toposort()[2].op,
-                      cuda.nnet.GpuSoftmaxWithBias)
+		              cuda.nnet.GpuSoftmaxWithBias)
-    xv = numpy.random.rand(7, 8).astype('float32')
+	    xv = numpy.random.rand(7, 8).astype('float32')
-    bv = numpy.random.rand(8).astype('float32')
+	    bv = numpy.random.rand(8).astype('float32')
-    assert numpy.allclose(f(xv, bv), f2(xv, bv))
+	    assert numpy.allclose(f(xv, bv), f2(xv, bv))
-def test_opt_gpujoin_onlyajoin():
+	def test_opt_gpujoin_onlyajoin():
-    # from a bug in normal sampling
+	    # from a bug in normal sampling
-    _a = numpy.asarray([[1, 2], [3, 4]], dtype='float32')
+	    _a = numpy.asarray([[1, 2], [3, 4]], dtype='float32')
-    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype='float32')
+	    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype='float32')
-    a = cuda.shared_constructor(_a)
+	    a = cuda.shared_constructor(_a)
-    b = cuda.shared_constructor(_b)
+	    b = cuda.shared_constructor(_b)
-    c = tensor.join(1, a, b)
+	    c = tensor.join(1, a, b)
-    f = theano.function([], c, mode=mode_with_gpu)
+	    f = theano.function([], c, mode=mode_with_gpu)
-    f()
+	    f()
-    graph_nodes = f.maker.fgraph.toposort()
+	    graph_nodes = f.maker.fgraph.toposort()
-    assert isinstance(graph_nodes[-1].op, cuda.HostFromGpu)
+	    assert isinstance(graph_nodes[-1].op, cuda.HostFromGpu)
-    assert isinstance(graph_nodes[-2].op, cuda.GpuJoin)
+	    assert isinstance(graph_nodes[-2].op, cuda.GpuJoin)
-    assert numpy.all(f() == numpy.concatenate([_a, _b], axis=1))
+	    assert numpy.all(f() == numpy.concatenate([_a, _b], axis=1))
-def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
+	def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
-    # from a bug in gpu normal sampling
+	    # from a bug in gpu normal sampling
-    _a = numpy.asarray([1, 2, 3, 4], dtype='float32')
+	    _a = numpy.asarray([1, 2, 3, 4], dtype='float32')
-    _b = numpy.asarray([5, 6, 7, 8], dtype='float32')
+	    _b = numpy.asarray([5, 6, 7, 8], dtype='float32')
-    a = cuda.shared_constructor(_a)
+	    a = cuda.shared_constructor(_a)
-    b = cuda.shared_constructor(_b)
+	    b = cuda.shared_constructor(_b)
-    a_prime = tensor.cos(a)
+	    a_prime = tensor.cos(a)
-    b_prime = tensor.sin(b)
+	    b_prime = tensor.sin(b)
-    c = tensor.join(0, a_prime, b_prime)
+	    c = tensor.join(0, a_prime, b_prime)
-    d = c[:-1]
+	    d = c[:-1]
-    f = theano.function([], d, mode=mode_with_gpu)
+	    f = theano.function([], d, mode=mode_with_gpu)
-    graph_nodes = f.maker.fgraph.toposort()
+	    graph_nodes = f.maker.fgraph.toposort()
-    assert isinstance(graph_nodes[-1].op, cuda.HostFromGpu)
+	    assert isinstance(graph_nodes[-1].op, cuda.HostFromGpu)
-    assert isinstance(graph_nodes[-2].op, cuda.GpuSubtensor)
+	    assert isinstance(graph_nodes[-2].op, cuda.GpuSubtensor)
-    assert isinstance(graph_nodes[-3].op, cuda.GpuJoin)
+	    assert isinstance(graph_nodes[-3].op, cuda.GpuJoin)
-    concat = numpy.concatenate([numpy.cos(_a), numpy.sin(_b)], axis=1)
+	    concat = numpy.concatenate([numpy.cos(_a), numpy.sin(_b)], axis=1)
-    concat = concat[:-1]
+	    concat = concat[:-1]
-    assert numpy.allclose(numpy.asarray(f()), concat)
+	    assert numpy.allclose(numpy.asarray(f()), concat)
-def test_local_gpu_subtensor():
+	def test_local_gpu_subtensor():
-    # Test shared forced on CPU.
+	    # Test shared forced on CPU.
-    t = tensor._shared(numpy.zeros(20, "float32"))
+	    t = tensor._shared(numpy.zeros(20, "float32"))
-    f = theano.function([], t[3:4], mode=mode_with_gpu)
+	    f = theano.function([], t[3:4], mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert any([type(node.op) is tensor.Subtensor for node in topo])
+	    assert any([type(node.op) is tensor.Subtensor for node in topo])
-    assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
+	    assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
-    # Test graph input.
+	    # Test graph input.
-    t = tensor.fmatrix()
+	    t = tensor.fmatrix()
-    f = theano.function([t], t[3:4], mode=mode_with_gpu)
+	    f = theano.function([t], t[3:4], mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert any([type(node.op) is tensor.Subtensor for node in topo])
+	    assert any([type(node.op) is tensor.Subtensor for node in topo])
-    assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
+	    assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
-    # Test multiple use of the input
+	    # Test multiple use of the input
-    # We want the subtensor to be on the GPU to prevent multiple transfer.
+	    # We want the subtensor to be on the GPU to prevent multiple transfer.
-    t = tensor.fmatrix()
+	    t = tensor.fmatrix()
-    f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu)
+	    f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert not any([type(node.op) is tensor.Subtensor for node in topo])
+	    assert not any([type(node.op) is tensor.Subtensor for node in topo])
-    assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
+	    assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
-    # Test multiple use of the input + input as output
+	    # Test multiple use of the input + input as output
-    # We want the subtensor to be on the GPU to prevent multiple transfer.
+	    # We want the subtensor to be on the GPU to prevent multiple transfer.
-    t = tensor.fmatrix()
+	    t = tensor.fmatrix()
-    f = theano.function([t], [t[3:4], t+1, t], mode=mode_with_gpu)
+	    f = theano.function([t], [t[3:4], t+1, t], mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert not any([type(node.op) is tensor.Subtensor for node in topo])
+	    assert not any([type(node.op) is tensor.Subtensor for node in topo])
-    assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
+	    assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
-    # Test shared forced on CPU end we do computation on the output of
+	    # Test shared forced on CPU and we do computation on the output of
-    # the subtensor.
+	    # the subtensor.
-    t = tensor._shared(numpy.zeros(20, "float32"))
+	    t = tensor._shared(numpy.zeros(20, "float32"))
-    f = theano.function([], t[3:4]+1, mode=mode_with_gpu)
+	    f = theano.function([], t[3:4]+1, mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert any([type(node.op) is tensor.Subtensor for node in topo])
+	    assert any([type(node.op) is tensor.Subtensor for node in topo])
-    assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
+	    assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
-    assert any([isinstance(node.op, cuda.GpuElemwise) for node in topo])
+	    assert any([isinstance(node.op, cuda.GpuElemwise) for node in topo])
-def test_local_gpu_split():
+	def test_local_gpu_split():
-    """ Test that the GpuSplit op is being applied and works """
+	    """ Test that the GpuSplit op is being applied and works """
-    # Construct symbolic split
+	    # Construct symbolic split
-    x = tensor.fvector()
+	    x = tensor.fvector()
-    splits = tensor.lvector()
+	    splits = tensor.lvector()
-    ra, rb, rc = tensor.split(x, splits, n_splits=3, axis=0)
+	    ra, rb, rc = tensor.split(x, splits, n_splits=3, axis=0)
-    # Compile function to use CPU
+	    # Compile function to use CPU
-    f = theano.function([x, splits], [ra, rb, rc], mode=mode_without_gpu)
+	    f = theano.function([x, splits], [ra, rb, rc], mode=mode_without_gpu)
-    # Get values for CPU version
+	    # Get values for CPU version
-    cpu_res = f([0, 1, 2, 3, 4, 5], [3, 2, 1])
+	    cpu_res = f([0, 1, 2, 3, 4, 5], [3, 2, 1])
-    l = f.maker.fgraph.toposort()
+	    l = f.maker.fgraph.toposort()
-    # Ensure that one op is theano.tensor.Split
+	    # Ensure that one op is theano.tensor.Split
-    assert any([isinstance(o.op, theano.tensor.Split) for o in l])
+	    assert any([isinstance(o.op, theano.tensor.Split) for o in l])
-    # GPU version
+	    # GPU version
-    f = theano.function([x, splits], [ra, rb, rc], mode=mode_with_gpu)
+	    f = theano.function([x, splits], [ra, rb, rc], mode=mode_with_gpu)
-    gpu_res = f([0, 1, 2, 3, 4, 5], [3, 2, 1])
+	    gpu_res = f([0, 1, 2, 3, 4, 5], [3, 2, 1])
-    l = f.maker.fgraph.toposort()
+	    l = f.maker.fgraph.toposort()
-    assert any([isinstance(o.op, theano.sandbox.cuda.GpuSplit) for o in l])
+	    assert any([isinstance(o.op, theano.sandbox.cuda.GpuSplit) for o in l])
-    # Check equality
+	    # Check equality
-    assert all([(cpu == gpu).all() for cpu, gpu in zip(cpu_res, gpu_res)])
+	    assert all([(cpu == gpu).all() for cpu, gpu in zip(cpu_res, gpu_res)])
-    # Test the other path of the optimizer, when it is the output that
+	    # Test the other path of the optimizer, when it is the output that
-    # is moved to the GPU.
+	    # is moved to the GPU.
-    ra = cuda.gpu_from_host(ra)
+	    ra = cuda.gpu_from_host(ra)
-    f = theano.function([x, splits], [ra, rb, rc],
+	    f = theano.function([x, splits], [ra, rb, rc],
-                        mode=mode_with_gpu.excluding("InputToGpuOptimizer"))
+		                mode=mode_with_gpu.excluding("InputToGpuOptimizer"))
-    gpu_res = f([0, 1, 2, 3, 4, 5], [3, 2, 1])
+	    gpu_res = f([0, 1, 2, 3, 4, 5], [3, 2, 1])
-    l = f.maker.fgraph.toposort()
+	    l = f.maker.fgraph.toposort()
-    assert any([isinstance(o.op, theano.sandbox.cuda.GpuSplit) for o in l])
+	    assert any([isinstance(o.op, theano.sandbox.cuda.GpuSplit) for o in l])
-    # Check equality
+	    # Check equality
-    assert all([(cpu == gpu).all() for cpu, gpu in zip(cpu_res, gpu_res)])
+	    assert all([(cpu == gpu).all() for cpu, gpu in zip(cpu_res, gpu_res)])
-    # Test that split with only 1 output work
+	    # Test that split with only 1 output works
-    ra = tensor.split(x, splits, n_splits=1, axis=0)
+	    ra = tensor.split(x, splits, n_splits=1, axis=0)
-    f = theano.function([x, splits], [ra], mode=mode_without_gpu)
+	    f = theano.function([x, splits], [ra], mode=mode_without_gpu)
-    cpu_res = f([0, 1, 2, 3, 4, 5], [6])
+	    cpu_res = f([0, 1, 2, 3, 4, 5], [6])
-    l = f.maker.fgraph.toposort()
+	    l = f.maker.fgraph.toposort()
-    # Ensure that one op is theano.tensor.Split
+	    # Ensure that one op is theano.tensor.Split
-    assert any([isinstance(o.op, theano.tensor.Split) for o in l])
+	    assert any([isinstance(o.op, theano.tensor.Split) for o in l])
-    # GPU version
+	    # GPU version
-    f = theano.function([x, splits], [ra], mode=mode_with_gpu)
+	    f = theano.function([x, splits], [ra], mode=mode_with_gpu)
-    gpu_res = f([0, 1, 2, 3, 4, 5], [6])
+	    gpu_res = f([0, 1, 2, 3, 4, 5], [6])
-    l = f.maker.fgraph.toposort()
+	    l = f.maker.fgraph.toposort()
-    assert any([isinstance(o.op, theano.sandbox.cuda.GpuSplit) for o in l])
+	    assert any([isinstance(o.op, theano.sandbox.cuda.GpuSplit) for o in l])
-    # Check equality
+	    # Check equality
-    assert all([(cpu == gpu).all() for cpu, gpu in zip(cpu_res, gpu_res)])
+	    assert all([(cpu == gpu).all() for cpu, gpu in zip(cpu_res, gpu_res)])
-def test_print_op():
+	def test_print_op():
-    """ Test that print ops don't block gpu optimization"""
+	    """ Test that print ops don't block gpu optimization"""
-    b = tensor.fmatrix()
+	    b = tensor.fmatrix()
-    f = theano.function([b], theano.printing.Print()(b)*2, mode=mode_with_gpu)
+	    f = theano.function([b], theano.printing.Print()(b)*2, mode=mode_with_gpu)
-    # theano.printing.debugprint(f)
+	    # theano.printing.debugprint(f)
-    # print f.maker.fgraph.toposort()
+	    # print f.maker.fgraph.toposort()
-#[GpuFromHost(<TensorType(float32, matrix)>), <theano.printing.Print object at 0x3581210>(GpuFromHost.0), GpuElemwise{mul}(CudaNdarray{[[ 2.]]}, <theano.printing.Print object at 0x3581210>.0), HostFromGpu(GpuElemwise{mul}.0)]
+	#[GpuFromHost(<TensorType(float32, matrix)>), <theano.printing.Print object at 0x3581210>(GpuFromHost.0), GpuElemwise{mul}(CudaNdarray{[[ 2.]]}, <theano.printing.Print object at 0x3581210>.0), HostFromGpu(GpuElemwise{mul}.0)]
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert topo[0].op == cuda.gpu_from_host
+	    assert topo[0].op == cuda.gpu_from_host
-    assert isinstance(topo[1].op, theano.printing.Print)
+	    assert isinstance(topo[1].op, theano.printing.Print)
-    assert isinstance(topo[2].op, cuda.GpuElemwise)
+	    assert isinstance(topo[2].op, cuda.GpuElemwise)
-    assert topo[3].op == cuda.host_from_gpu
+	    assert topo[3].op == cuda.host_from_gpu
-    f(numpy.random.random((5, 5)).astype('float32'))
+	    f(numpy.random.random((5, 5)).astype('float32'))
-def test_huge_elemwise_fusion():
+	def test_huge_elemwise_fusion():
-    """ Test the the GpuElemwise fusion work correctly
+	    """ Test that the GpuElemwise fusion works correctly
-        We check that we fuse one node with part of its input
+		We check that we fuse one node with part of its input
-        in case their is too many inputs and that would make it bust the 256
+		in case there are too many inputs and that would make it bust the 256
-        bytes limits.
+		byte limit.
-    """
+	    """
-    shape = (2, 3, 4, 5, 6)
+	    shape = (2, 3, 4, 5, 6)
-    ttype = tensor.tensor(dtype='float32', broadcastable=(False,) * len(shape))
+	    ttype = tensor.tensor(dtype='float32', broadcastable=(False,) * len(shape))
-    gpu_ptr_size = theano.sandbox.cuda.opt.get_device_type_sizes()['gpu_ptr_size']
+	    gpu_ptr_size = theano.sandbox.cuda.opt.get_device_type_sizes()['gpu_ptr_size']
-    if gpu_ptr_size == 8:
+	    if gpu_ptr_size == 8:
-        nb_in = 7
+		nb_in = 7
-        len_topo = 10
+		len_topo = 10
-    elif gpu_ptr_size == 4:
+	    elif gpu_ptr_size == 4:
-        nb_in = 8
+		nb_in = 8
-        len_topo = 11
+		len_topo = 11
-    else:
+	    else:
-        raise Exception("Unexpected value for gpu_ptr_size", gpu_ptr_size)
+		raise Exception("Unexpected value for gpu_ptr_size", gpu_ptr_size)
-    vars = [tensor.tanh(ttype) for x in range(nb_in)]
+	    vars = [tensor.tanh(ttype) for x in range(nb_in)]
-    f = pfunc(vars, [reduce(operator.sub, vars)], mode=mode_with_gpu)
+	    f = pfunc(vars, [reduce(operator.sub, vars)], mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert len(topo) == len_topo
+	    assert len(topo) == len_topo
-    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 2
+	    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 2
-    assert isinstance(topo[-3].op.scalar_op, theano.scalar.basic.Sub)
+	    assert isinstance(topo[-3].op.scalar_op, theano.scalar.basic.Sub)
-    assert isinstance(topo[-2].op.scalar_op, theano.scalar.basic.Composite)
+	    assert isinstance(topo[-2].op.scalar_op, theano.scalar.basic.Composite)
-    # let debugmode catch errors
+	    # let debugmode catch errors
-    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
+	    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
-    f(*[gen() for i in range(nb_in)])
+	    f(*[gen() for i in range(nb_in)])
-    # Test the case where we can't put the computation on the gpu! their is too
+	    # Test the case where we can't put the computation on the gpu! There are too
-    # many dimensions to the input to have 2 inputs to the op!
+	    # many dimensions to the input to have 2 inputs to the op!
-    shape = (1, 2, 3, 4, 5, 6, 7, 2, 2, 3, 2, 1, 2, 2, 2,)
+	    shape = (1, 2, 3, 4, 5, 6, 7, 2, 2, 3, 2, 1, 2, 2, 2,)
-    ttype = tensor.tensor(dtype='float32', broadcastable=(False,) * len(shape))
+	    ttype = tensor.tensor(dtype='float32', broadcastable=(False,) * len(shape))
-    vars = [tensor.tanh(ttype) for x in range(7)]
+	    vars = [tensor.tanh(ttype) for x in range(7)]
-    f = pfunc(vars, [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] -
+	    f = pfunc(vars, [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] -
-                     vars[5] - vars[6]], mode=mode_with_gpu)
+		             vars[5] - vars[6]], mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert len(topo) == 1
+	    assert len(topo) == 1
-    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 0
+	    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 0
-    assert sum([isinstance(node.op, tensor.Elemwise) for node in topo]) == 1
+	    assert sum([isinstance(node.op, tensor.Elemwise) for node in topo]) == 1
-    # let debugmode catch errors
+	    # let debugmode catch errors
-    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
+	    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
-    f(gen(), gen(), gen(), gen(), gen(), gen(), gen())
+	    f(gen(), gen(), gen(), gen(), gen(), gen(), gen())
-    def gen(shape):
+	    def gen(shape):
-        return theano._asarray(numpy.random.rand(*shape), dtype='float32')
+		return theano._asarray(numpy.random.rand(*shape), dtype='float32')
-    max_var = 16  # excluded
+	    max_var = 16  # excluded
-    for shape in [(2,),
+	    for shape in [(2,),
-                  (2, 2),
+		          (2, 2),
-                  (2, 2, 2),
+		          (2, 2, 2),
-                  (2, 2, 2, 2),
+		          (2, 2, 2, 2),
-                  (2, 2, 2, 2, 2),  # 5d
+		          (2, 2, 2, 2, 2),  # 5d
-                  (2, 2, 2, 2, 2, 2),
+		          (2, 2, 2, 2, 2, 2),
-#                  (2, 2, 2, 2, 2, 2, 2),
+	#                  (2, 2, 2, 2, 2, 2, 2),
-#                  (2, 2, 2, 2, 2, 2, 2, 2),
+	#                  (2, 2, 2, 2, 2, 2, 2, 2),
-#                  (2, 2, 2, 1, 1, 1, 1, 2, 2),  # 9d
+	#                  (2, 2, 2, 1, 1, 1, 1, 2, 2),  # 9d
-                  ]:
+		          ]:
-        vals = [cuda.shared_constructor(gen(shape)) for x in range(max_var)]
+		vals = [cuda.shared_constructor(gen(shape)) for x in range(max_var)]
-        for use_tan in [True, False]:
+		for use_tan in [True, False]:
-            if use_tan:
+		    if use_tan:
-                vars = [tensor.tanh(x) for x in vals]
+		        vars = [tensor.tanh(x) for x in vals]
-            else:
+		    else:
-                vars = vals
+		        vars = vals
-            for nb_var in range(1, max_var):
+		    for nb_var in range(1, max_var):
-                out = reduce(lambda x, y: x + y, vars[:nb_var])
+		        out = reduce(lambda x, y: x + y, vars[:nb_var])
-                if not isinstance(out.type, CudaNdarrayType):
+		        if not isinstance(out.type, CudaNdarrayType):
-                    out = cuda.gpu_from_host(out)
+		            out = cuda.gpu_from_host(out)
-                f = pfunc([], [out], mode=mode_with_gpu)
+		        f = pfunc([], [out], mode=mode_with_gpu)
-                topo = f.maker.fgraph.toposort()
+		        topo = f.maker.fgraph.toposort()
-                # print shape, nb_var, use_tan, len(topo)
+		        # print shape, nb_var, use_tan, len(topo)
-                assert (sum([isinstance(node.op, cuda.GpuElemwise)
+		        assert (sum([isinstance(node.op, cuda.GpuElemwise)
-                             for node in topo]) == len(topo) or
+		                     for node in topo]) == len(topo) or
-                        (nb_var == 1 and use_tan is False))
+		                (nb_var == 1 and use_tan is False))
-                assert sum([isinstance(node.op, tensor.Elemwise)
+		        assert sum([isinstance(node.op, tensor.Elemwise)
-                            for node in topo]) == 0
+		                    for node in topo]) == 0
-                # let debugmode catch errors
+		        # let debugmode catch errors
-                f()
+		        f()
-def test_local_gpu_elemwise_0():
+	def test_local_gpu_elemwise_0():
-    """
+	    """
-    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
+	    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
-    """
+	    """
-    a = tensor.bmatrix()
+	    a = tensor.bmatrix()
-    b = tensor.fmatrix()
+	    b = tensor.fmatrix()
-    c = tensor.fmatrix()
+	    c = tensor.fmatrix()
-    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
+	    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
-    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
+	    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
-    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")
+	    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")
-    # Due to optimization order, this composite is created when all
+	    # Due to optimization order, this composite is created when all
-    # the op are on the gpu.
+	    # the ops are on the gpu.
-    f = theano.function([a, b, c], [a + b + c], mode=mode_with_gpu)
+	    f = theano.function([a, b, c], [a + b + c], mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
+	    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
-    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
+	    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
-    f(a_v, b_v, c_v)
+	    f(a_v, b_v, c_v)
-    # Now test with the composite already on the cpu before we move it
+	    # Now test with the composite already on the cpu before we move it
-    # to the gpu
+	    # to the gpu
-    a_s = theano.scalar.int8()
+	    a_s = theano.scalar.int8()
-    b_s = theano.scalar.float32()
+	    b_s = theano.scalar.float32()
-    c_s = theano.scalar.float32()
+	    c_s = theano.scalar.float32()
-    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
+	    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
-    out_op = tensor.Elemwise(out_s)
+	    out_op = tensor.Elemwise(out_s)
-    f = theano.function([a, b, c], [out_op(a, b, c)], mode=mode_with_gpu)
+	    f = theano.function([a, b, c], [out_op(a, b, c)], mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
+	    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
-    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
+	    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
-    f(a_v, b_v, c_v)
+	    f(a_v, b_v, c_v)
-def test_elemwise_fusion():
+	def test_elemwise_fusion():
-    """ Test the the GpuElemwise fusion work correctly"""
+	    """ Test that the GpuElemwise fusion works correctly"""
-    shape = (3, 4)
+	    shape = (3, 4)
-    a = cuda.shared_constructor(theano._asarray(numpy.random.rand(*shape),
+	    a = cuda.shared_constructor(theano._asarray(numpy.random.rand(*shape),
-                                                dtype='float32'), 'a')
+		                                        dtype='float32'), 'a')
-    b = tensor.fmatrix()
+	    b = tensor.fmatrix()
-    c = tensor.fmatrix()
+	    c = tensor.fmatrix()
-    f = pfunc([b, c], [a + b + c], mode=mode_with_gpu)
+	    f = pfunc([b, c], [a + b + c], mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
+	    topo = f.maker.fgraph.toposort()
-    for i, node in enumerate(topo):
+	    for i, node in enumerate(topo):
-        print >> sys.stdout, i, node
+		print >> sys.stdout, i, node
-    assert len(topo) == 4
+	    assert len(topo) == 4
-    assert isinstance(topo[2].op.scalar_op, theano.scalar.basic.Composite)
+	    assert isinstance(topo[2].op.scalar_op, theano.scalar.basic.Composite)
-    # let debugmode catch errors
+	    # let debugmode catch errors
-    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'),
+	    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'),
-      theano._asarray(numpy.random.rand(*shape), dtype='float32'))
+	      theano._asarray(numpy.random.rand(*shape), dtype='float32'))
-import theano.tests.test_ifelse
+	import theano.tests.test_ifelse
-class TestIfElse(theano.tests.test_ifelse.test_ifelse):
+	class TestIfElse(theano.tests.test_ifelse.test_ifelse):
-    dtype = "float32"
+	    dtype = "float32"
-    mode = mode_with_gpu
+	    mode = mode_with_gpu
-    cast_output = staticmethod(basic_ops.as_cuda_ndarray_variable)
+	    cast_output = staticmethod(basic_ops.as_cuda_ndarray_variable)
-    shared = staticmethod(cuda.shared_constructor)
+	    shared = staticmethod(cuda.shared_constructor)
-    def get_ifelse(self, n):
+	    def get_ifelse(self, n):
-        return theano.ifelse.IfElse(n, gpu=True, as_view=True)
+		return theano.ifelse.IfElse(n, gpu=True, as_view=True)
-def test_incsubtensor_mixed():
+	def test_incsubtensor_mixed():
-    # This catches a bug that occurred when incrementing
+	    # This catches a bug that occurred when incrementing
-    # a float32 tensor by a float64 tensor.
+	    # a float32 tensor by a float64 tensor.
-    # The result is defined to be float32, so it is OK
+	    # The result is defined to be float32, so it is OK
-    # to downcast the float64 increment in order to
+	    # to downcast the float64 increment in order to
-    # transfer it to the GPU.
+	    # transfer it to the GPU.
-    # The bug was that the optimization called GpuFromHost
+	    # The bug was that the optimization called GpuFromHost
-    # without casting first, causing the optimization to
+	    # without casting first, causing the optimization to
-    # fail.
+	    # fail.
-    X = tensor.fmatrix()
+	    X = tensor.fmatrix()
-    Y = tensor.dmatrix()
+	    Y = tensor.dmatrix()
-    Z = tensor.inc_subtensor(X[0:1, 0:1], Y)
+	    Z = tensor.inc_subtensor(X[0:1, 0:1], Y)
-    f = theano.function([X, Y], Z, mode=mode_with_gpu)
+	    f = theano.function([X, Y], Z, mode=mode_with_gpu)
-    packed, = f.maker.fgraph.inputs[1].clients
+	    packed, = f.maker.fgraph.inputs[1].clients
-    client, idx = packed
+	    client, idx = packed
-    print client
+	    print client
-    assert isinstance(client.op, tensor.Elemwise)
+	    assert isinstance(client.op, tensor.Elemwise)
-    assert isinstance(client.op.scalar_op, theano.scalar.Cast)
+	    assert isinstance(client.op.scalar_op, theano.scalar.Cast)
-    packed, = client.outputs[0].clients
+	    packed, = client.outputs[0].clients
-    client, idx = packed
+	    client, idx = packed
-    assert isinstance(client.op, cuda.GpuFromHost)
+	    assert isinstance(client.op, cuda.GpuFromHost)
-def test_erfinvgpu():
+	def test_erfinvgpu():
-    """ Test that local_gpu_elemwise_0 replaces Erfinv with ErfinvGPU """
+	    """ Test that local_gpu_elemwise_0 replaces Erfinv with ErfinvGPU """
-    x = tensor.fmatrix()
+	    x = tensor.fmatrix()
-    f = theano.function([x], tensor.Elemwise(erfinv)(x), mode=mode_with_gpu)
+	    f = theano.function([x], tensor.Elemwise(erfinv)(x), mode=mode_with_gpu)
-    f2 = theano.function([x], tensor.Elemwise(erfinv)(x),
+	    f2 = theano.function([x], tensor.Elemwise(erfinv)(x),
-                         mode=mode_without_gpu)
+		                 mode=mode_without_gpu)
-    assert isinstance(f.maker.fgraph.toposort()[1].op, cuda.GpuElemwise)
+	    assert isinstance(f.maker.fgraph.toposort()[1].op, cuda.GpuElemwise)
-    assert isinstance(f.maker.fgraph.toposort()[1].op.scalar_op,
+	    assert isinstance(f.maker.fgraph.toposort()[1].op.scalar_op,
-                      cuda.elemwise.ErfinvGPU)
+		              cuda.elemwise.ErfinvGPU)
-    xv = numpy.random.rand(7, 8).astype('float32')
+	    xv = numpy.random.rand(7, 8).astype('float32')
-    assert numpy.allclose(f(xv), f2(xv))
+	    assert numpy.allclose(f(xv), f2(xv))
-def test_local_gpu_solve():
+	def test_local_gpu_solve():
-    if not cula.cula_available:
+	    if not cula.cula_available:
-        raise SkipTest('Optional dependency CULA not available')
+		raise SkipTest('Optional dependency CULA not available')
-    numpy.random.seed(1)
+	    numpy.random.seed(1)
-    def cmp(a_shp, b_shp):
+	    def cmp(a_shp, b_shp):
-        a0 = numpy.random.uniform(-0.4, 0.4,
+		a0 = numpy.random.uniform(-0.4, 0.4,
-                                  a_shp).astype('float32')
+		                          a_shp).astype('float32')
-        a = cuda.shared_constructor(a0, 'a')
+		a = cuda.shared_constructor(a0, 'a')
-        b0 = numpy.random.uniform(-0.4, 0.4,
+		b0 = numpy.random.uniform(-0.4, 0.4,
-                                  b_shp).astype('float32')
+		                          b_shp).astype('float32')
-        b = cuda.shared_constructor(b0, 'b')
+		b = cuda.shared_constructor(b0, 'b')
-        f = pfunc([], tensor.slinalg.solve(a, b), mode=mode_with_gpu)
+		f = pfunc([], tensor.slinalg.solve(a, b), mode=mode_with_gpu)
-        assert isinstance(f.maker.fgraph.toposort()[1].inputs[0].owner.op,
+		assert isinstance(f.maker.fgraph.toposort()[1].inputs[0].owner.op,
-                          cuda.cula.GpuSolve)
+		                  cuda.cula.GpuSolve)
-        assert cuda.opt.local_gpu_solve.transform(
+		assert cuda.opt.local_gpu_solve.transform(
-            tensor.slinalg.solve(a, b).owner)
+		    tensor.slinalg.solve(a, b).owner)
-        out = f()
+		out = f()
-        assert numpy.allclose(numpy.dot(a0, out), b0)
+		assert numpy.allclose(numpy.dot(a0, out), b0)
-    cmp((6, 6), (6, 1))
+	    cmp((6, 6), (6, 1))
-    cmp((5, 5), (5, 1))
+	    cmp((5, 5), (5, 1))
-def test_local_gpu_dot_to_dot22dot():
+	def test_local_gpu_dot_to_dot22dot():
-    def cmp(a_shp, b_shp):
+	    def cmp(a_shp, b_shp):
-        a0 = numpy.random.rand(*a_shp).astype('float32')
+		a0 = numpy.random.rand(*a_shp).astype('float32')
-        a = cuda.shared_constructor(a0, 'a')
+		a = cuda.shared_constructor(a0, 'a')
-        b0 = numpy.random.rand(*b_shp).astype('float32')
+		b0 = numpy.random.rand(*b_shp).astype('float32')
-        b = cuda.shared_constructor(b0, 'b')
+		b = cuda.shared_constructor(b0, 'b')
-        f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu)
+		f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu)
-        assert cuda.opt.local_gpu_dot_to_dot22.transform(
+		assert cuda.opt.local_gpu_dot_to_dot22.transform(
-            tensor.dot(a, b).owner)
+		    tensor.dot(a, b).owner)
-        out = f()
+		out = f()
-        assert numpy.allclose(numpy.dot(a0, b0), out)
+		assert numpy.allclose(numpy.dot(a0, b0), out)
-        # Try with a matrix equal to a0, but with strides in both dims
+		# Try with a matrix equal to a0, but with strides in both dims
-        a.set_value(a0)
+		a.set_value(a0)
-        a.set_value(
+		a.set_value(
-            a.get_value(borrow=True,
+		    a.get_value(borrow=True,
-                        return_internal_type=True)[::-1],
+		                return_internal_type=True)[::-1],
-            borrow=True)
+		    borrow=True)
-        f()
+		f()
-    cmp((4,), (4, 5))
+	    cmp((4,), (4, 5))
-    cmp((3, 4), (4,))
+	    cmp((3, 4), (4,))
-class test_diag(theano.tensor.tests.test_nlinalg.test_diag):
+	class test_diag(theano.tensor.tests.test_nlinalg.test_diag):
-    mode = mode_with_gpu
+	    mode = mode_with_gpu
-    shared = staticmethod(cuda.shared_constructor)
+	    shared = staticmethod(cuda.shared_constructor)
-    floatX = 'float32'
+	    floatX = 'float32'
-    type = CudaNdarrayType
+	    type = CudaNdarrayType
-    def __init__(self, name):
+	    def __init__(self, name):
-        super(theano.tensor.tests.test_nlinalg.test_diag,
+		super(theano.tensor.tests.test_nlinalg.test_diag,
-              self).__init__(name)
+		      self).__init__(name)
-if __name__ == '__main__':
+	if __name__ == '__main__':
-    test_gpualloc()
+	    test_gpualloc()
-    test_opt_gpujoin_onlyajoin()
+	    test_opt_gpujoin_onlyajoin()
-    test_opt_gpujoin_joinvectors_elemwise_then_minusone()
+	    test_opt_gpujoin_joinvectors_elemwise_then_minusone()
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -61,6 +61,7 @@ def register_opt(*tags, **kwargs):
 register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
 register_opt('unsafe')(theano.tensor.opt.local_remove_all_assert)
 def safe_to_gpu(x):
    if isinstance(x.type, tensor.TensorType):
        return gpu_from_host(x)