Merge pull request #450 from nouiz/gpusum

Test nvidia driver

Merge pull request #450 from nouiz/gpusum
8aa08ca2 · lamblin · 8b1c4916 · 07deef6b · 8aa08ca2 · 8aa08ca2
--- a/theano/__init__.py
+++ b/theano/__init__.py
@@ -99,6 +99,11 @@ import gof
 if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
    import theano.sandbox.cuda
+# We can't test the driver during the import of theano.sandbox.cuda, as
+# this causes a circular import dependency. So we also test it manually
+# after the import.
+    import theano.sandbox.cuda.tests.test_driver
+    theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
 # Use config.numpy to call numpy.seterr
 import numpy

--- a/theano/sandbox/cuda/GpuConv3D.py
+++ b/theano/sandbox/cuda/GpuConv3D.py
@@ -7,9 +7,9 @@ from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, host_from_gp
 from theano.misc import strutil
 from theano.tensor.nnet.Conv3D import Conv3D
 from theano.sandbox.cuda.opt import register_opt
-from theano.sandbox.cuda import CudaNdarrayType
+from theano.sandbox.cuda import CudaNdarrayType, GpuOp
-class GpuConv3D(theano.Op):
+class GpuConv3D(GpuOp):
    """ GPU implementation of Conv3D """
    def __eq__(self, other):

--- a/theano/sandbox/cuda/GpuConvGrad3D.py
+++ b/theano/sandbox/cuda/GpuConvGrad3D.py
@@ -8,11 +8,12 @@ from theano.misc import strutil
 from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
 from theano.sandbox.cuda.opt import register_opt
-from theano.sandbox.cuda import CudaNdarrayType, HostFromGpu, host_from_gpu
+from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
+                                 host_from_gpu, GpuOp)
-class GpuConvGrad3D(theano.Op):
+class GpuConvGrad3D(GpuOp):
    """ GPU version of gradient of ConvGrad3D with respect to W """
    def make_node(self, V, d, WShape, dCdH):

--- a/theano/sandbox/cuda/GpuConvTransp3D.py
+++ b/theano/sandbox/cuda/GpuConvTransp3D.py
@@ -9,10 +9,11 @@ from theano.gof import local_optimizer
 from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
 from theano.sandbox.cuda.opt import register_opt
-from theano.sandbox.cuda import CudaNdarrayType, HostFromGpu, host_from_gpu
+from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
+                                 host_from_gpu, GpuOp)
-class GpuConvTransp3D(theano.Op):
+class GpuConvTransp3D(GpuOp):
    """ The gpu version of ConvTransp3D """
    def __eq__(self,other):
        return type(self) == type(other)

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
 import atexit, logging, os, shutil, stat, sys
+import numpy
+import theano
 from theano.compile import optdb
 from theano.gof.cmodule import get_lib_extension
 from theano.configparser import config, AddConfigVar, StrParam
@@ -23,7 +27,8 @@ if config.cuda.root == "AUTO":
    # set nvcc_path correctly and get the version
    nvcc_compiler.set_cuda_root()
-#is_nvcc_available called here to initialize global vars in nvcc_compiler module
+#is_nvcc_available called here to initialize global vars in
+#nvcc_compiler module
 nvcc_compiler.is_nvcc_available()
 # Compile cuda_ndarray.cu
@@ -31,8 +36,9 @@ nvcc_compiler.is_nvcc_available()
 # printed and this module will not be working properly (we set `cuda_available`
 # to False).
-# This variable is True by default, and set to False if nvcc is not available or
+# This variable is True by default, and set to False if nvcc is not
-# their is no cuda card or something goes wrong when trying to initialize cuda.
+# available or there is no cuda card or something goes wrong when
+# trying to initialize cuda.
 cuda_available = True
 # Global variable to avoid displaying the same warning multiple times.
@@ -41,6 +47,7 @@ cuda_warning_is_displayed = False
 #This variable is set to True when we enable cuda.(i.e. when use() is called)
 cuda_enabled = False
 # Code factorized within a function so that it may be called from multiple
 # places (which is not currently the case, but may be useful in the future).
 def set_cuda_disabled():
@@ -72,17 +79,18 @@ libcuda_ndarray_so = os.path.join(cuda_ndarray_loc,
                               'libcuda_ndarray.' + get_lib_extension())
-# Add the theano cache directory's cuda_ndarray subdirectory to the list of
+# Add the theano cache directory's cuda_ndarray subdirectory to the
-# places that are hard-coded into compiled modules' runtime library search
+# list of places that are hard-coded into compiled modules' runtime
-# list.  This works in conjunction with nvcc_compiler.nvcc_module_compile_str
+# library search list.  This works in conjunction with
-# which adds this folder during compilation with -L and also adds -lcuda_ndarray
+# nvcc_compiler.nvcc_module_compile_str which adds this folder during
-# when compiling modules.
+# compilation with -L and also adds -lcuda_ndarray when compiling
+# modules.
 nvcc_compiler.add_standard_rpath(cuda_ndarray_loc)
 compile_cuda_ndarray = True
 if os.path.exists(cuda_ndarray_so):
-    compile_cuda_ndarray = date>=os.stat(cuda_ndarray_so)[stat.ST_MTIME]
+    compile_cuda_ndarray = date >= os.stat(cuda_ndarray_so)[stat.ST_MTIME]
 if not compile_cuda_ndarray:
    try:
        # If we load a previously-compiled version, config.compiledir should
@@ -111,7 +119,7 @@ try:
                    include_dirs=[cuda_path], libs=['cublas'])
            from cuda_ndarray.cuda_ndarray import *
 except Exception, e:
-    _logger.error( "Failed to compile cuda_ndarray.cu: %s", str(e))
+    _logger.error("Failed to compile cuda_ndarray.cu: %s", str(e))
    set_cuda_disabled()
 if cuda_available:
@@ -129,10 +137,13 @@ if cuda_available:
            os.symlink(cuda_ndarray_so, libcuda_ndarray_so)
    try:
+        # This only tests whether the cuda driver is available and whether
+        # there is at least one GPU that supports cuda. This does not select
+        # a device.
        gpu_init()
        cuda_available = True
        cuda_initialization_error_message = ""
-        # actively closing our gpu session presents segfault-on-exit on some systems
+# actively closing our gpu session prevents segfault-on-exit on some systems
        atexit.register(gpu_shutdown)
    except EnvironmentError, e:
        cuda_available = False
@@ -162,7 +173,7 @@ if cuda_available:
    shared_constructor = float32_shared_constructor
    import basic_ops
-    from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
+    from basic_ops import (GpuOp, GpuFromHost, HostFromGpu, GpuElemwise,
                           GpuDimShuffle, GpuSum, GpuReshape, GpuContiguous,
                           GpuSubtensor, GpuIncSubtensor,
                           GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1,
@@ -180,18 +191,31 @@ def use(device,
        force=False,
        default_to_move_computation_to_gpu=True,
        move_shared_float32_to_gpu=True,
-        enable_cuda=True):
+        enable_cuda=True,
+        test_driver=True):
    """
-    Error and warning about CUDA should be displayed only when this function is called.
+    Error and warning about CUDA should be displayed only when this
-    We need to be able to load this module only to check if it is available!
+    function is called.  We need to be able to load this module only
+    to check if it is available!
+    :param device: string "cpu", "gpu", "gpuN" N is the device number to use
+    :param force: Will always raise an exception if we can't use the gpu.
+    :param default_to_move_computation_to_gpu: If gpu init succeeded, enable by
+                                               default optimization to move
+                                               computation to the gpu
+    :param move_shared_float32_to_gpu: If gpu init succeeded, put new shared
+                                       variable in float32 on the gpu.
+    :param enable_cuda: If the gpu is correctly enabled,
+                        set the variable cuda_enabled to True.
    """
    global cuda_enabled, cuda_initialization_error_message
    if force and not cuda_available and device.startswith('gpu'):
        if not nvcc_compiler.is_nvcc_available():
-            raise EnvironmentError("You forced the use of gpu device '%s', but "
+            raise EnvironmentError("You forced the use of gpu device '%s', but"
-                                   "nvcc was not found. Set it in your PATH "
+                                   " nvcc was not found. Set it in your PATH "
                                   "environment variable or set the Theano "
-                                   "flags 'cuda.root' to its directory" % device)
+                                   "flags 'cuda.root' to its directory"
+                                   "" % device)
        else:
            raise EnvironmentError("You forced the use of gpu device %s, "
                                   "but CUDA initialization failed "
@@ -206,7 +230,8 @@ def use(device,
        try:
            if cuda_initialization_error_message:
                error_addendum = " (error: %s)" % cuda_initialization_error_message
-        except NameError: # cuda_initialization_error_message is not available b/c compilation failed
+        except NameError:
+# cuda_initialization_error_message is not available b/c compilation failed
            pass
        _logger.warning('CUDA is installed, but device %s is not available %s',
                device, error_addendum)
@@ -222,29 +247,33 @@ def use(device,
        raise ValueError("Invalid device identifier", device)
    if use.device_number is None:
        # No successful call to use() has been made yet
-        if device != 'gpu' and device<0:
+        if device != 'gpu' and device < 0:
            return
-        if device in [None,""]:
+        if device in [None, ""]:
-            device=0
+            device = 0
        try:
-            if device !='gpu':
+            if device != 'gpu':
                gpu_init(device)
+            use.device_number = device
+            if test_driver:
+                import theano.sandbox.cuda.tests.test_driver
+                theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
            if move_shared_float32_to_gpu:
                handle_shared_float32(True)
-            use.device_number = device
            if enable_cuda:
                cuda_enabled = True
-            print >> sys.stderr, "Using gpu device %d: %s" % (active_device_number(), active_device_name())
+            print >> sys.stderr, "Using gpu device %d: %s" % (
+                active_device_number(), active_device_name())
        except (EnvironmentError, ValueError), e:
            _logger.error(("ERROR: Not using GPU."
                " Initialisation of device %i failed:\n%s"),
                device, e)
            cuda_enabled = False
            if force:
-                e.args+=(("You asked to force this device and it failed."
+                e.args += (("You asked to force this device and it failed."
-                        " No fallback to the cpu or other gpu device."),)
+                            " No fallback to the cpu or other gpu device."),)
                raise
    elif use.device_number != device:
@@ -264,17 +293,16 @@ def use(device,
        try:
            #in case the device if just gpu,
            # we check that the driver init it correctly.
-            cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((5,5))
+            cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((5, 5))
        except (Exception, NameError), e:
            # NameError when no gpu present as cuda_ndarray is not loaded.
-            e.args+=("ERROR: GPU forced but failed. ",)
+            e.args += ("ERROR: GPU forced but failed. ",)
            raise
 use.device_number = None
 def handle_shared_float32(tf):
-    """Set the CudaNdarrayType as the default handler for shared float32 arrays.
+    """Set the default shared type for float32 tensor to CudaNdarrayType
    This function is intended to be called from use(gpu_index), not directly.
    """
@@ -285,11 +313,14 @@ def handle_shared_float32(tf):
    else:
        raise NotImplementedError('removing our handler')
+# We can't test the driver during import here, as this causes a circular
+# import dependency. So we also test it in the file theano/__init__.py
 if config.device.startswith('gpu'):
-    use(device=config.device, force=config.force_device)
+    use(device=config.device, force=config.force_device, test_driver=False)
 elif config.init_gpu_device:
-    assert config.device=="cpu", ("We can use the Theano flag init_gpu_device"
+    assert config.device == "cpu", (
-            " only when the Theano flag device=='cpu'")
+        "We can use the Theano flag init_gpu_device"
+        " only when the Theano flag device=='cpu'")
    _logger.warning(("GPU device %s will be initialized, and used if a GPU is "
          "needed. "
          "However, no computation, nor shared variables, will be implicitly "
@@ -300,4 +331,4 @@ elif config.init_gpu_device:
        force=config.force_device,
        default_to_move_computation_to_gpu=False,
        move_shared_float32_to_gpu=False,
-        enable_cuda=False)
+        enable_cuda=False, test_driver=False)
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -33,7 +33,20 @@ def as_cuda_array(obj):
    else:
        raise TypeError("Don't know how to cast to a CudaNdarray object")
-class HostFromGpu(Op):
+class GpuOp(Op):
+    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+        if theano.sandbox.cuda.use.device_number is None:
+            theano.sandbox.cuda.use("gpu",
+                                    force=True,
+                                    default_to_move_computation_to_gpu=False,
+                                    move_shared_float32_to_gpu=False,
+                                    enable_cuda=False)
+        return super(GpuOp, self).make_thunk(node, storage_map,
+                                             compute_map, no_recycling)
+class HostFromGpu(GpuOp):
    """
    Implement the transfer from gpu to the cpu.
    """
@@ -65,7 +78,7 @@ class HostFromGpu(Op):
        return xshp
 host_from_gpu = HostFromGpu()
-class GpuFromHost(Op):
+class GpuFromHost(GpuOp):
    """
    Implement the transfer from cpu to the gpu.
    """
@@ -98,7 +111,8 @@ class GpuFromHost(Op):
        return xshp
 gpu_from_host = GpuFromHost()
-class GpuElemwise(Op):
+class GpuElemwise(GpuOp):
    """
    Implement a generic elemwise on the gpu.
    """
@@ -208,7 +222,7 @@ class GpuElemwise(Op):
    def c_code_cache_version(self):
        return self.src_generator.cache_version
-class GpuDimShuffle(Op):
+class GpuDimShuffle(GpuOp):
    """
    Implement DimShuffle on the gpu.
    """
@@ -397,7 +411,7 @@ class GpuDimShuffle(Op):
    def c_code_cache_version(self):
        return (1,0)
-class GpuSum(Op):
+class GpuSum(GpuOp):
    """GpuSum is a Reduction along some dimensions by summation.
    The dimensions along which to sum is specified by the `reduce_mask` that you pass to the
@@ -1717,7 +1731,7 @@ class GpuSum(Op):
            """ %locals()
        return sio.getvalue()
-class GpuReshape(tensor.Reshape):
+class GpuReshape(tensor.Reshape, GpuOp):
    """
    Implement Reshape on the gpu.
    """
@@ -1733,7 +1747,7 @@ class GpuReshape(tensor.Reshape):
                    ', should be %i' % (len(shp), self.ndim), shp)
        out[0] = x.reshape(tuple(shp))
-class GpuSubtensor(tensor.Subtensor):
+class GpuSubtensor(tensor.Subtensor, GpuOp):
    """
    Implement subtensor on the gpu.
    """
@@ -1764,7 +1778,7 @@ class GpuSubtensor(tensor.Subtensor):
            cdata = cdata[0]
        out[0] = x.__getitem__(cdata)
-class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1):
+class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
    """
    Implement AdvancedSubtensor1 on the gpu.
    """
@@ -1790,7 +1804,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1):
            o[j] = x[i]
        out[0] = o
-class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1):
+class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
    """
    Implement AdvancedIncSubtensor1 on the gpu.
    """
@@ -1818,7 +1832,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1):
        # CudaNdarray_Subscript() don't support Advanced slicing.
        # so we use the parent version that loop on each indices.
-class GpuIncSubtensor(tensor.IncSubtensor):
+class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
    """
    Implement IncSubtensor on the gpu.
    """
@@ -1828,7 +1842,7 @@ class GpuIncSubtensor(tensor.IncSubtensor):
        rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
        return Apply(self, [x,y]+rval.inputs[2:], [x.type()])
-class GpuFlatten(tensor.Flatten):
+class GpuFlatten(tensor.Flatten, GpuOp):
    """
    Implement Flatten on the gpu.
    """
@@ -1839,7 +1853,7 @@ class GpuFlatten(tensor.Flatten):
        out_type = CudaNdarrayType(broadcastable=host_out_broadcastable)
        return Apply(self, [x], [out_type()])
-class GpuShape(tensor.Shape):
+class GpuShape(tensor.Shape, GpuOp):
    """
    Implement Shape on the gpu.
    """
@@ -1847,7 +1861,7 @@ class GpuShape(tensor.Shape):
        return Apply(self, [x], [tensor.lvector()])
 gpu_shape = GpuShape()
-class GpuJoin(tensor.Join):
+class GpuJoin(tensor.Join, GpuOp):
    """
    Implement Join on the gpu.
    """
@@ -1924,7 +1938,7 @@ class GpuJoin(tensor.Join):
 gpu_join = GpuJoin()
-class GpuAlloc(Op):
+class GpuAlloc(GpuOp):
    """
    Implement Alloc on the gpu.
    """
@@ -2023,7 +2037,7 @@ class GpuAlloc(Op):
 gpu_alloc = GpuAlloc()
-class GpuContiguous(Op):
+class GpuContiguous(GpuOp):
    """
    Always return a c contiguous output. Copy the input only if it is
    not already c contiguous.

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -4,8 +4,9 @@ import StringIO, os
 import cuda_ndarray.cuda_ndarray as cuda
 from theano.sandbox.cuda.type import CudaNdarrayType
+from theano.sandbox.cuda import GpuOp
-class GpuDot22(Op):
+class GpuDot22(GpuOp):
    """
    Implement dot(2d, 2d) on the gpu.
    """
@@ -76,7 +77,7 @@ class GpuDot22(Op):
        """ % locals()
 gpu_dot22 = GpuDot22()
-class GpuDot22Scalar(Op):
+class GpuDot22Scalar(GpuOp):
    """
    Implement dot(2d, 2d) * scalar on the gpu.
    """
@@ -155,7 +156,7 @@ class GpuDot22Scalar(Op):
        """ % locals()
 gpu_dot22scalar = GpuDot22Scalar()
-class GpuGemm(Op):
+class GpuGemm(GpuOp):
    """
    implement the gemm on the gpu.
@@ -257,7 +258,7 @@ class GpuGemm(Op):
 gpu_gemm_no_inplace = GpuGemm(inplace=False)
 gpu_gemm_inplace = GpuGemm(inplace=True)
-class GpuGemv(Op):
+class GpuGemv(GpuOp):
    """
    implement gemv on the gpu.
@@ -348,7 +349,7 @@ class GpuGemv(Op):
 gpu_gemv_no_inplace = GpuGemv(inplace=False)
 gpu_gemv_inplace = GpuGemv(inplace=True)
-class GpuGer(Op):
+class GpuGer(GpuOp):
    """
    implement ger on the gpu.
@@ -439,7 +440,7 @@ class GpuGer(Op):
 gpu_ger_no_inplace = GpuGer(inplace=False)
 gpu_ger_inplace = GpuGer(inplace=True)
-class GpuOuter(Op):
+class GpuOuter(GpuOp):
    """ Implement outer on the gpu."""
    def make_node(self, x, y):
        # we suppose type checking has been done, but make sure.
@@ -532,7 +533,7 @@ gpu_outer = GpuOuter()
 ##
 # Not really a BLAS operation, but whatever.
 #
-class GpuConv(Op):
+class GpuConv(GpuOp):
    """
    Implement the batched and stacked 2d convolution on the gpu.
    """
@@ -698,7 +699,7 @@ class GpuConv(Op):
 """%sub
-class GpuDownsampleFactorMax(Op):
+class GpuDownsampleFactorMax(GpuOp):
    """
    Implement downsample with max on the gpu.
    """
@@ -858,7 +859,7 @@ class GpuDownsampleFactorMax(Op):
        }
        """ % locals()
-class GpuDownsampleFactorMaxGrad(Op):
+class GpuDownsampleFactorMaxGrad(GpuOp):
    """
    Implement the grad of downsample with max on the gpu.
    """

--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
@@ -3,11 +3,12 @@ from theano import tensor, scalar
 import StringIO
 from theano.sandbox.cuda.type import CudaNdarrayType
+from theano.sandbox.cuda import GpuOp
 from theano.sandbox.cuda.kernel_codegen import nvcc_kernel, inline_reduce_max, inline_reduce_sum, inline_softmax
-class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
+class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
    """
    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
    """
@@ -180,7 +181,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
 gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
-class GpuCrossentropySoftmax1HotWithBiasDx (Op):
+class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
    """
    Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
    """
@@ -302,7 +303,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
 gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
-class GpuSoftmax (Op):
+class GpuSoftmax (GpuOp):
    """
    Implement Softmax on the gpu.
    """
@@ -400,7 +401,7 @@ class GpuSoftmax (Op):
 gpu_softmax = GpuSoftmax()
-class GpuSoftmaxWithBias (Op):
+class GpuSoftmaxWithBias (GpuOp):
    """
    Implement SoftmaxWithBias on the gpu.
    """

--- a/theano/sandbox/cuda/rng_curand.py
+++ b/theano/sandbox/cuda/rng_curand.py
@@ -10,7 +10,7 @@ __contact__ = "theano-dev@googlegroups.com"
 import sys
 import numpy
 import theano.gof
-from theano.sandbox.cuda import CudaNdarrayType
+from theano.sandbox.cuda import CudaNdarrayType, GpuOp
 from theano.tensor import (get_vector_length, cast, opt)
 from theano.compile import optdb
 from theano.gof import local_optimizer, Variable
@@ -19,7 +19,7 @@ from theano.gof import local_optimizer, Variable
 config = theano.config
-class CURAND_Base(theano.gof.Op):
+class CURAND_Base(GpuOp):
    """ Base class for a random number generator implemented in CURAND.
    The random number generator itself is an opaque reference managed by

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -19,23 +19,28 @@ import theano.sandbox.cuda.basic_ops as B
 from theano.tensor.basic import _allclose
 from theano.tests import unittest_tools as utt
-if theano.config.mode=='FAST_COMPILE':
+if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
 else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
 def rand_cuda_ndarray(shape):
-    return cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
+    return cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
+                                                    dtype='float32'))
 #intentionally disabled
 def tes_use():
    tcn.use()
 def test_sum():
    """
-    test sum pattern 1, 11, 10, 01, 001, 010, 100, 110, 011, 111, 0011, 0101, 0111, 1011, 1111
+    test sum pattern 1, 11, 10, 01, 001, 010, 100, 110, 011, 111,
+    0011, 0101, 0111, 1011, 1111
    test sum pattern implemented with reshape:
    1000, 0100, 0010, 0001, 11111
@@ -91,18 +96,18 @@ def test_sum():
                           ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
                           ]:
-        a = tensor.TensorType('float32',(False,)*len(shape))()
+        a = tensor.TensorType('float32', (False,) * len(shape))()
        b = T.Sum(pattern)(a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
 #        val = numpy.ones(shape)
 #        val = numpy.arange(numpy.prod(shape)).reshape(shape)
-        val = theano._asarray(val,dtype='float32')
+        val = theano._asarray(val, dtype='float32')
-        f = theano.function([a],b, mode=mode_with_gpu)
+        f = theano.function([a], b, mode=mode_with_gpu)
-        f2 = theano.function([a],b, mode=mode_without_gpu)
+        f2 = theano.function([a], b, mode=mode_without_gpu)
        assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
-        if val.size==0:
+        if val.size == 0:
-            assert f2(val)==f(val), ('shape', shape, 'pattern', pattern)
+            assert f2(val) == f(val), ('shape', shape, 'pattern', pattern)
        else:
            try:
                #We raise the error threashold as we sum big matrix
@@ -110,7 +115,9 @@ def test_sum():
                #example in debug mode with unittests.rseed=9275
                orig_rtol = theano.tensor.basic.float32_rtol
                theano.tensor.basic.float32_rtol = 2e-5
-                assert _allclose(f2(val),f(val)), ('shape', shape, 'pattern', pattern, sum([shape[i] for i in pattern]))
+                assert _allclose(f2(val), f(val)), ('shape', shape,
+                                                    'pattern', pattern,
+                                                    sum([shape[i] for i in pattern]))
            finally:
                theano.tensor.basic.float32_rtol = orig_rtol
@@ -121,21 +128,23 @@ def test_sum():
                           ((5,4),[0,1]),((5,4),[0]),
                           ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
                           ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
-        a = tensor.TensorType('float32',(False,)*len(shape))()
+        a = tensor.TensorType('float32', (False,) * len(shape))()
        dim_pattern = range(len(shape))
-        dim_pattern[0]=1
+        dim_pattern[0] = 1
-        dim_pattern[1]=0
+        dim_pattern[1] = 0
        a = a.dimshuffle(dim_pattern)
        b = T.Sum(pattern)(a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
 #        val = numpy.ones(shape)
 #        val = numpy.arange(numpy.prod(shape)).reshape(shape)
-        val = theano._asarray(val,dtype='float32')
+        val = theano._asarray(val, dtype='float32')
-        f = theano.function([a],b, mode=mode_with_gpu)
+        f = theano.function([a], b, mode=mode_with_gpu)
-        f2 = theano.function([a],b, mode=mode_without_gpu)
+        f2 = theano.function([a], b, mode=mode_without_gpu)
        assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
-        assert _allclose(f2(val),f(val)), ('shape', shape, 'pattern', pattern, sum([shape[i] for i in pattern]))
+        assert _allclose(f2(val), f(val)), ('shape', shape,
+                                            'pattern', pattern,
+                                            sum([shape[i] for i in pattern]))
        #test with broadcast
@@ -143,116 +152,135 @@ def test_sum():
                           ((5,4),[0,1]),((5,4),[0]),
                           ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
                           ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
-        shape = numpy.asarray(shape)*2
+        shape = numpy.asarray(shape) * 2
-        a = tensor.TensorType('float32',(False,)*len(shape))()
+        a = tensor.TensorType('float32', (False,) * len(shape))()
-        a2 = tcn.CudaNdarrayType((False,)*len(shape))()
+        a2 = tcn.CudaNdarrayType((False,) * len(shape))()
        b = T.Sum(pattern)(a)
        b2 = T.Sum(pattern)(a2)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
 #        val = numpy.ones(shape)
 #        val = numpy.arange(numpy.prod(shape)).reshape(shape)
-        val = theano._asarray(val,dtype='float32')
+        val = theano._asarray(val, dtype='float32')
        val2 = cuda.CudaNdarray(val)
-        if len(shape)==1:
+        if len(shape) == 1:
            val = val[::2]
            val2 = val2[::2]
-        elif len(shape)==2:
+        elif len(shape) == 2:
-            val = val[::2,::2]
+            val = val[::2, ::2]
-            val2 = val2[::2,::2]
+            val2 = val2[::2, ::2]
-        elif len(shape)==3:
+        elif len(shape) == 3:
-            val = val[::2,::2,::2]
+            val = val[::2, ::2, ::2]
-            val2 = val2[::2,::2,::2]
+            val2 = val2[::2, ::2, ::2]
-        elif len(shape)==4:
+        elif len(shape) == 4:
-            val = val[::2,::2,::2,::2]
+            val = val[::2, ::2, ::2, ::2]
-            val2 = val2[::2,::2,::2,::2]
+            val2 = val2[::2, ::2, ::2, ::2]
-        f = theano.function([a],b, mode=mode_without_gpu)
+        f = theano.function([a], b, mode=mode_without_gpu)
-        f2 = theano.function([a2],b2, mode=mode_with_gpu)
+        f2 = theano.function([a2], b2, mode=mode_with_gpu)
        assert tcn.GpuSum in [x.op.__class__ for x in f2.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f.maker.env.toposort()]
-        assert _allclose(f2(val2),f(val)), ('shape', shape, 'pattern', pattern, sum([shape[i] for i in pattern]))
+        assert _allclose(f2(val2), f(val)), ('shape', shape,
+                                             'pattern', pattern,
+                                             sum([shape[i] for i in pattern]))
 def test_flatten():
    x = cuda.fmatrix('x')
    f = theano.function([x], x.flatten())
-    assert len(f( [[0.,0.],[0.,0.]] ).shape)==1
+    assert len(f([[0., 0.], [0., 0.]]).shape) == 1
 def test_reshape():
    a = tcn.CudaNdarrayType((False,))()
-    b = tcn.CudaNdarrayType((False,False))()
+    b = tcn.CudaNdarrayType((False, False))()
-    c = T.reshape(a, [2,3])
+    c = T.reshape(a, [2, 3])
    #basic
-    f = theano.function([a], c, mode=mode_without_gpu)
+    f = theano.function([a], c, mode=mode_with_gpu)
-    fv = f(cuda_ndarray.CudaNdarray(theano._asarray([0,1,2,3,4,5],dtype='float32')))
+    fv = f(cuda_ndarray.CudaNdarray(theano._asarray([0, 1, 2, 3, 4, 5],
-    assert numpy.all(fv == numpy.asarray([[0,1,2], [3,4,5]]))
+                                                    dtype='float32')))
+    topo = f.maker.env.toposort()
+    assert any([isinstance(node.op, B.GpuReshape) for node in topo])
+    assert numpy.all(fv == numpy.asarray([[0, 1, 2], [3, 4, 5]]))
    #test that it works without inplace operations
-    a_val = cuda_ndarray.CudaNdarray(theano._asarray([0,1,2,3,4,5],dtype='float32'))
+    a_val = cuda_ndarray.CudaNdarray(theano._asarray([0, 1, 2, 3, 4, 5],
-    a_val_copy = cuda_ndarray.CudaNdarray(theano._asarray([0,1,2,3,4,5],dtype='float32'))
+                                                     dtype='float32'))
-    b_val = cuda_ndarray.CudaNdarray(theano._asarray([[0,1,2],[3,4,5]],dtype='float32'))
+    a_val_copy = cuda_ndarray.CudaNdarray(theano._asarray([0, 1, 2, 3, 4, 5],
+                                                          dtype='float32'))
-    f_sub = theano.function([a,b], c-b, mode=mode_without_gpu)
+    b_val = cuda_ndarray.CudaNdarray(theano._asarray([[0, 1, 2], [3, 4, 5]],
+                                                     dtype='float32'))
+    f_sub = theano.function([a, b], c - b, mode=mode_with_gpu)
+    topo = f_sub.maker.env.toposort()
+    assert any([isinstance(node.op, B.GpuReshape) for node in topo])
    assert numpy.all(f_sub(a_val, b_val) == 0.0)
    assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy))
    #test that it works with inplace operations
-    a_val = theano._asarray([0,1,2,3,4,5], dtype='float32')
+    a_val = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float32')
-    a_val_copy = theano._asarray([0,1,2,3,4,5], dtype='float32')
+    a_val_copy = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float32')
-    b_val = theano._asarray([[0,1,2],[3,4,5]], dtype='float32')
+    b_val = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float32')
-    f_sub = theano.function([a,b], c-b, mode=mode_without_gpu)
+    f_sub = theano.function([a, b], c - b, mode=mode_with_gpu)
+    topo = f_sub.maker.env.toposort()
+    assert any([isinstance(node.op, B.GpuReshape) for node in topo])
    assert numpy.all(f_sub(a_val, b_val) == 0.0)
    assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy))
    # verify gradient
    def just_vals(v):
-        return T.Reshape(2)(v, theano._asarray([2,3], dtype='int32'))
+        return T.Reshape(2)(v, theano._asarray([2, 3], dtype='int32'))
    utt.verify_grad(just_vals, [a_val])
 def test_elemwise_empty():
    #test with 0 element
-    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(0,0), dtype='float32'), 'a')
+    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(0, 0),
+                                               dtype='float32'), 'a')
    b = tensor.fmatrix()
-    f = pfunc([b], [], updates=[(a, a+b)], mode=mode_with_gpu)
+    f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu)
-    f2 = pfunc([b], [], updates=[(a, a+b)], mode=mode_without_gpu)
+    f2 = pfunc([b], [], updates=[(a, a + b)], mode=mode_without_gpu)
    a0 = a.get_value() * 1.0
-    f(numpy.ones((0,0), dtype='float32'))
+    f(numpy.ones((0, 0), dtype='float32'))
    assert numpy.all(a0 + 1.0 == a.get_value())
 def test_elemwise0():
-    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(4,4), dtype='float32'), 'a')
+    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(4, 4),
+                                               dtype='float32'), 'a')
    b = tensor.fmatrix()
-    f = pfunc([b], [], updates=[(a, a+b)], mode=mode_with_gpu)
+    f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu)
    #check that we work inplace.
-    assert f.maker.env.toposort()[1].op.destroy_map.items()==[(0,[0])]
+    assert f.maker.env.toposort()[1].op.destroy_map.items() == [(0, [0])]
    a0 = a.get_value() * 1.0
    print 'BEFORE ADD', a.get_value()
    for i, node in enumerate(f.maker.env.toposort()):
        print i, node
-    f(numpy.ones((4,4), dtype='float32'))
+    f(numpy.ones((4, 4), dtype='float32'))
    print 'AFTER ADD', a.get_value()
    assert numpy.all(a0 + 1.0 == a.get_value())
 def test_elemwise_bad_broadcast():
    x = cuda.fmatrix('x')
    y = cuda.fmatrix('y')
    f = theano.function([x, y], x * y, mode=mode_with_gpu)
    print f.maker.env.toposort()
-    assert len(f.maker.env.toposort())==2
+    assert len(f.maker.env.toposort()) == 2
    assert isinstance(f.maker.env.toposort()[0].op, cuda.GpuElemwise)
-    assert f.maker.env.toposort()[1].op==cuda.host_from_gpu
+    assert f.maker.env.toposort()[1].op == cuda.host_from_gpu
    try:
        f(rand_cuda_ndarray((10, 3)), rand_cuda_ndarray((10, 1)))
@@ -261,41 +289,48 @@ def test_elemwise_bad_broadcast():
    else:
        raise Exception("Theano should have raised an error")
 def test_elemwise1():
-    """ Several kinds of elemwise expressions with no broadcasting, non power-of-two shape """
+    """ Several kinds of elemwise expressions with no broadcasting,
+    non power-of-two shape """
-    shape = (3,4)
+    shape = (3, 4)
-    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32')+0.5, 'a')
+    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
+                                               dtype='float32') + 0.5, 'a')
    b = tensor.fmatrix()
    #let debugmode catch any mistakes
    print >> sys.stdout, "STARTING FUNCTION 1"
-    f = pfunc([b], [], updates=[(a, b**a)], mode=mode_with_gpu)
+    f = pfunc([b], [], updates=[(a, b ** a)], mode=mode_with_gpu)
    for i, node in enumerate(f.maker.env.toposort()):
        print i, node
-    f(theano._asarray(numpy.random.rand(*shape), dtype='float32')+0.3)
+    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)
    print >> sys.stdout, "STARTING FUNCTION 2"
    #let debugmode catch any mistakes
-    f = pfunc([b], [], updates=[(a, tensor.exp(b**a))], mode=mode_with_gpu)
+    f = pfunc([b], [], updates=[(a, tensor.exp(b ** a))], mode=mode_with_gpu)
    for i, node in enumerate(f.maker.env.toposort()):
        print i, node
-    f(theano._asarray(numpy.random.rand(*shape), dtype='float32')+0.3)
+    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)
    print >> sys.stdout, "STARTING FUNCTION 3"
    #let debugmode catch any mistakes
-    f = pfunc([b], [], updates=[(a, a+b * tensor.exp(b**a))], mode=mode_with_gpu)
+    f = pfunc([b], [], updates=[(a, a + b * tensor.exp(b ** a))],
-    f(theano._asarray(numpy.random.rand(*shape), dtype='float32')+0.3)
+              mode=mode_with_gpu)
+    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)
 def test_elemwise2():
    """ Several kinds of elemwise expressions with dimension permutations """
    rng = numpy.random.RandomState(int(time.time()))
    print 'random?', rng.rand(3)
-    shape = (3,5)
+    shape = (3, 5)
-    for pattern in [(0,1), (1,0)]:
+    for pattern in [(0, 1), (1, 0)]:
-        a = tcn.shared_constructor(theano._asarray(rng.rand(*shape),dtype='float32'), name=None)
+        a = tcn.shared_constructor(theano._asarray(rng.rand(*shape),
-        b = tensor.Tensor(dtype='float32', broadcastable=[0]*len(shape))()
+                                                   dtype='float32'), name=None)
-        f = pfunc([b], [], updates=[(a, (a+b).dimshuffle(pattern))], mode=mode_with_gpu)
+        b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))()
+        f = pfunc([b], [], updates=[(a, (a + b).dimshuffle(pattern))],
+                  mode=mode_with_gpu)
        has_elemwise = False
        for i, node in enumerate(f.maker.env.toposort()):
            print >> sys.stdout, i, node
@@ -303,34 +338,39 @@ def test_elemwise2():
        assert not has_elemwise
        #let debugmode catch errors
        print >> sys.stdout, 'pattern', pattern
-        f(theano._asarray(rng.rand(*shape),dtype='float32')*.3)
+        f(theano._asarray(rng.rand(*shape), dtype='float32') * .3)
-    shape = (3,4,5,6)
+    shape = (3, 4, 5, 6)
-    a = tcn.shared_constructor(theano._asarray(rng.rand(*shape),dtype='float32'), 'a')
+    a = tcn.shared_constructor(theano._asarray(rng.rand(*shape),
-    b = tensor.Tensor(dtype='float32', broadcastable=[0]*len(shape))()
+                                               dtype='float32'), 'a')
-    f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) *
+    b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))()
-        tensor.exp(b**a).dimshuffle([2,0,3,1]))], mode=mode_with_gpu)
+    f = pfunc([b], [], updates=[(a, (a + b).dimshuffle([2, 0, 3, 1]) *
+        tensor.exp(b ** a).dimshuffle([2, 0, 3, 1]))], mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        print i, node
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
    assert not has_elemwise
    #let debugmode catch errors
-    f(theano._asarray(rng.rand(*shape),dtype='float32'))
+    f(theano._asarray(rng.rand(*shape), dtype='float32'))
 def test_elemwise3():
-    """ Several kinds of elemwise expressions with dimension permutations and broadcasting"""
+    """ Several kinds of elemwise expressions with dimension
+    permutations and broadcasting"""
-    shape = (3,4,5,6)
+    shape = (3, 4, 5, 6)
-    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
+    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
+                                               dtype='float32'), 'a')
    b = tensor.fvector()
    print b.type
    print tensor.constant(1).type
    print (1 + b).type
-    print (1 + b**a).type
+    print (1 + b ** a).type
-    print tensor.exp((1 + b**a)).type
+    print tensor.exp((1 + b ** a)).type
-    f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * tensor.exp(1 +
+    new_val = (a + b).dimshuffle([2, 0, 3, 1])
-        b**a).dimshuffle([2,0,3,1]))], mode=mode_with_gpu)
+    new_val *= tensor.exp(1 + b ** a).dimshuffle([2, 0, 3, 1])
+    f = pfunc([b], [], updates=[(a, new_val)], mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        print >> sys.stdout, i, node
@@ -339,75 +379,86 @@ def test_elemwise3():
    #let debugmode catch errors
    f(theano._asarray(numpy.random.rand(6), dtype='float32'))
 def test_elemwise4():
-    """ Test that two vectors can be broadcast to form an outer product (by performing rank-1 matrix update"""
+    """ Test that two vectors can be broadcast to form an outer
+    product (by performing rank-1 matrix update"""
-    shape = (3,4)
+    shape = (3, 4)
-    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
+    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
+                                               dtype='float32'), 'a')
    b = tensor.fvector()
    c = tensor.fvector()
-    f = pfunc([b,c], [], updates=[(a, (a+b.dimshuffle('x', 0)*c.dimshuffle(0, 'x')))], mode=mode_with_gpu)
+    f = pfunc([b, c], [],
+              updates=[(a, (a + b.dimshuffle('x', 0) * c.dimshuffle(0, 'x')))],
+              mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        print >> sys.stdout, i, node
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
    assert not has_elemwise
    #let debugmode catch errors
-    f(theano._asarray(numpy.random.rand(4), dtype='float32'), theano._asarray(numpy.random.rand(3), dtype='float32'))
+    f(theano._asarray(numpy.random.rand(4), dtype='float32'),
+      theano._asarray(numpy.random.rand(3), dtype='float32'))
 def test_elemwise_comparaison_cast():
    """
-    test if an elemwise comparaison followed by a cast to float32 are pushed to gpu.
+    test if an elemwise comparaison followed by a cast to float32 are
+    pushed to gpu.
    """
    a = tensor.fmatrix()
    b = tensor.fmatrix()
-    av = theano._asarray(numpy.random.rand(4,4), dtype='float32')
+    av = theano._asarray(numpy.random.rand(4, 4), dtype='float32')
-    bv = numpy.ones((4,4), dtype='float32')
+    bv = numpy.ones((4, 4), dtype='float32')
-    for g,ans in [(tensor.lt, av<bv), (tensor.gt, av>bv),
+    for g, ans in [(tensor.lt, av < bv), (tensor.gt, av > bv),
-                  (tensor.le, av<=bv), (tensor.ge, av>=bv)]:
+                   (tensor.le, av <= bv), (tensor.ge, av >= bv)]:
-        f = pfunc([a,b], tensor.cast(g(a,b),'float32'), mode=mode_with_gpu)
+        f = pfunc([a, b], tensor.cast(g(a, b), 'float32'), mode=mode_with_gpu)
        #theano.printing.debugprint(f)
-        out = f(av,bv)
+        out = f(av, bv)
        assert numpy.all(out == ans)
-        assert any([isinstance(node.op, cuda.GpuElemwise) for node in f.maker.env.toposort()])
+        assert any([isinstance(node.op, cuda.GpuElemwise)
-        #assert any([isinstance(node.op, tensor.Elemwise) for node in f.maker.env.toposort()])
+                    for node in f.maker.env.toposort()])
 def test_elemwise_composite_float64():
    # test that we don't fuse composite elemwise with float64 somewhere inside
-    # nvcc by default downcast them to float32. We would need to tell him not to
+    # nvcc by default downcast them to float32. We would need to tell him not
-    # do so, but that possible only on some device.
+    # to do so, but that possible only on some device.
    a = tensor.fmatrix()
    b = tensor.fmatrix()
-    av = theano._asarray(numpy.random.rand(4,4), dtype='float32')
+    av = theano._asarray(numpy.random.rand(4, 4), dtype='float32')
-    bv = numpy.ones((4,4), dtype='float32')
+    bv = numpy.ones((4, 4), dtype='float32')
    def get_all_basic_scalar(composite_op):
-        l=[]
+        l = []
        for i in composite_op.env.toposort():
            if isinstance(i, theano.scalar.Composite):
                l += get_all_basic_scalar(i)
            else:
                l.append(i)
        return l
-    for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'), mode_with_gpu.excluding('elemwise_fusion')]:
+    for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'),
-        f = pfunc([a,b], tensor.cast(tensor.lt(tensor.cast(a,'float64')**2,#*numpy.asarray(2, 'float32'),
+                 mode_with_gpu.excluding('elemwise_fusion')]:
+        f = pfunc([a, b],
+                  tensor.cast(tensor.lt(tensor.cast(a, 'float64') ** 2,
                                               b),
                                     'float32'), mode=mode)
        #theano.printing.debugprint(f, print_type=True)
-        out = f(av,bv)
+        out = f(av, bv)
-        assert numpy.all(out == ((av**2)<bv))
+        assert numpy.all(out == ((av ** 2) < bv))
        for node in f.maker.env.toposort():
            if isinstance(node.op, cuda.GpuElemwise):
                if isinstance(node.op.scalar_op, theano.scalar.Composite):
                    scals = get_all_basic_scalar(node.op.scalar_op)
                    for s in scals:
-                        assert not any([i.type.dtype=='float64' for i in s.inputs+s.outputs])
+                        assert not any([i.type.dtype == 'float64'
+                                        for i in s.inputs + s.outputs])
 def test_elemwise_composite_support_code():
@@ -443,205 +494,226 @@ def test_elemwise_composite_support_code():
 def speed_elemwise_collapse():
    """ used to time if the collapse of ccontiguous dims are useful """
-    shape = (30,40,50,600)
+    shape = (30, 40, 50, 600)
-    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
+    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
-    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
+                                                 dtype='float32'))
+    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
-    a3 = a2[:,::2,:,:]
+    a3 = a2[:, ::2, :, :]
    b = tcn.CudaNdarrayType((False, False, False, False))()
-    c = a3+b * tensor.exp(1 + b**a3)
+    c = a3 + b * tensor.exp(1 + b ** a3)
    f = pfunc([b], [c], mode=mode_with_gpu)
+    v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
-    v = theano._asarray(numpy.random.rand(*shape),dtype='float32')
+    v = v[:, ::2, :, :]
-    v = v[:,::2,:,:]
+    v = cuda_ndarray.CudaNdarray(v)
-    v=cuda_ndarray.CudaNdarray(v)
+    for id, n in enumerate(f.maker.env.toposort()):
-    for id,n in enumerate(f.maker.env.toposort()):
        print id, n
-    t1=time.time()
+    t1 = time.time()
    for i in range(100):
        #let debugmode catch errors
        f(v)
-    t2=time.time()
+    t2 = time.time()
 def speed_elemwise_collapse2():
-    """ used to test the speed up of the generalised collapse of ccontiguous dims"""
+    """ used to test the speed up of the generalised collapse of
+    ccontiguous dims"""
-    shape = (30,40,50,600)
+    shape = (30, 40, 50, 600)
-    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
+    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
-    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
+                                                 dtype='float32'))
+    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
-    a3 = a2[:,:,:,::2]
+    a3 = a2[:, :, :, ::2]
    b = tcn.CudaNdarrayType((False, False, False, False))()
-    c = a3+b * tensor.exp(1 + b**a3)
+    c = a3 + b * tensor.exp(1 + b ** a3)
    f = pfunc([b], [c], mode=mode_with_gpu)
+    v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
-    v = theano._asarray(numpy.random.rand(*shape),dtype='float32')
+    v = v[:, :, :, ::2]
-    v = v[:,:,:,::2]
+    v = cuda_ndarray.CudaNdarray(v)
-    v=cuda_ndarray.CudaNdarray(v)
+    for id, n in enumerate(f.maker.env.toposort()):
-    for id,n in enumerate(f.maker.env.toposort()):
        print id, n
-    t1=time.time()
+    t1 = time.time()
    for i in range(100):
        #let debugmode catch errors
        f(v)
-    t2=time.time()
+    t2 = time.time()
 def test_elemwise_collapse():
    """ Test when all inputs have one(and the same) broadcastable dimension """
-    shape = (4,5,60)
+    shape = (4, 5, 60)
-    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
+    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
-    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
+                                                 dtype='float32'))
+    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
-    a3 = a2.dimshuffle(0,'x',1,2)
+    a3 = a2.dimshuffle(0, 'x', 1, 2)
    b = tcn.CudaNdarrayType((False, True, False, False))()
-    c = a3+b
+    c = a3 + b
    f = pfunc([b], [c], mode=mode_with_gpu)
+    v = theano._asarray(numpy.random.rand(shape[0], 1, *shape[1:]),
-    v = theano._asarray(numpy.random.rand(shape[0],1,*shape[1:]),dtype='float32')
+                        dtype='float32')
-    v=cuda_ndarray.CudaNdarray(v)
+    v = cuda_ndarray.CudaNdarray(v)
    if False:
-        for id,n in enumerate(f.maker.env.toposort()):
+        for id, n in enumerate(f.maker.env.toposort()):
            print id, n
    #let debugmode catch errors
-    out=f(v)[0]
+    out = f(v)[0]
-    assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v)
+    assert numpy.allclose(out, a.reshape(shape[0], 1, *shape[1:]) + v)
    print "Expected collapse of all dimensions"
 def test_elemwise_collapse2():
    """ Test when only one inputs have one broadcastable dimension """
-    shape = (4,5,9)
+    shape = (4, 5, 9)
-    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
+    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
-    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
+                                                 dtype='float32'))
+    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
-    a3 = a2.dimshuffle(0,'x',1,2)
+    a3 = a2.dimshuffle(0, 'x', 1, 2)
    b = tcn.CudaNdarrayType((False, False, False, False))()
-    c = a3+b
+    c = a3 + b
    f = pfunc([b], [c], mode=mode_with_gpu)
+    v = theano._asarray(numpy.random.rand(shape[0], 5, *shape[1:]),
-    v = theano._asarray(numpy.random.rand(shape[0],5,*shape[1:]),dtype='float32')
+                        dtype='float32')
-    v=cuda_ndarray.CudaNdarray(v)
+    v = cuda_ndarray.CudaNdarray(v)
    if False:
-        for id,n in enumerate(f.maker.env.toposort()):
+        for id, n in enumerate(f.maker.env.toposort()):
            print id, n
    #let debugmode catch errors
-    out=f(v)[0]
+    out = f(v)[0]
-    assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v)
+    assert numpy.allclose(out, a.reshape(shape[0], 1, *shape[1:]) + v)
    print "Expected collapse to 3 dimensions"
 def test_elemwise_collapse3():
    """ Test when only one inputs have two broadcastable dimension at each ends """
-    shape = (4,5)
+    shape = (4, 5)
-    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
+    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
-    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
+                                                 dtype='float32'))
+    a = theano._asarray(numpy.random.rand(*shape),
+                        dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
-    a3 = a2.dimshuffle('x',0,1,'x')
+    a3 = a2.dimshuffle('x', 0, 1, 'x')
    b = tcn.CudaNdarrayType((False, False, False, False))()
-    c = (a3+b)
+    c = (a3 + b)
    f = pfunc([b], [c], mode=mode_with_gpu)
+    v = theano._asarray(numpy.random.rand(5, shape[0], shape[1], 4),
-    v = theano._asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32')
+                        dtype='float32')
-    v=cuda_ndarray.CudaNdarray(v)
+    v = cuda_ndarray.CudaNdarray(v)
    if False:
-        for id,n in enumerate(f.maker.env.toposort()):
+        for id, n  in enumerate(f.maker.env.toposort()):
            print id, n
    #let debugmode catch errors
-    out=f(v)[0]
+    out = f(v)[0]
-    assert numpy.allclose(out,a.reshape(1,shape[0],shape[1],1)+v)
+    assert numpy.allclose(out, a.reshape(1, shape[0], shape[1], 1) + v)
    print "Expected collapse to 3 dimensions"
 def test_elemwise_collapse4():
-    """ Test when only one inputs have two broadcastable dimension at each ends and we add a scalar"""
+    """ Test when only one inputs have two broadcastable dimension at
+    each ends and we add a scalar"""
-    shape = (4,5)
+    shape = (4, 5)
-    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
+    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
-    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
+                                                 dtype='float32'))
+    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
-    a3 = a2.dimshuffle('x',0,1,'x')
+    a3 = a2.dimshuffle('x', 0, 1, 'x')
    b = tcn.CudaNdarrayType((False, False, False, False))()
-    c = (a3+b+2)
+    c = (a3 + b + 2)
    f = pfunc([b], [c], mode=mode_with_gpu)
+    v = theano._asarray(numpy.random.rand(5, shape[0], shape[1], 4),
-    v = theano._asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32')
+                        dtype='float32')
-    v=cuda_ndarray.CudaNdarray(v)
+    v = cuda_ndarray.CudaNdarray(v)
    if False:
-        for id,n in enumerate(f.maker.env.toposort()):
+        for id, n in enumerate(f.maker.env.toposort()):
            print id, n
    #let debugmode catch errors
-    out=f(v)[0]
+    out = f(v)[0]
-    assert numpy.allclose(out,a.reshape(1,shape[0],shape[1],1)+v+2)
+    assert numpy.allclose(out, a.reshape(1, shape[0], shape[1], 1) + v + 2)
    print "Expected collapse to 3 dimensions"
 def test_elemwise_collapse5():
-    """ Test when only one inputs have two broadcastable dimension at the beginning and we add a scalar"""
+    """ Test when only one inputs have two broadcastable dimension at
+    the beginning and we add a scalar"""
-    shape = (4,5)
+    shape = (4, 5)
-    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
+    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
-    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
+                                                 dtype='float32'))
+    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
-    a3 = a2.dimshuffle('x','x',0,1)
+    a3 = a2.dimshuffle('x', 'x', 0, 1)
    b = tcn.CudaNdarrayType((False, False, False, False))()
-    c = (a3+b+2)
+    c = (a3 + b + 2)
    f = pfunc([b], [c], mode=mode_with_gpu)
+    v = theano._asarray(numpy.random.rand(5, 4, shape[0], shape[1]),
-    v = theano._asarray(numpy.random.rand(5,4,shape[0],shape[1]),dtype='float32')
+                        dtype='float32')
-    v=cuda_ndarray.CudaNdarray(v)
+    v = cuda_ndarray.CudaNdarray(v)
    if False:
-        for id,n in enumerate(f.maker.env.toposort()):
+        for id, n in enumerate(f.maker.env.toposort()):
            print id, n
    #let debugmode catch errors
-    out=f(v)[0]
+    out = f(v)[0]
-    assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v+2)
+    assert numpy.allclose(out, a.reshape(1, 1, shape[0], shape[1]) + v + 2)
    print "Expected collapse to 2 dimensions"
 def test_elemwise_collapse6():
-    """ Test when all inputs have two broadcastable dimension at the beginning"""
+    """ Test when all inputs have two broadcastable dimension at the
+    beginning"""
-    shape = (4,5)
+    shape = (4, 5)
-    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
+    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
-    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
+                                                 dtype='float32'))
+    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
-    a3 = a2.dimshuffle('x','x',0,1)
+    a3 = a2.dimshuffle('x', 'x', 0, 1)
    b = tcn.CudaNdarrayType((True, True, False, False))()
-    f = pfunc([b], [a3+b], mode=mode_with_gpu)
+    f = pfunc([b], [a3 + b], mode=mode_with_gpu)
-    v = theano._asarray(numpy.random.rand(1,1,shape[0],shape[1]),dtype='float32')
+    v = theano._asarray(numpy.random.rand(1, 1, shape[0], shape[1]),
-    v=cuda_ndarray.CudaNdarray(v)
+                        dtype='float32')
+    v = cuda_ndarray.CudaNdarray(v)
    if False:
-        for id,n in enumerate(f.maker.env.toposort()):
+        for id, n in enumerate(f.maker.env.toposort()):
            print id, n
    #let debugmode catch errors
-    out=f(v)[0]
+    out = f(v)[0]
-    assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v)
+    assert numpy.allclose(out, a.reshape(1, 1, shape[0], shape[1]) + v)
    print "Expected collapse to c contiguous"
 def test_elemwise_collapse7(atol=1e-6):
-    """ Test when one input have one broadcastable dimension and the other is a scalar"""
+    """ Test when one input have one broadcastable dimension and the
+    other is a scalar"""
-    shape = (5,4,1)
+    shape = (5, 4, 1)
-    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
+    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
-    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
+                                                 dtype='float32'))
+    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a.copy(), 'a')
    a3 = a2.dimshuffle(0, 'x', 1, 2)
-    f = pfunc([], [a3+2], mode=mode_with_gpu)
+    f = pfunc([], [a3 + 2], mode=mode_with_gpu)
    if False:
-        for id,n in enumerate(f.maker.env.toposort()):
+        for id, n in enumerate(f.maker.env.toposort()):
            print id, n
    #let debugmode catch errors
-    out=f()[0]
+    out = f()[0]
-    ans=(a+2).reshape(shape[0],1,shape[1],shape[2])
+    ans = (a + 2).reshape(shape[0], 1, shape[1], shape[2])
-    assert numpy.allclose(out,ans, atol=atol)
+    assert numpy.allclose(out, ans, atol=atol)
    print "Expected collapse to c contiguous"
@@ -651,40 +723,45 @@ def test_hostfromgpu_shape_i():
    """
    pass
-    m = mode_with_gpu.including('local_dot_to_dot22','local_dot22_to_dot22scalar','specialize')
+    m = mode_with_gpu.including('local_dot_to_dot22',
-    a=T.fmatrix('a')
+                                'local_dot22_to_dot22scalar','specialize')
-    ca=theano.sandbox.cuda.var.CudaNdarrayType((False,False))()
+    a = T.fmatrix('a')
+    ca = theano.sandbox.cuda.var.CudaNdarrayType((False, False))()
-    av=numpy.asarray(numpy.random.rand(5,4),dtype='float32')
+    av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
-    cv=cuda.CudaNdarray(numpy.asarray(numpy.random.rand(5,4),dtype='float32'))
+    cv = cuda.CudaNdarray(numpy.asarray(numpy.random.rand(5, 4),
+                                      dtype='float32'))
-    f = theano.function([a],cuda.basic_ops.gpu_from_host(a), mode=m)
+    f = theano.function([a], cuda.basic_ops.gpu_from_host(a), mode=m)
-    assert cuda.basic_ops.gpu_from_host in [x.op for x in f.maker.env.toposort()]
+    assert cuda.basic_ops.gpu_from_host in [x.op
-    f = theano.function([a],cuda.basic_ops.gpu_from_host(a).shape, mode=m)
+                                            for x in f.maker.env.toposort()]
+    f = theano.function([a], cuda.basic_ops.gpu_from_host(a).shape, mode=m)
    topo = f.maker.env.toposort()
-    assert isinstance(topo[0].op,T.opt.Shape_i)
+    assert isinstance(topo[0].op, T.opt.Shape_i)
-    assert isinstance(topo[1].op,T.opt.Shape_i)
+    assert isinstance(topo[1].op, T.opt.Shape_i)
-    assert isinstance(topo[2].op,T.opt.MakeVector)
+    assert isinstance(topo[2].op, T.opt.MakeVector)
-    assert tuple(f(av))==(5,4)
+    assert tuple(f(av)) == (5, 4)
-    f = theano.function([ca],cuda.basic_ops.host_from_gpu(ca), mode=m)
+    f = theano.function([ca], cuda.basic_ops.host_from_gpu(ca), mode=m)
-    assert cuda.basic_ops.host_from_gpu in [x.op for x in f.maker.env.toposort()]
+    assert cuda.basic_ops.host_from_gpu in [x.op
-    f = theano.function([ca],cuda.basic_ops.host_from_gpu(ca).shape, mode=m)
+                                            for x in f.maker.env.toposort()]
+    f = theano.function([ca], cuda.basic_ops.host_from_gpu(ca).shape, mode=m)
    topo = f.maker.env.toposort()
-    assert isinstance(topo[0].op,T.opt.Shape_i)
+    assert isinstance(topo[0].op, T.opt.Shape_i)
-    assert isinstance(topo[1].op,T.opt.Shape_i)
+    assert isinstance(topo[1].op, T.opt.Shape_i)
-    assert isinstance(topo[2].op,T.opt.MakeVector)
+    assert isinstance(topo[2].op, T.opt.MakeVector)
-    assert tuple(f(cv))==(5,4)
+    assert tuple(f(cv)) == (5, 4)
 # -----------------------------------------------------------------------
 import theano.sandbox.cuda as cuda_ndarray
 def test_gpujoin_assert_cndas():
    # this will end up being an ndarray, as it's float64
-    _a = numpy.asarray([[1,2],[3,4]],dtype='float64')
+    _a = numpy.asarray([[1, 2], [3, 4]], dtype='float64')
    a = theano.shared(_a)
    try:
@@ -697,64 +774,80 @@ def test_gpujoin_assert_cndas():
    assert False
 def test_gpujoin_no_rebroadcast():
-    _a = numpy.asarray([[1,2],[3,4]],dtype='float32')
+    _a = numpy.asarray([[1, 2], [3, 4]], dtype='float32')
    a = tcn.shared_constructor(_a)
-    f = theano.function([],T.join(1,a))
+    f = theano.function([], T.join(1, a))
    l = f.maker.env.toposort()
-    assert not any([isinstance(x.op,T.Rebroadcast) for x in l])
+    assert not any([isinstance(x.op, T.Rebroadcast) for x in l])
 def test_gpualloc_input_on_gpu():
-    a_val = numpy.asarray(numpy.random.rand(4,5),dtype='float32')
+    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    a = tcn.shared_constructor(a_val)
    b = T.fscalar()
-    f = theano.function([b], T.ones_like(a)+b, mode=mode_without_gpu)
+    f = theano.function([b], T.ones_like(a) + b, mode=mode_without_gpu)
-    f_gpu = theano.function([b], T.ones_like(a)+b, mode=mode_with_gpu)
+    f_gpu = theano.function([b], T.ones_like(a) + b, mode=mode_with_gpu)
+    assert sum([node.op == T.alloc for node in f.maker.env.toposort()]) == 1
+    assert sum([node.op == B.gpu_alloc
+                for node in f_gpu.maker.env.toposort()]) == 1
-    assert sum([node.op == T.alloc for node in f.maker.env.toposort()])==1
+    assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape) + 9,
-    assert sum([node.op == B.gpu_alloc for node in f_gpu.maker.env.toposort()])==1
+                          f_gpu(9))
+    assert numpy.allclose(f(5), f_gpu(5))
-    assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape)+9,f_gpu(9))
-    assert numpy.allclose(f(5),f_gpu(5))
 def test_gpujoin_gpualloc():
    a = T.fmatrix('a')
-    a_val = numpy.asarray(numpy.random.rand(4,5),dtype='float32')
+    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    b = T.fmatrix('b')
-    b_val = numpy.asarray(numpy.random.rand(3,5),dtype='float32')
+    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')
-    f = theano.function([a,b], T.join(0,T.zeros_like(a),T.ones_like(b))+4, mode=mode_without_gpu)
+    f = theano.function([a, b], T.join(0, T.zeros_like(a),T.ones_like(b)) + 4,
-    f_gpu = theano.function([a,b], T.join(0,T.zeros_like(a),T.ones_like(b)), mode=mode_with_gpu)
+                        mode=mode_without_gpu)
-    f_gpu2 = theano.function([a,b], T.join(0,T.zeros_like(a),T.ones_like(b))+4, mode=mode_with_gpu)
+    f_gpu = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)),
+                            mode=mode_with_gpu)
+    f_gpu2 = theano.function([a, b], T.join(0, T.zeros_like(a),
+                                           T.ones_like(b)) + 4,
+                             mode=mode_with_gpu)
+    assert sum([node.op == T.alloc for node in f.maker.env.toposort()]) == 2
+    assert sum([node.op == T.join for node in f.maker.env.toposort()]) == 1
+    assert sum([node.op == B.gpu_alloc
+                for node in f_gpu.maker.env.toposort()]) == 2
+    assert sum([node.op == B.gpu_join
+                for node in f_gpu.maker.env.toposort()]) == 1
+    assert sum([node.op == B.gpu_alloc
+                for node in f_gpu2.maker.env.toposort()]) == 2
+    assert sum([node.op == B.gpu_join
+                for node in f_gpu2.maker.env.toposort()]) == 1
+    assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
-    assert sum([node.op == T.alloc for node in f.maker.env.toposort()])==2
-    assert sum([node.op == T.join for node in f.maker.env.toposort()])==1
-    assert sum([node.op == B.gpu_alloc for node in f_gpu.maker.env.toposort()])==2
-    assert sum([node.op == B.gpu_join for node in f_gpu.maker.env.toposort()])==1
-    assert sum([node.op == B.gpu_alloc for node in f_gpu2.maker.env.toposort()])==2
-    assert sum([node.op == B.gpu_join for node in f_gpu2.maker.env.toposort()])==1
-    assert numpy.allclose(f(a_val,b_val),f_gpu2(a_val,b_val))
 def test_gpualloc_output_to_gpu():
-    a_val = numpy.asarray(numpy.random.rand(4,5),dtype='float32')
+    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    a = tcn.shared_constructor(a_val)
    b = T.fscalar()
-    f = theano.function([b], T.ones_like(a)+b, mode=mode_without_gpu)
+    f = theano.function([b], T.ones_like(a) + b, mode=mode_without_gpu)
-    f_gpu = theano.function([b], B.gpu_from_host(T.ones_like(a))+b, mode=mode_with_gpu)
+    f_gpu = theano.function([b], B.gpu_from_host(T.ones_like(a)) + b,
+                            mode=mode_with_gpu)
    print f.maker.env.toposort()
    print f_gpu.maker.env.toposort()
    print f(2)
    print f_gpu(2)
-    assert sum([node.op == T.alloc for node in f.maker.env.toposort()])==1
+    assert sum([node.op == T.alloc for node in f.maker.env.toposort()]) == 1
-    assert sum([node.op == B.gpu_alloc for node in f_gpu.maker.env.toposort()])==1
+    assert sum([node.op == B.gpu_alloc
+                for node in f_gpu.maker.env.toposort()]) == 1
-    assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape)+9,f_gpu(9))
+    assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape) + 9,
-    assert numpy.allclose(f(5),f_gpu(5))
+                          f_gpu(9))
+    assert numpy.allclose(f(5), f_gpu(5))
 import theano.tensor.tests.test_basic
@@ -766,6 +859,7 @@ class TestAlloc(theano.tensor.tests.test_basic.TestAlloc):
    shared = staticmethod(cuda.shared_constructor)
    allocs = [B.GpuAlloc, B.GpuAlloc, tensor.Alloc]
 class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
    def setUp(self):
        utt.seed_rng()
@@ -783,128 +877,152 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
 # This is to don't duplicate test.
 class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
-    shared=staticmethod(cuda.shared_constructor)
+    shared = staticmethod(cuda.shared_constructor)
-    sub=cuda.GpuSubtensor
+    sub = cuda.GpuSubtensor
-    inc_sub=cuda.GpuIncSubtensor
+    inc_sub = cuda.GpuIncSubtensor
-    adv_sub1=cuda.GpuAdvancedSubtensor1
+    adv_sub1 = cuda.GpuAdvancedSubtensor1
-    adv_incsub1=cuda.GpuAdvancedIncSubtensor1
+    adv_incsub1 = cuda.GpuAdvancedIncSubtensor1
-    mode=mode_with_gpu
+    mode = mode_with_gpu
-    dtype='float32'
+    dtype = 'float32'
-    ignore_topo=(B.HostFromGpu, B.GpuFromHost)
+    ignore_topo = (B.HostFromGpu, B.GpuFromHost)
    fast_compile = theano.config.mode == 'FAST_COMPILE'
    def __init__(self, name):
-        return super(theano.tensor.tests.test_basic.T_subtensor, self).__init__(name)
+        return super(theano.tensor.tests.test_basic.T_subtensor,
+                     self).__init__(name)
 def test_advinc_subtensor1():
    """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
    shared = cuda.shared_constructor
    #shared = tensor.shared
-    xval = numpy.asarray([[1,2,3], [4,5,6], [7,8,9]],
+    xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                      dtype='float32')
-    yval = numpy.asarray([[10,10,10], [10,10,10]],
+    yval = numpy.asarray([[10, 10, 10], [10, 10, 10]],
                      dtype='float32')
-    x = shared(xval, name = 'x')
+    x = shared(xval, name='x')
    y = T.fmatrices('y')
-    expr = T.advanced_inc_subtensor1(x,y,[0,2])
+    expr = T.advanced_inc_subtensor1(x, y, [0, 2])
-    f=theano.function([y], expr, mode=mode_with_gpu)
+    f = theano.function([y], expr, mode=mode_with_gpu)
-    assert sum([isinstance(node.op,cuda.GpuAdvancedIncSubtensor1) for node in f.maker.env.toposort() ])==1
+    assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
-    assert numpy.allclose(f(yval),[[11.,12.,13.], [4.,5.,6.], [17.,18.,19.]])
+                for node in f.maker.env.toposort()]) == 1
+    assert numpy.allclose(f(yval), [[11., 12., 13.], [4., 5., 6.],
+                                    [17., 18., 19.]])
 def test_inc_subtensor():
    shared = cuda.shared_constructor
    #shared = tensor.shared
-    x,y = T.fmatrices('x','y')
+    x, y = T.fmatrices('x', 'y')
-    xval = numpy.asarray([[1,2,3], [4,5,6], [7,8,9]],
+    xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                      dtype='float32')
-    yval = numpy.asarray([[10,10,10], [10,10,10], [10,10,10]],
+    yval = numpy.asarray([[10, 10, 10], [10, 10, 10], [10, 10, 10]],
                      dtype='float32')
-    expr = T.inc_subtensor(x[:,1:3], y[:,1:3])
+    expr = T.inc_subtensor(x[:, 1:3], y[:, 1:3])
-    f=theano.function([x,y], expr, mode=mode_with_gpu)
+    f = theano.function([x, y], expr, mode=mode_with_gpu)
    print f.maker.env.toposort()
-    assert sum([isinstance(node.op,cuda.GpuSubtensor) for node in f.maker.env.toposort() ])==1
+    assert sum([isinstance(node.op, cuda.GpuSubtensor)
-    assert sum([isinstance(node.op,cuda.GpuIncSubtensor) and node.op.set_instead_of_inc==False for node in f.maker.env.toposort() ])==1
+                for node in f.maker.env.toposort()]) == 1
-    assert numpy.allclose(f(xval,yval),[[1.,12.,13.], [4.,15.,16.], [7.,18.,19.]])
+    assert sum([isinstance(node.op, cuda.GpuIncSubtensor) and
+                node.op.set_instead_of_inc==False
+                for node in f.maker.env.toposort()]) == 1
+    assert numpy.allclose(f(xval, yval), [[1., 12., 13.],
+                                          [4., 15., 16.], [7., 18., 19.]])
 def test_set_subtensor():
    shared = cuda.shared_constructor
    #shared = tensor.shared
-    x,y = T.fmatrices('x','y')
+    x, y = T.fmatrices('x', 'y')
-    xval = numpy.asarray([[1,2,3], [4,5,6], [7,8,9]],
+    xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                      dtype='float32')
-    yval = numpy.asarray([[10,10,10], [10,10,10], [10,10,10]],
+    yval = numpy.asarray([[10, 10, 10], [10, 10, 10], [10, 10, 10]],
                      dtype='float32')
-    expr = T.set_subtensor(x[:,1:3], y[:,1:3])
+    expr = T.set_subtensor(x[:, 1:3], y[:, 1:3])
-    f=theano.function([x,y], expr, mode=mode_with_gpu)
+    f = theano.function([x, y], expr, mode=mode_with_gpu)
-    assert sum([isinstance(node.op,cuda.GpuSubtensor) for node in f.maker.env.toposort() ])==1
+    assert sum([isinstance(node.op, cuda.GpuSubtensor)
-    assert sum([isinstance(node.op,cuda.GpuIncSubtensor) and node.op.set_instead_of_inc==True for node in f.maker.env.toposort() ])==1
+                for node in f.maker.env.toposort()]) == 1
-    print f(xval,yval)
+    assert sum([isinstance(node.op, cuda.GpuIncSubtensor) and
+                node.op.set_instead_of_inc == True
+                for node in f.maker.env.toposort()]) == 1
+    print f(xval, yval)
 def test_many_arg_elemwise():
    """this test checks whether the + and * elemwise ops can handle extremely large numbers of
    arguments on gpu
    i.e., it is a test of the optimization theano/sandbox/cuda/opt.py:local_gpu_huge_add_or_mul """
-    rng = numpy.random.RandomState( [1,2,3])
+    rng = numpy.random.RandomState([1, 2, 3])
    for num_args in [25]:
-        for op_to_test in [ theano.tensor.add, theano.tensor.mul ]:
+        for op_to_test in [theano.tensor.add, theano.tensor.mul]:
-            for nb_dim in [2,3,4,5]:
+            for nb_dim in [2, 3, 4, 5]:
-                shapes = [rng.randint(1,5) for i in range(nb_dim)]
+                shapes = [rng.randint(1, 5) for i in range(nb_dim)]
-                args = [ numpy.cast['float32'](rng.randn(*shapes)) for arg in xrange(0,num_args) ]
+                args = [numpy.cast['float32'](rng.randn(*shapes))
+                        for arg in xrange(0, num_args)]
-                symb_args = [ theano.tensor.TensorType('float32', (False,)*nb_dim)() for arg in xrange(0,num_args) ]
+                symb_args = [theano.tensor.TensorType('float32',
+                                                      (False,)*nb_dim)()
+                             for arg in xrange(0, num_args)]
                outputs = []
-                for mode in [ mode_with_gpu, mode_without_gpu ]:
+                for mode in [mode_with_gpu, mode_without_gpu]:
                    #test the optijmization local_gpu_elemwise_0
-                    f = theano.function( symb_args, op_to_test(*symb_args), mode = mode.excluding("local_gpu_elemwise_1") )
+                    f = theano.function(
-                    outputs.append( f( * args) )
+                        symb_args, op_to_test(*symb_args),
+                        mode=mode.excluding("local_gpu_elemwise_1"))
+                    outputs.append(f(*args))
                    #assert that the test was done on the gpu.
                    if mode is mode_with_gpu:
-                        assert any([isinstance(node.op, cuda.GpuElemwise) for node in f.maker.env.nodes])
+                        assert any([isinstance(node.op, cuda.GpuElemwise)
+                                    for node in f.maker.env.nodes])
                    #test the optijmization local_gpu_elemwise_1
-                    f = theano.function( symb_args,
+                    f = theano.function(
-                                         cuda.gpu_from_host(op_to_test(*symb_args)),
+                        symb_args,
-                                         mode = mode.excluding("local_gpu_elemwise_0") )
+                        cuda.gpu_from_host(op_to_test(*symb_args)),
-                    out = f( * args)
+                        mode=mode.excluding("local_gpu_elemwise_0"))
+                    out = f(*args)
                    #assert that the test was done on the gpu.
                    if mode is mode_with_gpu:
-                        assert any([isinstance(node.op, cuda.GpuElemwise) for node in f.maker.env.nodes])
+                        assert any([isinstance(node.op, cuda.GpuElemwise)
+                                    for node in f.maker.env.nodes])
                    assert numpy.allclose(out, outputs[-1])
                results_gpu, results_cpu = outputs
                assert numpy.allclose(results_gpu, results_cpu)
 def test_duplicate_arg_elemwise():
    A = theano.tensor.fmatrix()
    B = A + A
-    f = theano.function([A],B, mode = mode_with_gpu)
+    f = theano.function([A], B, mode=mode_with_gpu)
-    Aval = numpy.random.RandomState([1,2,3]).randn(5,5).astype('float32')
+    Aval = numpy.random.RandomState([1, 2, 3]).randn(5, 5).astype('float32')
    Bval = Aval + Aval
-    assert numpy.allclose(Bval,f(Aval))
+    assert numpy.allclose(Bval, f(Aval))
 def test_shared_float32():
    '''Test use of cuda.shared_constructor through theano.shared'''
    # Register cuda.shared_constructor in theano.shared
    theano.shared.constructors.append(cuda.shared_constructor)
-    a = theano.shared(numpy.ones((2,3), dtype='float32'))
+    a = theano.shared(numpy.ones((2, 3), dtype='float32'))
    assert isinstance(a.type, tcn.CudaNdarrayType)
    # Unregister
    del theano.shared.constructors[-1]
 def test_shared_cudandarray():
-    '''Test that we can create a CudaNdarraySharedVariable from a CudaNdarray'''
+    '''Test that we can create a CudaNdarraySharedVariable from a
-    a = cuda.shared_constructor(cuda.CudaNdarray.zeros((2,3)))
+    CudaNdarray'''
+    a = cuda.shared_constructor(cuda.CudaNdarray.zeros((2, 3)))
    assert isinstance(a.type, tcn.CudaNdarrayType)
@@ -987,38 +1105,38 @@ class test_size(unittest.TestCase):
 import theano.tensor.tests.test_sharedvar
 #This test the case when the shared constructor view an CudaNdarray as input
 test_shared_options = theano.tensor.tests.test_sharedvar.makeSharedTester(
-    shared_constructor_ = tcn.shared_constructor,
+    shared_constructor_=tcn.shared_constructor,
-    dtype_ = 'float32',
+    dtype_='float32',
-    get_value_borrow_true_alias_ = True,
+    get_value_borrow_true_alias_=True,
-    shared_borrow_true_alias_ = True,#True when the original value is already a CudaNdarray!
+    shared_borrow_true_alias_=True,#True when the original value is already a CudaNdarray!
-    set_value_borrow_true_alias_ = True,
+    set_value_borrow_true_alias_=True,
-    set_value_inplace_ = True,
+    set_value_inplace_=True,
-    set_cast_value_inplace_ = False,
+    set_cast_value_inplace_=False,
-    shared_constructor_accept_ndarray_ = True,
+    shared_constructor_accept_ndarray_=True,
-    internal_type_ = cuda_ndarray.CudaNdarray,
+    internal_type_=cuda_ndarray.CudaNdarray,
-    test_internal_type_ = lambda a: isinstance(a,cuda_ndarray.CudaNdarray),
+    test_internal_type_=lambda a: isinstance(a, cuda_ndarray.CudaNdarray),
-    theano_fct_ = theano.tensor.exp,
+    theano_fct_=theano.tensor.exp,
-    ref_fct_ = numpy.exp,
+    ref_fct_=numpy.exp,
-    cast_value_ = cuda.as_cuda_array,
+    cast_value_=cuda.as_cuda_array,
-    op_by_matrix_ = True,
+    op_by_matrix_=True,
    name='test_shared_options')
 #This test the case when the shared constructor view an ndarray as input
 test_shared_options2 = theano.tensor.tests.test_sharedvar.makeSharedTester(
-    shared_constructor_ = tcn.shared_constructor,
+    shared_constructor_=tcn.shared_constructor,
-    dtype_ = 'float32',
+    dtype_='float32',
-    get_value_borrow_true_alias_ = False,
+    get_value_borrow_true_alias_=False,
-    shared_borrow_true_alias_ = False,
+    shared_borrow_true_alias_=False,
-    set_value_borrow_true_alias_ = False,
+    set_value_borrow_true_alias_=False,
-    set_value_inplace_ = True,
+    set_value_inplace_=True,
-    set_cast_value_inplace_ = True,
+    set_cast_value_inplace_=True,
-    shared_constructor_accept_ndarray_ = True,
+    shared_constructor_accept_ndarray_=True,
-    internal_type_ = cuda_ndarray.CudaNdarray,
+    internal_type_=cuda_ndarray.CudaNdarray,
-    test_internal_type_ = lambda a: isinstance(a,cuda_ndarray.CudaNdarray),
+    test_internal_type_=lambda a: isinstance(a, cuda_ndarray.CudaNdarray),
-    theano_fct_ = theano.tensor.exp,
+    theano_fct_=theano.tensor.exp,
-    ref_fct_ = numpy.exp,
+    ref_fct_=numpy.exp,
-    cast_value_ = numpy.asarray,
+    cast_value_=numpy.asarray,
-    op_by_matrix_ = True,
+    op_by_matrix_=True,
    name='test_shared_options')
 if __name__ == '__main__':

--- a/theano/sandbox/cuda/tests/test_driver.py
+++ b/theano/sandbox/cuda/tests/test_driver.py
+import numpy
+import theano
+# Skip test if cuda_ndarray is not available.
+from nose.plugins.skip import SkipTest
+import theano.sandbox.cuda as cuda_ndarray
+if cuda_ndarray.cuda_available == False:
+    raise SkipTest('Optional package cuda disabled')
+import theano.sandbox.cuda as cuda
+import theano.sandbox.cuda.basic_ops as B
+# Compilation mode shared by the tests below; 'gpu' is always passed to
+# including().  When the configured mode is FAST_COMPILE, FAST_RUN is used
+# as the base mode instead of the default mode.
+# NOTE(review): presumably because FAST_COMPILE would not apply the gpu
+# optimizations these tests rely on -- confirm.
+if theano.config.mode == 'FAST_COMPILE':
+    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
+else:
+    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
+def test_nvidia_driver1():
+    """Check that the installed nvidia driver computes reductions correctly.
+
+    Some nvidia drivers give bad results for reductions.  This sums a
+    float32 vector of 10000 random values with a function compiled in
+    `mode_with_gpu` and compares the result with numpy's sum of the same
+    values.
+
+    Raises an Exception carrying installation advice when the two sums are
+    not close according to numpy.allclose.
+    """
+    a = numpy.random.rand(10000).astype("float32")
+    A = cuda.shared_constructor(a)
+    f = theano.function(inputs=[], outputs=A.sum(), mode=mode_with_gpu)
+    topo = f.maker.env.toposort()
+    # The compiled graph is expected to hold exactly two nodes, exactly one
+    # of which applies a GpuSum op.
+    assert len(topo) == 2
+    assert sum(isinstance(node.op, B.GpuSum) for node in topo) == 1
+    if not numpy.allclose(f(), a.sum()):
+        # Adjacent string literals are joined as-is, so each fragment that
+        # is followed by another sentence or word must end with a space.
+        raise Exception("The nvidia driver version installed with the OS "
+                        "don't give good result for reduction. "
+                        "Installing the nvidia driver available on the same "
+                        "download page as the cuda package will fix the "
+                        "problem: http://developer.nvidia.com/cuda-downloads")
+def test_nvidia_driver2():
+    """Creating a gpu shared variable by hand must initialize the device.
+
+    Once cuda.shared_constructor has been called on a float32 vector,
+    theano.sandbox.cuda.use.device_number must no longer be None.  The
+    driver should always be tested while theano initializes the gpu
+    device.
+    """
+    host_values = numpy.random.rand(10000)
+    host_values = host_values.astype("float32")
+    cuda.shared_constructor(host_values)
+    device_number = theano.sandbox.cuda.use.device_number
+    assert device_number is not None
+def test_nvidia_driver3():
+    """Compiling a function with a gpu op must initialize the device.
+
+    Builds `var + 1` on a cuda.fvector in `mode_with_gpu`, checks that the
+    compiled graph holds a GpuElemwise node, then checks that
+    theano.sandbox.cuda.use.device_number is no longer None.  The driver
+    should always be tested while theano initializes the gpu device.
+    """
+    x = cuda.fvector()
+    fn = theano.function([x], x + 1, mode=mode_with_gpu)
+    compiled_ops = [node.op for node in fn.maker.env.toposort()]
+    assert any(isinstance(op, cuda.GpuElemwise) for op in compiled_ops)
+    device_number = theano.sandbox.cuda.use.device_number
+    assert device_number is not None
+# TODO: make sure the test_nvidia_driver tests are also executed when a
+# CudaNdarray is created manually, e.g. cuda.CudaNdarray.zeros((5, 4)).
--- a/theano/sandbox/cuda/var.py
+++ b/theano/sandbox/cuda/var.py
@@ -169,6 +169,12 @@ def cuda_shared_constructor(value, name=None, strict=False,
 def float32_shared_constructor(value, name=None, strict=False,
        allow_downcast=None, borrow=False, broadcastable=None):
    """SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or CudaNdarray"""
+    if theano.sandbox.cuda.use.device_number is None:
+        theano.sandbox.cuda.use("gpu",
+                                force=True,
+                                default_to_move_computation_to_gpu=False,
+                                move_shared_float32_to_gpu=False,
+                                enable_cuda=False)
    # if value isn't a float32 ndarray, or a CudaNdarray then raise

--- a/theano/sandbox/multinomial.py
+++ b/theano/sandbox/multinomial.py
@@ -5,7 +5,7 @@ from theano.gof import local_optimizer
 from theano.sandbox.cuda import cuda_available
 if cuda_available:
-    from theano.sandbox.cuda import CudaNdarrayType
+    from theano.sandbox.cuda import CudaNdarrayType, GpuOp
    from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
    from theano.sandbox.cuda.opt import register_opt
@@ -120,7 +120,7 @@ class MultinomialFromUniform(Op):
        """ % locals()
-class GpuMultinomialFromUniform(MultinomialFromUniform):
+class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
    """
    The output is transposed compared to MultinomialFromUniform.
    We must insert a Transpose op after it.

--- a/theano/sandbox/neighbours.py
+++ b/theano/sandbox/neighbours.py
@@ -5,7 +5,7 @@ from theano.gof import local_optimizer
 from theano.sandbox.cuda import cuda_available
 if cuda_available:
-    from theano.sandbox.cuda import CudaNdarrayType
+    from theano.sandbox.cuda import CudaNdarrayType, GpuOp
    from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
    from theano.sandbox.cuda.opt import register_opt as register_gpu_opt
@@ -292,7 +292,7 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
 # This is work in progress
-class GpuImages2Neibs(Images2Neibs):
+class GpuImages2Neibs(Images2Neibs, GpuOp):
    def __init__(self, mode='valid'):
        if mode not in ['valid', 'wrap_centered']:
            raise NotImplementedError("Only the mode valid and wrap_centered"

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -20,7 +20,10 @@ import multinomial
 from theano.sandbox.cuda import cuda_available, cuda_enabled
 if cuda_available:
-    from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor
+    from theano.sandbox.cuda import (CudaNdarrayType,
+                                     float32_shared_constructor,
+                                     GpuOp)
 def mulmod(a, b, c, m):
    r = numpy.int32((numpy.int64(a)*b + c) % m)
@@ -372,7 +375,7 @@ class mrg_uniform(mrg_uniform_base):
    def c_code_cache_version(self):
        return (1,)
-class GPU_mrg_uniform(mrg_uniform_base):
+class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
    #GPU VERSION
    @classmethod