提交 8aa08ca2 authored 作者: lamblin's avatar lamblin

Merge pull request #450 from nouiz/gpusum

Test nvidia driver
...@@ -99,6 +99,11 @@ import gof ...@@ -99,6 +99,11 @@ import gof
if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'): if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
import theano.sandbox.cuda import theano.sandbox.cuda
# We can't test the driver during import of theano.sandbox.cuda as
# this cause circular import dependency. So we also test it manually
# after the import
import theano.sandbox.cuda.tests.test_driver
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
# Use config.numpy to call numpy.seterr # Use config.numpy to call numpy.seterr
import numpy import numpy
......
...@@ -7,9 +7,9 @@ from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, host_from_gp ...@@ -7,9 +7,9 @@ from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, host_from_gp
from theano.misc import strutil from theano.misc import strutil
from theano.tensor.nnet.Conv3D import Conv3D from theano.tensor.nnet.Conv3D import Conv3D
from theano.sandbox.cuda.opt import register_opt from theano.sandbox.cuda.opt import register_opt
from theano.sandbox.cuda import CudaNdarrayType from theano.sandbox.cuda import CudaNdarrayType, GpuOp
class GpuConv3D(theano.Op): class GpuConv3D(GpuOp):
""" GPU implementation of Conv3D """ """ GPU implementation of Conv3D """
def __eq__(self, other): def __eq__(self, other):
......
...@@ -8,11 +8,12 @@ from theano.misc import strutil ...@@ -8,11 +8,12 @@ from theano.misc import strutil
from theano.tensor.nnet.ConvGrad3D import ConvGrad3D from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
from theano.sandbox.cuda.opt import register_opt from theano.sandbox.cuda.opt import register_opt
from theano.sandbox.cuda import CudaNdarrayType, HostFromGpu, host_from_gpu from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
host_from_gpu, GpuOp)
class GpuConvGrad3D(theano.Op): class GpuConvGrad3D(GpuOp):
""" GPU version of gradient of ConvGrad3D with respect to W """ """ GPU version of gradient of ConvGrad3D with respect to W """
def make_node(self, V, d, WShape, dCdH): def make_node(self, V, d, WShape, dCdH):
......
...@@ -9,10 +9,11 @@ from theano.gof import local_optimizer ...@@ -9,10 +9,11 @@ from theano.gof import local_optimizer
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.sandbox.cuda.opt import register_opt from theano.sandbox.cuda.opt import register_opt
from theano.sandbox.cuda import CudaNdarrayType, HostFromGpu, host_from_gpu from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
host_from_gpu, GpuOp)
class GpuConvTransp3D(theano.Op): class GpuConvTransp3D(GpuOp):
""" The gpu version of ConvTransp3D """ """ The gpu version of ConvTransp3D """
def __eq__(self,other): def __eq__(self,other):
return type(self) == type(other) return type(self) == type(other)
......
...@@ -33,7 +33,20 @@ def as_cuda_array(obj): ...@@ -33,7 +33,20 @@ def as_cuda_array(obj):
else: else:
raise TypeError("Don't know how to cast to a CudaNdarray object") raise TypeError("Don't know how to cast to a CudaNdarray object")
class HostFromGpu(Op):
class GpuOp(Op):
def make_thunk(self, node, storage_map, compute_map, no_recycling):
if theano.sandbox.cuda.use.device_number is None:
theano.sandbox.cuda.use("gpu",
force=True,
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False)
return super(GpuOp, self).make_thunk(node, storage_map,
compute_map, no_recycling)
class HostFromGpu(GpuOp):
""" """
Implement the transfer from gpu to the cpu. Implement the transfer from gpu to the cpu.
""" """
...@@ -65,7 +78,7 @@ class HostFromGpu(Op): ...@@ -65,7 +78,7 @@ class HostFromGpu(Op):
return xshp return xshp
host_from_gpu = HostFromGpu() host_from_gpu = HostFromGpu()
class GpuFromHost(Op): class GpuFromHost(GpuOp):
""" """
Implement the transfer from cpu to the gpu. Implement the transfer from cpu to the gpu.
""" """
...@@ -98,7 +111,8 @@ class GpuFromHost(Op): ...@@ -98,7 +111,8 @@ class GpuFromHost(Op):
return xshp return xshp
gpu_from_host = GpuFromHost() gpu_from_host = GpuFromHost()
class GpuElemwise(Op):
class GpuElemwise(GpuOp):
""" """
Implement a generic elemwise on the gpu. Implement a generic elemwise on the gpu.
""" """
...@@ -208,7 +222,7 @@ class GpuElemwise(Op): ...@@ -208,7 +222,7 @@ class GpuElemwise(Op):
def c_code_cache_version(self): def c_code_cache_version(self):
return self.src_generator.cache_version return self.src_generator.cache_version
class GpuDimShuffle(Op): class GpuDimShuffle(GpuOp):
""" """
Implement DimShuffle on the gpu. Implement DimShuffle on the gpu.
""" """
...@@ -397,7 +411,7 @@ class GpuDimShuffle(Op): ...@@ -397,7 +411,7 @@ class GpuDimShuffle(Op):
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,0) return (1,0)
class GpuSum(Op): class GpuSum(GpuOp):
"""GpuSum is a Reduction along some dimensions by summation. """GpuSum is a Reduction along some dimensions by summation.
The dimensions along which to sum is specified by the `reduce_mask` that you pass to the The dimensions along which to sum is specified by the `reduce_mask` that you pass to the
...@@ -1717,7 +1731,7 @@ class GpuSum(Op): ...@@ -1717,7 +1731,7 @@ class GpuSum(Op):
""" %locals() """ %locals()
return sio.getvalue() return sio.getvalue()
class GpuReshape(tensor.Reshape): class GpuReshape(tensor.Reshape, GpuOp):
""" """
Implement Reshape on the gpu. Implement Reshape on the gpu.
""" """
...@@ -1733,7 +1747,7 @@ class GpuReshape(tensor.Reshape): ...@@ -1733,7 +1747,7 @@ class GpuReshape(tensor.Reshape):
', should be %i' % (len(shp), self.ndim), shp) ', should be %i' % (len(shp), self.ndim), shp)
out[0] = x.reshape(tuple(shp)) out[0] = x.reshape(tuple(shp))
class GpuSubtensor(tensor.Subtensor): class GpuSubtensor(tensor.Subtensor, GpuOp):
""" """
Implement subtensor on the gpu. Implement subtensor on the gpu.
""" """
...@@ -1764,7 +1778,7 @@ class GpuSubtensor(tensor.Subtensor): ...@@ -1764,7 +1778,7 @@ class GpuSubtensor(tensor.Subtensor):
cdata = cdata[0] cdata = cdata[0]
out[0] = x.__getitem__(cdata) out[0] = x.__getitem__(cdata)
class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1): class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
""" """
Implement AdvancedSubtensor1 on the gpu. Implement AdvancedSubtensor1 on the gpu.
""" """
...@@ -1790,7 +1804,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1): ...@@ -1790,7 +1804,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1):
o[j] = x[i] o[j] = x[i]
out[0] = o out[0] = o
class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1): class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
""" """
...@@ -1818,7 +1832,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1): ...@@ -1818,7 +1832,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1):
# CudaNdarray_Subscript() don't support Advanced slicing. # CudaNdarray_Subscript() don't support Advanced slicing.
# so we use the parent version that loop on each indices. # so we use the parent version that loop on each indices.
class GpuIncSubtensor(tensor.IncSubtensor): class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
Implement IncSubtensor on the gpu. Implement IncSubtensor on the gpu.
""" """
...@@ -1828,7 +1842,7 @@ class GpuIncSubtensor(tensor.IncSubtensor): ...@@ -1828,7 +1842,7 @@ class GpuIncSubtensor(tensor.IncSubtensor):
rval = tensor.IncSubtensor.make_node(self, x, y, *inputs) rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
return Apply(self, [x,y]+rval.inputs[2:], [x.type()]) return Apply(self, [x,y]+rval.inputs[2:], [x.type()])
class GpuFlatten(tensor.Flatten): class GpuFlatten(tensor.Flatten, GpuOp):
""" """
Implement Flatten on the gpu. Implement Flatten on the gpu.
""" """
...@@ -1839,7 +1853,7 @@ class GpuFlatten(tensor.Flatten): ...@@ -1839,7 +1853,7 @@ class GpuFlatten(tensor.Flatten):
out_type = CudaNdarrayType(broadcastable=host_out_broadcastable) out_type = CudaNdarrayType(broadcastable=host_out_broadcastable)
return Apply(self, [x], [out_type()]) return Apply(self, [x], [out_type()])
class GpuShape(tensor.Shape): class GpuShape(tensor.Shape, GpuOp):
""" """
Implement Shape on the gpu. Implement Shape on the gpu.
""" """
...@@ -1847,7 +1861,7 @@ class GpuShape(tensor.Shape): ...@@ -1847,7 +1861,7 @@ class GpuShape(tensor.Shape):
return Apply(self, [x], [tensor.lvector()]) return Apply(self, [x], [tensor.lvector()])
gpu_shape = GpuShape() gpu_shape = GpuShape()
class GpuJoin(tensor.Join): class GpuJoin(tensor.Join, GpuOp):
""" """
Implement Join on the gpu. Implement Join on the gpu.
""" """
...@@ -1924,7 +1938,7 @@ class GpuJoin(tensor.Join): ...@@ -1924,7 +1938,7 @@ class GpuJoin(tensor.Join):
gpu_join = GpuJoin() gpu_join = GpuJoin()
class GpuAlloc(Op): class GpuAlloc(GpuOp):
""" """
Implement Alloc on the gpu. Implement Alloc on the gpu.
""" """
...@@ -2023,7 +2037,7 @@ class GpuAlloc(Op): ...@@ -2023,7 +2037,7 @@ class GpuAlloc(Op):
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
class GpuContiguous(Op): class GpuContiguous(GpuOp):
""" """
Always return a c contiguous output. Copy the input only if it is Always return a c contiguous output. Copy the input only if it is
not already c contiguous. not already c contiguous.
......
...@@ -4,8 +4,9 @@ import StringIO, os ...@@ -4,8 +4,9 @@ import StringIO, os
import cuda_ndarray.cuda_ndarray as cuda import cuda_ndarray.cuda_ndarray as cuda
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
class GpuDot22(Op): class GpuDot22(GpuOp):
""" """
Implement dot(2d, 2d) on the gpu. Implement dot(2d, 2d) on the gpu.
""" """
...@@ -76,7 +77,7 @@ class GpuDot22(Op): ...@@ -76,7 +77,7 @@ class GpuDot22(Op):
""" % locals() """ % locals()
gpu_dot22 = GpuDot22() gpu_dot22 = GpuDot22()
class GpuDot22Scalar(Op): class GpuDot22Scalar(GpuOp):
""" """
Implement dot(2d, 2d) * scalar on the gpu. Implement dot(2d, 2d) * scalar on the gpu.
""" """
...@@ -155,7 +156,7 @@ class GpuDot22Scalar(Op): ...@@ -155,7 +156,7 @@ class GpuDot22Scalar(Op):
""" % locals() """ % locals()
gpu_dot22scalar = GpuDot22Scalar() gpu_dot22scalar = GpuDot22Scalar()
class GpuGemm(Op): class GpuGemm(GpuOp):
""" """
implement the gemm on the gpu. implement the gemm on the gpu.
...@@ -257,7 +258,7 @@ class GpuGemm(Op): ...@@ -257,7 +258,7 @@ class GpuGemm(Op):
gpu_gemm_no_inplace = GpuGemm(inplace=False) gpu_gemm_no_inplace = GpuGemm(inplace=False)
gpu_gemm_inplace = GpuGemm(inplace=True) gpu_gemm_inplace = GpuGemm(inplace=True)
class GpuGemv(Op): class GpuGemv(GpuOp):
""" """
implement gemv on the gpu. implement gemv on the gpu.
...@@ -348,7 +349,7 @@ class GpuGemv(Op): ...@@ -348,7 +349,7 @@ class GpuGemv(Op):
gpu_gemv_no_inplace = GpuGemv(inplace=False) gpu_gemv_no_inplace = GpuGemv(inplace=False)
gpu_gemv_inplace = GpuGemv(inplace=True) gpu_gemv_inplace = GpuGemv(inplace=True)
class GpuGer(Op): class GpuGer(GpuOp):
""" """
implement ger on the gpu. implement ger on the gpu.
...@@ -439,7 +440,7 @@ class GpuGer(Op): ...@@ -439,7 +440,7 @@ class GpuGer(Op):
gpu_ger_no_inplace = GpuGer(inplace=False) gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace = GpuGer(inplace=True) gpu_ger_inplace = GpuGer(inplace=True)
class GpuOuter(Op): class GpuOuter(GpuOp):
""" Implement outer on the gpu.""" """ Implement outer on the gpu."""
def make_node(self, x, y): def make_node(self, x, y):
# we suppose type checking has been done, but make sure. # we suppose type checking has been done, but make sure.
...@@ -532,7 +533,7 @@ gpu_outer = GpuOuter() ...@@ -532,7 +533,7 @@ gpu_outer = GpuOuter()
## ##
# Not really a BLAS operation, but whatever. # Not really a BLAS operation, but whatever.
# #
class GpuConv(Op): class GpuConv(GpuOp):
""" """
Implement the batched and stacked 2d convolution on the gpu. Implement the batched and stacked 2d convolution on the gpu.
""" """
...@@ -698,7 +699,7 @@ class GpuConv(Op): ...@@ -698,7 +699,7 @@ class GpuConv(Op):
"""%sub """%sub
class GpuDownsampleFactorMax(Op): class GpuDownsampleFactorMax(GpuOp):
""" """
Implement downsample with max on the gpu. Implement downsample with max on the gpu.
""" """
...@@ -858,7 +859,7 @@ class GpuDownsampleFactorMax(Op): ...@@ -858,7 +859,7 @@ class GpuDownsampleFactorMax(Op):
} }
""" % locals() """ % locals()
class GpuDownsampleFactorMaxGrad(Op): class GpuDownsampleFactorMaxGrad(GpuOp):
""" """
Implement the grad of downsample with max on the gpu. Implement the grad of downsample with max on the gpu.
""" """
......
...@@ -3,11 +3,12 @@ from theano import tensor, scalar ...@@ -3,11 +3,12 @@ from theano import tensor, scalar
import StringIO import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.kernel_codegen import nvcc_kernel, inline_reduce_max, inline_reduce_sum, inline_softmax from theano.sandbox.cuda.kernel_codegen import nvcc_kernel, inline_reduce_max, inline_reduce_sum, inline_softmax
class GpuCrossentropySoftmaxArgmax1HotWithBias (Op): class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
""" """
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu. Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
""" """
...@@ -180,7 +181,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op): ...@@ -180,7 +181,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx (Op): class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
""" """
Implement CrossentropySoftmax1HotWithBiasDx on the gpu. Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
""" """
...@@ -302,7 +303,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op): ...@@ -302,7 +303,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx() gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax (Op): class GpuSoftmax (GpuOp):
""" """
Implement Softmax on the gpu. Implement Softmax on the gpu.
""" """
...@@ -400,7 +401,7 @@ class GpuSoftmax (Op): ...@@ -400,7 +401,7 @@ class GpuSoftmax (Op):
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (Op): class GpuSoftmaxWithBias (GpuOp):
""" """
Implement SoftmaxWithBias on the gpu. Implement SoftmaxWithBias on the gpu.
""" """
......
...@@ -10,7 +10,7 @@ __contact__ = "theano-dev@googlegroups.com" ...@@ -10,7 +10,7 @@ __contact__ = "theano-dev@googlegroups.com"
import sys import sys
import numpy import numpy
import theano.gof import theano.gof
from theano.sandbox.cuda import CudaNdarrayType from theano.sandbox.cuda import CudaNdarrayType, GpuOp
from theano.tensor import (get_vector_length, cast, opt) from theano.tensor import (get_vector_length, cast, opt)
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer, Variable from theano.gof import local_optimizer, Variable
...@@ -19,7 +19,7 @@ from theano.gof import local_optimizer, Variable ...@@ -19,7 +19,7 @@ from theano.gof import local_optimizer, Variable
config = theano.config config = theano.config
class CURAND_Base(theano.gof.Op): class CURAND_Base(GpuOp):
""" Base class for a random number generator implemented in CURAND. """ Base class for a random number generator implemented in CURAND.
The random number generator itself is an opaque reference managed by The random number generator itself is an opaque reference managed by
......
import numpy
import theano
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
# Skip the whole module when CUDA support is not compiled/available.
# Use truthiness rather than the non-idiomatic `== False` comparison (PEP 8).
if not cuda_ndarray.cuda_available:
    raise SkipTest('Optional package cuda disabled')
import theano.sandbox.cuda as cuda
import theano.sandbox.cuda.basic_ops as B
# Compilation mode used by the tests below: the configured default mode
# (or FAST_RUN when the config asks for FAST_COMPILE, which cannot run
# the gpu optimizations), with the gpu optimizations enabled.
_base_mode = (theano.compile.mode.get_mode('FAST_RUN')
              if theano.config.mode == 'FAST_COMPILE'
              else theano.compile.mode.get_default_mode())
mode_with_gpu = _base_mode.including('gpu')
def test_nvidia_driver1():
    """Check that the installed nvidia driver computes reductions correctly.

    Some nvidia drivers give bad results for reductions.  This compiles a
    GPU sum over 10000 float32 values and compares the result against the
    numpy reference; it raises with installation advice if they differ.

    Raises
    ------
    Exception
        If the GPU reduction does not match numpy's result, which indicates
        a buggy driver version.
    """
    a = numpy.random.rand(10000).astype("float32")
    A = cuda.shared_constructor(a)
    f = theano.function(inputs=[], outputs=A.sum(), mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    # The compiled graph must be exactly a GpuSum followed by the
    # transfer of the result back to the host.
    assert len(topo) == 2
    assert sum(isinstance(node.op, B.GpuSum) for node in topo) == 1
    if not numpy.allclose(f(), a.sum()):
        # Fixed message: the original implicit string concatenation was
        # missing a space ("reduction.Installing") and had grammar errors.
        raise Exception("The nvidia driver version installed with the OS "
                        "doesn't give good results for reduction. "
                        "Installing the nvidia driver available on the same "
                        "download page as the cuda package will fix the "
                        "problem: http://developer.nvidia.com/cuda-downloads")
def test_nvidia_driver2():
    """Check that theano initializes the gpu device when a shared
    variable is manually created on the gpu.

    The driver should always be tested during theano's initialization
    of the gpu device.
    """
    data = numpy.random.rand(10000).astype("float32")
    cuda.shared_constructor(data)
    # Creating the shared variable must have triggered device init.
    assert theano.sandbox.cuda.use.device_number is not None
def test_nvidia_driver3():
    """Check that theano initializes the gpu device when compiling a
    function that contains a gpu op.

    The driver should always be tested during theano's initialization
    of the gpu device.
    """
    inp = cuda.fvector()
    fn = theano.function([inp], inp + 1, mode=mode_with_gpu)
    nodes = fn.maker.env.toposort()
    # The elemwise addition must have been moved to the gpu.
    assert any(isinstance(node.op, cuda.GpuElemwise) for node in nodes)
    # Compiling the function must have triggered device init.
    assert theano.sandbox.cuda.use.device_number is not None
# TODO: make sure the test_nvidia_driver tests are executed when we manually
# create a CudaNdarray like this: cuda.CudaNdarray.zeros((5, 4))
...@@ -169,6 +169,12 @@ def cuda_shared_constructor(value, name=None, strict=False, ...@@ -169,6 +169,12 @@ def cuda_shared_constructor(value, name=None, strict=False,
def float32_shared_constructor(value, name=None, strict=False, def float32_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False, broadcastable=None): allow_downcast=None, borrow=False, broadcastable=None):
"""SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or CudaNdarray""" """SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or CudaNdarray"""
if theano.sandbox.cuda.use.device_number is None:
theano.sandbox.cuda.use("gpu",
force=True,
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False)
# if value isn't a float32 ndarray, or a CudaNdarray then raise # if value isn't a float32 ndarray, or a CudaNdarray then raise
......
...@@ -5,7 +5,7 @@ from theano.gof import local_optimizer ...@@ -5,7 +5,7 @@ from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available from theano.sandbox.cuda import cuda_available
if cuda_available: if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType from theano.sandbox.cuda import CudaNdarrayType, GpuOp
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
from theano.sandbox.cuda.opt import register_opt from theano.sandbox.cuda.opt import register_opt
...@@ -120,7 +120,7 @@ class MultinomialFromUniform(Op): ...@@ -120,7 +120,7 @@ class MultinomialFromUniform(Op):
""" % locals() """ % locals()
class GpuMultinomialFromUniform(MultinomialFromUniform): class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
""" """
The output is transposed compared to MultinomialFromUniform. The output is transposed compared to MultinomialFromUniform.
We must insert a Transpose op after it. We must insert a Transpose op after it.
......
...@@ -5,7 +5,7 @@ from theano.gof import local_optimizer ...@@ -5,7 +5,7 @@ from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available from theano.sandbox.cuda import cuda_available
if cuda_available: if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType from theano.sandbox.cuda import CudaNdarrayType, GpuOp
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
from theano.sandbox.cuda.opt import register_opt as register_gpu_opt from theano.sandbox.cuda.opt import register_opt as register_gpu_opt
...@@ -292,7 +292,7 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'): ...@@ -292,7 +292,7 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
# This is work in progress # This is work in progress
class GpuImages2Neibs(Images2Neibs): class GpuImages2Neibs(Images2Neibs, GpuOp):
def __init__(self, mode='valid'): def __init__(self, mode='valid'):
if mode not in ['valid', 'wrap_centered']: if mode not in ['valid', 'wrap_centered']:
raise NotImplementedError("Only the mode valid and wrap_centered" raise NotImplementedError("Only the mode valid and wrap_centered"
......
...@@ -20,7 +20,10 @@ import multinomial ...@@ -20,7 +20,10 @@ import multinomial
from theano.sandbox.cuda import cuda_available, cuda_enabled from theano.sandbox.cuda import cuda_available, cuda_enabled
if cuda_available: if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor,
GpuOp)
def mulmod(a, b, c, m): def mulmod(a, b, c, m):
r = numpy.int32((numpy.int64(a)*b + c) % m) r = numpy.int32((numpy.int64(a)*b + c) % m)
...@@ -372,7 +375,7 @@ class mrg_uniform(mrg_uniform_base): ...@@ -372,7 +375,7 @@ class mrg_uniform(mrg_uniform_base):
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (1,)
class GPU_mrg_uniform(mrg_uniform_base): class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
#GPU VERSION #GPU VERSION
@classmethod @classmethod
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论