提交 6657d35b，作者：goodfeli

Merge pull request #947 from nouiz/gpu_reduce

Gpu reduce small change.
...@@ -373,7 +373,12 @@ def use(device, ...@@ -373,7 +373,12 @@ def use(device,
if test_driver: if test_driver:
import theano.sandbox.cuda.tests.test_driver import theano.sandbox.cuda.tests.test_driver
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1() theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if device_properties(use.device_number)["warpSize"] != 32:
raise ValueError("Your GPU has a warpSize != 32. Currently"
" we have code that depends on warpSize"
" being 32. Email the Theano mailing list"
" to tell us about this new GPU, as we"
" don't know of any with these properties")
if move_shared_float32_to_gpu: if move_shared_float32_to_gpu:
handle_shared_float32(True) handle_shared_float32(True)
......
...@@ -624,8 +624,8 @@ class GpuCAReduce(GpuOp): ...@@ -624,8 +624,8 @@ class GpuCAReduce(GpuOp):
# but tensor.elemwise.CAReduce has this exact same check so I guess # but tensor.elemwise.CAReduce has this exact same check so I guess
# this is OK to do # this is OK to do
if self.scalar_op in [scal.minimum, scal.maximum]: if self.scalar_op in [scal.minimum, scal.maximum]:
for i in xrange(nd_in):
conds = [] conds = []
for i in xrange(nd_in):
if self.reduce_mask[i]: if self.reduce_mask[i]:
conds.append("(CudaNdarray_HOST_DIMS(%(x)s)[%(i)s] == 0)" % locals()) conds.append("(CudaNdarray_HOST_DIMS(%(x)s)[%(i)s] == 0)" % locals())
assert len(conds) > 0 assert len(conds) > 0
...@@ -723,7 +723,7 @@ class GpuCAReduce(GpuOp): ...@@ -723,7 +723,7 @@ class GpuCAReduce(GpuOp):
if (verbose) if (verbose)
printf("running kernel_reduce_10_%(name)s\\n"); printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_10_%(name)s<<<n_blocks, n_threads, kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>( n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
...@@ -862,11 +862,10 @@ class GpuCAReduce(GpuOp): ...@@ -862,11 +862,10 @@ class GpuCAReduce(GpuOp):
extern __shared__ float buf[]; extern __shared__ float buf[];
float myresult = 0.0f; float myresult = 0.0f;
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
if (warpSize != 32) if (warpSize != 32)
{ {
// TODO: set error code
// 2012-09-20 IG: as of today, Fred says he will check
// this elsewhere, in a different PR
Z[0] = -666; Z[0] = -666;
return; return;
} }
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论