提交 578f4836 authored 作者: lamblin's avatar lamblin

Merge pull request #610 from nouiz/gpu_conv_faster

Gpu conv faster
...@@ -39,6 +39,12 @@ Interface changes ...@@ -39,6 +39,12 @@ Interface changes
the provided value have. In the past, the error was at run time. the provided value have. In the past, the error was at run time.
(Frederic B.) (Frederic B.)
Speed up
 * Convolution on the GPU now checks the generation of the card to make
   it faster in some cases (especially medium/big output image). (Frédéric B.)
   (We hardcoded 512 as the maximum number of threads per block. Newer cards
   support up to 1024 threads per block.)
New Features New Features
* debugprint new param ids=["CHAR", "id", "int", ""] * debugprint new param ids=["CHAR", "id", "int", ""]
This makes the identifier printed to be the python id, a unique char, a This makes the identifier printed to be the python id, a unique char, a
...@@ -120,6 +126,9 @@ Crash Fix ...@@ -120,6 +126,9 @@ Crash Fix
* Work around a known issue with nvcc 4.1 on MacOS X. (Graham Taylon) * Work around a known issue with nvcc 4.1 on MacOS X. (Graham Taylon)
* In advanced indexing, if some inputs are constant, no need to call constant(...) * In advanced indexing, if some inputs are constant, no need to call constant(...)
on their value any more. (Pascal L., reported by John Salvatier) on their value any more. (Pascal L., reported by John Salvatier)
 * Fix crash on GPU when the GpuSubtensor didn't put the right stride
   when the result tensor had a dimension with size of 1. (Pascal L.,
   reported by Graham T.)
============= =============
Release Notes Release Notes
......
import copy
import os import os
import StringIO import StringIO
import theano
from theano import Apply from theano import Apply
from theano import tensor from theano import tensor
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
...@@ -613,7 +615,8 @@ class GpuConv(GpuOp): ...@@ -613,7 +615,8 @@ class GpuConv(GpuOp):
version=-1, version=-1,
verbose=0, verbose=0,
kshp=None, kshp=None,
imshp=None): imshp=None,
max_threads_dim0=None):
""" """
:param version: each version of c_code implement many kernel for the :param version: each version of c_code implement many kernel for the
convolution. By default we try to guess the best one. convolution. By default we try to guess the best one.
...@@ -629,6 +632,10 @@ class GpuConv(GpuOp): ...@@ -629,6 +632,10 @@ class GpuConv(GpuOp):
:param imshp: The size of the image. Not used for code generation but :param imshp: The size of the image. Not used for code generation but
allow to select an experimental new version in another allow to select an experimental new version in another
repo. repo.
        :param max_threads_dim0: The maximum number of threads for
            block dimension 0 (blockDim.x) used by the
            GPU function.
""" """
self.border_mode = border_mode self.border_mode = border_mode
self.subsample = subsample self.subsample = subsample
...@@ -651,6 +658,7 @@ class GpuConv(GpuOp): ...@@ -651,6 +658,7 @@ class GpuConv(GpuOp):
self.verbose = verbose self.verbose = verbose
self.kshp = kshp self.kshp = kshp
self.imshp = imshp self.imshp = imshp
self.max_threads_dim0 = max_threads_dim0
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) \ return type(self) == type(other) \
...@@ -662,7 +670,8 @@ class GpuConv(GpuOp): ...@@ -662,7 +670,8 @@ class GpuConv(GpuOp):
and self.version == other.version \ and self.version == other.version \
and self.verbose == other.verbose \ and self.verbose == other.verbose \
and self.kshp == other.kshp\ and self.kshp == other.kshp\
and self.imshp == other.imshp and self.imshp == other.imshp\
and self.max_threads_dim0 == other.max_threads_dim0
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -681,7 +690,8 @@ class GpuConv(GpuOp): ...@@ -681,7 +690,8 @@ class GpuConv(GpuOp):
^ self.version \ ^ self.version \
^ hash(self.verbose) \ ^ hash(self.verbose) \
^ hash(self.kshp)\ ^ hash(self.kshp)\
^ hash(self.imshp) ^ hash(self.imshp)\
^ hash(self.max_threads_dim0)
def __str__(self): def __str__(self):
return '%s{%s, %s, %s, %s, %s, %s, %s}' % ( return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
...@@ -704,6 +714,25 @@ class GpuConv(GpuOp): ...@@ -704,6 +714,25 @@ class GpuConv(GpuOp):
False, False] False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()]) return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
    """Compile this node, filling in ``max_threads_dim0`` from the device.

    If ``max_threads_dim0`` was not supplied at construction time, query
    the active CUDA device for its ``maxThreadsDim0`` property and store
    it on a *copy* of the op, then delegate to the normal thunk creation.

    :param node: the Apply node being compiled.
    :param storage_map: map from variables to their storage cells.
    :param compute_map: map from variables to their computed flags.
    :param no_recycling: variables whose storage must not be reused.
    :return: a callable thunk, as produced by the parent class.
    """
    node_ = copy.copy(node)
    assert node.op is node_.op
    if node_.op.max_threads_dim0 is None:
        # Mutating the graph's op in place would change its
        # __eq__/__hash__ while it is still referenced elsewhere, so
        # work on a copy.  (The original code made this copy but never
        # assigned it, silently mutating the shared op instead.)
        node_.op = copy.copy(node_.op)
        # device_number presumably looks like "gpuN"; strip the 3-char
        # prefix to keep the numeric id.  TODO(review): confirm format.
        device_id = theano.sandbox.cuda.use.device_number[3:]
        if device_id == '':
            device_id = 0
        cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
        prop = cuda_ndarray.device_properties(device_id)
        node_.op.max_threads_dim0 = prop['maxThreadsDim0']
    return super(GpuConv, node_.op).make_thunk(node_, storage_map,
                                               compute_map, no_recycling)
def __setstate__(self, state):
    """Restore pickled state, back-filling attributes missing from
    pickles created before ``max_threads_dim0`` existed."""
    self.__dict__.update(state)
    # Older pickles predate the max_threads_dim0 attribute; give those
    # instances the default value so later code can rely on it.
    already_present = hasattr(self, "max_threads_dim0")
    if not already_present:
        self.max_threads_dim0 = None
def c_compile_args(self): def c_compile_args(self):
nb = 0 nb = 0
if self.kshp is not None: if self.kshp is not None:
...@@ -715,7 +744,7 @@ class GpuConv(GpuOp): ...@@ -715,7 +744,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 18) return (0, 19)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -734,6 +763,7 @@ class GpuConv(GpuOp): ...@@ -734,6 +763,7 @@ class GpuConv(GpuOp):
version = self.version version = self.version
verbose = self.verbose verbose = self.verbose
sub = sub.copy() sub = sub.copy()
max_threads_dim0 = self.max_threads_dim0
sub.update(locals()) sub.update(locals())
return """ return """
//Mandatory args //Mandatory args
...@@ -764,7 +794,8 @@ class GpuConv(GpuOp): ...@@ -764,7 +794,8 @@ class GpuConv(GpuOp):
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
%(out)s, mode, %(out)s, mode,
dx, dy, dx, dy,
version, verbose); version, verbose,
%(max_threads_dim0)s);
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = out2; %(out)s = out2;
""" % sub """ % sub
......
...@@ -31,6 +31,16 @@ else: ...@@ -31,6 +31,16 @@ else:
cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4) cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4)
# Module-level setup: find the active GPU device and fetch its hardware
# properties so the tests below can size their work to the real limits
# (e.g. maxThreadsDim0) instead of a hardcoded 512.
device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
    # No device selected yet: creating a GPU shared variable forces CUDA
    # initialization, after which device_number is populated.
    cuda_ndarray.shared_constructor(numpy.zeros(2, dtype='float32'))
    device_id = theano.sandbox.cuda.use.device_number
# device_number presumably looks like "gpuN" -- drop the 3-char prefix to
# keep only the numeric id.  TODO(review): confirm the string format.
device_id = device_id[3:]
if device_id == '':
    # Bare "gpu" (no number) means the default device, i.e. device 0.
    device_id = 0
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
device_prop = cuda_ndarray.device_properties(device_id)
def py_conv_valid_numpy(img, kern): def py_conv_valid_numpy(img, kern):
assert img.shape[1] == kern.shape[1] assert img.shape[1] == kern.shape[1]
...@@ -386,7 +396,7 @@ def test_valid_0_2(): ...@@ -386,7 +396,7 @@ def test_valid_0_2():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1])) numpy.asarray([1, 1]))
if oshape[3] > 512: if oshape[3] > device_prop['maxThreadsDim0']:
continue continue
if ishape[1] > 1: if ishape[1] > 1:
continue continue
...@@ -417,7 +427,7 @@ def test_valid_1_3_11_12(): ...@@ -417,7 +427,7 @@ def test_valid_1_3_11_12():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1])) numpy.asarray([1, 1]))
if oshape[3] > 512: if oshape[3] > device_prop['maxThreadsDim0']:
continue continue
if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 > if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 >
(16 * 1024 - 150)): (16 * 1024 - 150)):
...@@ -446,7 +456,7 @@ def test_valid_4(): ...@@ -446,7 +456,7 @@ def test_valid_4():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1])) numpy.asarray([1, 1]))
if oshape[3] > 512: if oshape[3] > device_prop['maxThreadsDim0']:
continue continue
if ishape[1] > 1: if ishape[1] > 1:
continue continue
...@@ -478,7 +488,7 @@ def test_valid_5(): ...@@ -478,7 +488,7 @@ def test_valid_5():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1])) numpy.asarray([1, 1]))
if oshape[3] > 512: if oshape[3] > device_prop['maxThreadsDim0']:
continue continue
if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) > if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) >
(16 * 1024 - 150)): (16 * 1024 - 150)):
...@@ -512,7 +522,7 @@ def test_valid_7_8_13(): ...@@ -512,7 +522,7 @@ def test_valid_7_8_13():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1])) numpy.asarray([1, 1]))
if oshape[2] * oshape[3] > 512: if oshape[2] * oshape[3] > device_prop['maxThreadsDim0']:
continue continue
if max(numpy.prod(ishape[2:]) * 4 + 2 * kshape[3] * 4, if max(numpy.prod(ishape[2:]) * 4 + 2 * kshape[3] * 4,
oshape[2] * oshape[3] * 4 * 2) > (16 * 1024 - 150): oshape[2] * oshape[3] * 4 * 2) > (16 * 1024 - 150):
...@@ -543,7 +553,7 @@ def test_valid_9_10(): ...@@ -543,7 +553,7 @@ def test_valid_9_10():
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1])) numpy.asarray([1, 1]))
if oshape[3] > 512: if oshape[3] > device_prop['maxThreadsDim0']:
continue continue
if (kshape[3] * 4 + ishape[3]) > (16 * 1024 - 150): if (kshape[3] * 4 + ishape[3]) > (16 * 1024 - 150):
continue continue
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论