Commit 89f584bc, authored by Sean Lee

Use the CUDA driver API for CUDA gpuarray operations.

Instead of mixing the CUDA driver API and the runtime API in the generated code, use only the CUDA driver API. GPU programs for CUDA gpuarray operations (except conv operations) are now generated as a string that is passed to the python interface of libgpuarray. libgpuarray then generates a cubin bytearray, which is embedded in the generated code. The generated code then uses the CUDA driver API via the C++ interface of libgpuarray to load and launch the GPU program. This has at least two benefits: (1) This approach does not use the nvcc offline compiler to compile the generated code into the shared library. It uses the host compiler directly, which is likely to be faster. Note that, for cubin generation, libgpuarray still uses the nvcc offline compiler, but an improvement is being made to use NVRTC and ptxas instead of nvcc, which should be, again, faster. (2) Mixing the CUDA driver API and the runtime API is typically discouraged.
Parent commit: 7852531c
...@@ -145,11 +145,18 @@ class GpuKernelBase(object): ...@@ -145,11 +145,18 @@ class GpuKernelBase(object):
return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar) return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
def c_support_code_apply(self, node, name): def c_support_code_apply(self, node, name):
ceil_intdiv = """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
"""
kernels = self.gpu_kernels(node, name) kernels = self.gpu_kernels(node, name)
bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels) bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
codes = '\n'.join(self._generate_kernel_code(k) for k in kernels) codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels) vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels)
return '\n'.join([bins, codes, vars]) return '\n'.join([ceil_intdiv, bins, codes, vars])
def _generate_kernel_init(self, k, err): def _generate_kernel_init(self, k, err):
if PY3: if PY3:
......
...@@ -3,6 +3,12 @@ Helper routines for generating gpu kernels for nvcc. ...@@ -3,6 +3,12 @@ Helper routines for generating gpu kernels for nvcc.
""" """
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
def nvcc_kernel(name, params, body): def nvcc_kernel(name, params, body):
""" """
Return the c code of a kernel function. Return the c code of a kernel function.
...@@ -26,7 +32,7 @@ def nvcc_kernel(name, params, body): ...@@ -26,7 +32,7 @@ def nvcc_kernel(name, params, body):
else: else:
yield b yield b
bodystr = ';\n'.join(flatbody()) bodystr = ';\n'.join(flatbody())
return """__global__ void %(name)s (%(paramstr)s) return """KERNEL void %(name)s (%(paramstr)s)
{ {
%(bodystr)s; %(bodystr)s;
} }
...@@ -167,11 +173,12 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -167,11 +173,12 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
We use __i as an int variable in a loop. We use __i as an int variable in a loop.
""" """
ctype = gpuarray.dtype_to_ctype(dtype)
return [ return [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount), inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype, ('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){', '; __i+=' + threadCount + '){',
...@@ -181,7 +188,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -181,7 +188,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
'__syncthreads()', '__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount), inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype, ('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
# divide each exp() result by the sum to complete the job. # divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
...@@ -259,11 +266,12 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, ...@@ -259,11 +266,12 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos)) r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos)) r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
ctype = gpuarray.dtype_to_ctype(dtype)
return """ return """
{ {
// This function trashes buf[1..n_threads], // This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0]. // leaving the reduction result in buf[0].
npy_%(dtype)s red = %(init)s; %(ctype)s red = %(init)s;
#pragma unroll 16 #pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){ for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
red = %(loop_line)s; red = %(loop_line)s;
...@@ -356,6 +364,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -356,6 +364,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
We use tx as an int variable in a loop. We use tx as an int variable in a loop.
""" """
ctype = gpuarray.dtype_to_ctype(dtype)
ret = [ ret = [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x, inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
...@@ -363,7 +372,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -363,7 +372,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
b, stride_b, load_b, b, stride_b, load_b,
dtype), dtype),
'__syncthreads()', '__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype, ('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
threadPos, threadCount, threadPos, threadCount,
...@@ -371,7 +380,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -371,7 +380,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
lambda a: "exp(%s - row_max)" % a, lambda a: "exp(%s - row_max)" % a,
b, stride_b, load_b, dtype), b, stride_b, load_b, dtype),
'__syncthreads()', '__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype, ('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
] ]
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论