testgroup / pytensor · Commits

Commit 91fd3c4f (merge), authored October 28, 2009 by James Bergstra
Parents: 201cbdd3, 64a29098

Showing 3 changed files, with 224 additions and 1 deletion:
kernel_codegen.py   +132  −0
nnet.py              +80  −0
opt.py               +12  −1
kernel_codegen.py (new file, mode 100644)
""" Helper routines for generating gpu kernels for nvcc.
"""
def
nvcc_kernel
(
name
,
params
,
body
):
"""Return the c code of a kernel function.
:param params: the parameters to the function as one or more strings
:param body: the [nested] list of statements for the body of the function. These will be
separated by ';' characters.
"""
paramstr
=
', '
.
join
(
params
)
def
flatbody
():
for
b
in
body
:
if
isinstance
(
b
,
(
list
,
tuple
)):
for
bb
in
b
:
yield
bb
else
:
yield
b
bodystr
=
';
\n
'
.
join
(
flatbody
())
return
"""__global__ void
%(name)
s (
%(paramstr)
s)
{
%(bodystr)
s;
}
"""
%
locals
()
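A small usage sketch (mine, not part of the commit) makes the contract concrete: statements are passed without trailing semicolons, and nvcc_kernel joins them.

    # Illustration only; kScale is a hypothetical kernel name.
    src = nvcc_kernel("kScale",
            params=['int N', 'float * x', 'const float a'],
            body=["const int i = blockIdx.x * blockDim.x + threadIdx.x",
                  "if (i < N) x[i] = a * x[i]"])
    # src is roughly:
    #   __global__ void kScale (int N, float * x, const float a)
    #   {
    #       const int i = blockIdx.x * blockDim.x + threadIdx.x;
    #       if (i < N) x[i] = a * x[i];
    #   }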
def code_version(version):
    """decorator to support version-based cache mechanism"""
    if not isinstance(version, tuple):
        raise TypeError('version must be tuple', version)
    def deco(f):
        f.code_version = version
        return f
    return deco

UNVERSIONED = ()
@code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn):
    """
    Return C++ code for a function that reduces a contiguous buffer.

    :param N: length of the buffer
    :param buf: buffer pointer
    :param pos: index of executing thread
    :param count: number of executing threads
    :param manner_fn: a function that accepts strings of arguments a and b,
        and returns C code for their reduction. (Example: return
        "%(a)s + %(b)s" for a sum reduction.)

    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.

    :note: buf should be in gpu shared memory; we access it many times.
    """
    loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf)
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
    r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
    r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
    return """
    {
        // This function trashes buf[1..N], leaving the reduction in buf[0].
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line)s;
            }
            if (%(pos)s < 16)
            {
                // reduce so that position 0 has the reduction of everything
                if (%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if (%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if (%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if (%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if (%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
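As a reading aid (my annotation, not from the commit): the generated C reduces in two phases. Threads 0..warpSize-1 first fold the strided tail of the buffer into their own slots; threads 0..15 then fold slots +16, +8, +4, +2, +1 in warp-synchronous tree steps. A sequential Python re-enactment of that schedule:

    # Sequential re-enactment of the schedule above (illustration only; on
    # the device the tree steps rely on warp-synchronous execution).
    def reference_reduce(values, op, warp_size=32):
        buf = list(values)
        n = len(buf)
        # phase 1: strided loop, one slot per "thread"
        for p in range(warp_size):
            for i in range(p + warp_size, n, warp_size):
                buf[p] = op(buf[p], buf[i])
        # phase 2: tree steps within the first 16 slots
        for step in (16, 8, 4, 2, 1):
            for p in range(16):
                if p + step < n:
                    buf[p] = op(buf[p], buf[p + step])
        return buf[0]

    # reference_reduce(range(100), lambda a, b: a + b) == 4950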
@code_version(inline_reduce.code_version)
def inline_reduce_max(N, buf, pos, count):
    return inline_reduce(N, buf, pos, count,
            lambda a, b: "max(%s, %s)" % (a, b))

@code_version(inline_reduce.code_version)
def inline_reduce_sum(N, buf, pos, count):
    return inline_reduce(N, buf, pos, count,
            lambda a, b: "%s + %s" % (a, b))

@code_version(inline_reduce.code_version)
def inline_reduce_min(N, buf, pos, count):
    return inline_reduce(N, buf, pos, count,
            lambda a, b: "min(%s, %s)" % (a, b))

@code_version(inline_reduce.code_version)
def inline_reduce_prod(N, buf, pos, count):
    return inline_reduce(N, buf, pos, count,
            lambda a, b: "%s * %s" % (a, b))
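The four wrappers differ only in the reduction expression spliced into the template; printing one (my example, not in the commit) shows the effect:

    # Illustration only.
    print inline_reduce_sum('N', 'buf', 'threadIdx.x', 'blockDim.x')
    # The strided loop becomes   buf[threadIdx.x] = buf[threadIdx.x] + buf[i];
    # and the tree steps become  buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+16];
    # and so on down to +1.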
@code_version((1,) + inline_reduce_max.code_version + inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount):
    """
    :Precondition: buf and buf2 contain two identical copies of the input
        to the softmax
    :Postcondition: buf contains the softmax, buf2 contains the
        un-normalized softmax

    :note: buf and buf2 should be in gpu shared memory; we access them
        many times.
    """
    return [
        # get the max of buf (trashing all but buf[0])
        inline_reduce_max(N, buf, threadPos, threadCount),
        '__syncthreads()',
        'float row_max = ' + buf + '[0]',
        '__syncthreads()',
        buf + '[' + threadPos + '] = exp(' + buf2 + '[' + threadPos + '] - row_max)',
        buf2 + '[' + threadPos + '] = ' + buf + '[' + threadPos + ']',
        '__syncthreads()',
        # sum the exponentiated values (trashing all but buf[0])
        inline_reduce_sum(N, buf, threadPos, threadCount),
        '__syncthreads()',
        'float row_sum = ' + buf + '[0]',
        '__syncthreads()',
        # divide each exp() result by the sum to complete the job.
        buf + '[' + threadPos + '] = ' + buf2 + '[' + threadPos + '] / row_sum',
    ]
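Because nvcc_kernel flattens one level of nesting, the statement list returned by inline_softmax can be dropped directly into a kernel body; the GpuSoftmax op added to nnet.py below does exactly that. A minimal composition sketch (mine, with a hypothetical kernel name):

    # Illustration only; mirrors c_support_code_apply in nnet.py below.
    src = nvcc_kernel("kSoftmaxDemo",
            params=['int N', 'float * io'],
            body=[
                "extern __shared__ float buf[]",
                "float * buf2 = buf + N",
                "buf[threadIdx.x] = io[threadIdx.x]",
                "buf2[threadIdx.x] = buf[threadIdx.x]",
                "__syncthreads()",
                inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
                "io[threadIdx.x] = buf[threadIdx.x]",
            ])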
nnet.py
...

@@ -5,6 +5,8 @@ import StringIO
 import cuda_ndarray
 from .type import CudaNdarrayType
+from .kernel_codegen import nvcc_kernel, \
+        inline_reduce_max, inline_reduce_sum, inline_softmax

 class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
     nin = 3
     nout = 3
...

@@ -287,3 +289,81 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
         }
         """ % locals()
+
+class GpuSoftmax (Op):
+    """Writeme"""
+    def __eq__(self, other):
+        return type(self) == type(other)
+    def __hash__(self):
+        return hash(type(self))
+    def __str__(self):
+        return self.__class__.__name__
+    def make_node(self, x):
+        return Apply(self, [x], [x.type()])
+    def c_code_cache_version(self):
+        #return ()
+        return (1,) + inline_softmax.code_version
+    def c_code(self, node, nodename, (x,), (z,), sub):
+        fail = sub['fail']
+        return """
+        if (%(x)s->nd != 2)
+        {
+            PyErr_SetString(PyExc_ValueError, "rank error");
+            %(fail)s;
+        }
+        if ((NULL == %(z)s)
+            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0])
+            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1]))
+        {
+            Py_XDECREF(%(z)s);
+            %(z)s = (CudaNdarray*)CudaNdarray_new_null();
+            if ((NULL == %(z)s)
+                || CudaNdarray_alloc_contiguous(%(z)s, 2, CudaNdarray_HOST_DIMS(%(x)s)))
+            {
+                Py_XDECREF(%(z)s);
+                %(z)s = NULL;
+                %(fail)s;
+            }
+        }
+        {
+            kSoftmax_%(nodename)s
+                <<<
+                    // todo: cap these at the card limits, implement loops in kernel
+                    CudaNdarray_HOST_DIMS(%(x)s)[0],
+                    CudaNdarray_HOST_DIMS(%(x)s)[1],
+                    CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
+                >>>(
+                    CudaNdarray_HOST_DIMS(%(x)s)[0],
+                    CudaNdarray_HOST_DIMS(%(x)s)[1],
+                    CudaNdarray_DEV_DATA(%(x)s),
+                    CudaNdarray_HOST_STRIDES(%(x)s)[0],
+                    CudaNdarray_HOST_STRIDES(%(x)s)[1],
+                    CudaNdarray_DEV_DATA(%(z)s) //guarantee c contig
+                );
+            CNDA_THREAD_SYNC;
+            cudaError_t err = cudaGetLastError();
+            if (cudaSuccess != err)
+            {
+                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n",
+                        "kSoftmax_%(nodename)s", cudaGetErrorString(err));
+                %(fail)s;
+            }
+        }
+        assert(%(z)s);
+        """ % locals()
+    def c_support_code_apply(self, node, nodename):
+        return nvcc_kernel("kSoftmax_%s" % nodename,
+                params=['int M', 'int N',
+                        'const float * x', 'const int sx0', 'const int sx1',
+                        'float * sm'],
+                body=[
+                    "extern __shared__ float buf[]",
+                    "float * buf2 = buf + N",
+                    "buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]",
+                    "buf2[threadIdx.x] = buf[threadIdx.x]",
+                    "__syncthreads()",
+                    inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
+                    "sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]",
+                ])
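For orientation (my reading of the launch parameters above, not text from the commit): the kernel runs with one block per row, one thread per column, and dims[1] * 2 * sizeof(float) bytes of dynamic shared memory holding the two row buffers buf and buf2; per the todo, these are not yet capped at the hardware limits. Applying the op by hand would look roughly like:

    # Hypothetical direct use; the optimizer in opt.py below normally
    # introduces the op. Assumes CudaNdarrayType is callable to create a
    # variable, per the usual Theano Type protocol.
    x_gpu = CudaNdarrayType(broadcastable=(False, False))()
    sm = GpuSoftmax()(x_gpu)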
opt.py
...

@@ -7,7 +7,8 @@ from theano_cuda_ndarray.blas import gpu_dot22, gpu_gemm, GpuConv
 from theano_cuda_ndarray.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
 from theano_cuda_ndarray.nnet import (
         GpuCrossentropySoftmaxArgmax1HotWithBias,
-        GpuCrossentropySoftmax1HotWithBiasDx)
+        GpuCrossentropySoftmax1HotWithBiasDx,
+        GpuSoftmax)
 from theano.compile import optdb

 #optdb.print_summary() # this shows what is currently registered (in a so-far crude way...)
...

@@ -285,6 +286,16 @@ def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
         return [host_from_gpu(gpu_dx)]
     return False

+@register_opt()
+@local_optimizer([])
+def local_gpu_softmax(node):
+    if isinstance(node.op, tensor.nnet.Softmax):
+        x, = node.inputs
+        if x.owner and x.owner.op == host_from_gpu:
+            gpu_x, = x.owner.inputs
+            gpu_sm = GpuSoftmax()(gpu_x)
+            return [host_from_gpu(gpu_sm)]
+    return False
+
 #### Convolution, maxpooling
 import theano.sandbox.conv
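In graph terms (my summary, not text from the commit), the new rewrite matches a host-side Softmax applied to a value just copied off the GPU and keeps the computation on the device:

    # Schematic only:
    #   before: Softmax(host_from_gpu(gpu_x))
    #   after:  host_from_gpu(GpuSoftmax()(gpu_x))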
...