numpydoc for theano/sandbox/cuda/kernel_codegen.py

fd25c9c6 · Iban Harlouchet · c489c64a · fd25c9c6
--- a/theano/sandbox/cuda/kernel_codegen.py
+++ b/theano/sandbox/cuda/kernel_codegen.py
-""" Helper routines for generating gpu kernels for nvcc.
 """
+Helper routines for generating gpu kernels for nvcc.

+"""

-def nvcc_kernel(name, params, body):
-    """Return the c code of a kernel function.

-    :param params: the parameters to the function as one or more strings
+def nvcc_kernel(name, params, body):
+    """
+    Return the c code of a kernel function.

-    :param body: the [nested] list of statements for the body of the
-         function.  These will be separated by ';' characters.
+    Parameters
+    ----------
+    params
+        The parameters to the function as one or more strings.
+    body
+        The [nested] list of statements for the body of the
+        function. These will be separated by ';' characters.

    """
    paramstr = ', '.join(params)
@@ -29,7 +35,10 @@ def nvcc_kernel(name, params, body):


 def code_version(version):
-    """decorator to support version-based cache mechanism"""
+    """
+    Decorator to support version-based cache mechanism.
+
+    """
    if not isinstance(version, tuple):
        raise TypeError('version must be tuple', version)

@@ -43,22 +52,31 @@ UNVERSIONED = ()

 @code_version((1,))
 def inline_reduce(N, buf, pos, count, manner_fn):
-    """Return C++ code for a function that reduces a contiguous buffer.
-
-    :param N: length of the buffer
-    :param buf: buffer pointer
-    :param pos: index of executing thread
-    :param count: number of executing threads
-
-    :param manner_fn: a function that accepts strings of arguments a
+    """
+    Return C++ code for a function that reduces a contiguous buffer.
+
+    Parameters
+    ----------
+    N
+        Length of the buffer.
+    buf
+        Buffer pointer.
+    pos
+        Index of executing thread.
+    count
+        Number of executing threads.
+    manner_fn
+        A function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).

    :postcondition:
-    This function leaves the answer in position 0 of the buffer.  The
+    This function leaves the answer in position 0 of the buffer. The
    rest of the buffer is trashed by this function.

-    :note: buf should be in gpu shared memory, we access it many times.
+    Notes
+    -----
+    buf should be in gpu shared memory, we access it many times.

    """
    loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
@@ -127,18 +145,26 @@ def inline_reduce_prod(N, buf, pos, count):
 def inline_softmax(N, buf, buf2, threadPos, threadCount):
    """

-    :param N: length of the buffer
-    :param threadPos: index of executing thread
-    :param threadCount: number of executing threads
+    Parameters
+    ----------
+    N
+        Length of the buffer.
+    threadPos
+        Index of executing thread.
+    threadCount
+        Number of executing threads.

    :Precondition: buf and buf2 contain two identical copies of the input
        to softmax
    :Postcondition: buf contains the softmax, buf2 contains un-normalized
        softmax

-    :note: buf and buf2 should be in gpu shared memory, we access it many times
+    Notes
+    -----
+    buf and buf2 should be in gpu shared memory, we access it many times.
+
+    We use __i as an int variable in a loop.

-    :note2: We use __i as an int variable in a loop
    """
    return [
            # get max of buf (trashing all but buf[0])
@@ -169,26 +195,38 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
 def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                               manner_fn, manner_init,
                               b='', stride_b=''):
-    """Return C++ code for a function that reduces a contiguous buffer.
-
-    :param N: length of the buffer
-    :param buf: buffer pointer of size warpSize * sizeof(float)
-    :param pos: index of executing thread
-    :param count: number of executing threads
-    :param b: Optional, pointer to the bias
-    :param stride_b: Optional, the stride of b if b is provided
-
-    :param manner_fn: a function that accepts strings of arguments a
+    """
+    Return C++ code for a function that reduces a contiguous buffer.
+
+    Parameters
+    ----------
+    N
+        Length of the buffer.
+    buf
+        Buffer pointer of size warpSize * sizeof(float).
+    pos
+        Index of executing thread.
+    count
+        Number of executing threads.
+    b
+        Optional, pointer to the bias.
+    stride_b
+        Optional, the stride of b if b is provided.
+    manner_fn
+        A function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).
-    :param manner_init: a function that accepts strings of arguments a
-        and return c code for its initialization
+    manner_init
+        A function that accepts strings of arguments a
+        and returns c code for its initialization.

    :postcondition:
-    This function leaves the answer in position 0 of the buffer.  The
+    This function leaves the answer in position 0 of the buffer. The
    rest of the buffer is trashed by this function.

-    :note: buf should be in gpu shared memory, we access it many times.
+    Notes
+    -----
+    buf should be in gpu shared memory, we access it many times.

    """
    if b:
@@ -263,24 +301,39 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
                                b='', stride_b=''):
    """

-    :param N: length of the buffer, atleast waprSize(32).
-    :param buf: a shared memory buffer of size warpSize * sizeof(float)
-    :param x: a ptr to the gpu memory where the row is stored
-    :param stride_x: the stride between each element in x
-    :param sm: a ptr to the gpu memory to store the result
-    :param sm_stride: the stride between eash sm element
-    :param threadPos: index of executing thread
-    :param threadCount: number of executing threads
-    :param b: Optional, pointer to the bias
-    :param stride_b: Optional, the stride of b if b is provided
+    Parameters
+    ----------
+    N
+        Length of the buffer, at least warpSize (32).
+    buf
+        A shared memory buffer of size warpSize * sizeof(float).
+    x
+        A ptr to the gpu memory where the row is stored.
+    stride_x
+        The stride between each element in x.
+    sm
+        A ptr to the gpu memory to store the result.
+    sm_stride
+        The stride between each sm element.
+    threadPos
+        Index of executing thread.
+    threadCount
+        Number of executing threads.
+    b
+        Optional, pointer to the bias.
+    stride_b
+        Optional, the stride of b if b is provided.

    :Precondition: buf is empty
    :Postcondition: buf[0] contains the softmax,
        buf2 contains un-normalized softmax

-    :note: buf should be in gpu shared memory, we access it many times.
+    Notes
+    -----
+    buf should be in gpu shared memory, we access it many times.
+
+    We use tx as an int variable in a loop.

-    :note2: We use tx as an int variable in a loop
    """
    ret = [
        # get max of buf (trashing all but buf[0])