numpydoc for theano/sandbox/gpuarray/kernel_codegen.py

f772ce53 · Iban Harlouchet · d438c2d0 · f772ce53
--- a/theano/sandbox/gpuarray/kernel_codegen.py
+++ b/theano/sandbox/gpuarray/kernel_codegen.py
-""" Helper routines for generating gpu kernels for nvcc.
 """
+Helper routines for generating gpu kernels for nvcc.
-def nvcc_kernel(name, params, body):
+"""
-    """Return the c code of a kernel function.
-    :param params: the parameters to the function as one or more strings
+def nvcc_kernel(name, params, body):
+    """
+    Return the c code of a kernel function.
-    :param body: the [nested] list of statements for the body of the
+    Parameters
-         function.  These will be separated by ';' characters.
+    ----------
+    params
+        The parameters to the function as one or more strings.
+    body
+        The [nested] list of statements for the body of the function.
+        These will be separated by ';' characters.
    """
    paramstr = ', '.join(params)
@@ -28,7 +34,10 @@ def nvcc_kernel(name, params, body):
 def code_version(version):
-    """decorator to support version-based cache mechanism"""
+    """
+    Decorator to support version-based cache mechanism.
+    """
    if not isinstance(version, tuple):
        raise TypeError('version must be tuple', version)
@@ -42,22 +51,31 @@ UNVERSIONED = ()
 @code_version((1,))
 def inline_reduce(N, buf, pos, count, manner_fn):
-    """Return C++ code for a function that reduces a contiguous buffer.
+    """
+    Return C++ code for a function that reduces a contiguous buffer.
-    :param N: length of the buffer
-    :param buf: buffer pointer
+    Parameters
-    :param pos: index of executing thread
+    ----------
-    :param count: number of executing threads
+    N
+        Length of the buffer.
-    :param manner_fn: a function that accepts strings of arguments a
+    buf
-        and b, and returns c code for their reduction. (Example:
+        Buffer pointer.
-        return "%(a)s + %(b)s" for a sum reduction).
+    pos
+        Index of executing thread.
+    count
+        Number of executing threads.
+    manner_fn
+        A function that accepts strings of arguments a and b, and returns c code
+        for their reduction.
+        Example: return "%(a)s + %(b)s" for a sum reduction.
    :postcondition:
    This function leaves the answer in position 0 of the buffer. The
    rest of the buffer is trashed by this function.
-    :note: buf should be in gpu shared memory, we access it many times.
+    Notes
+    -----
+    buf should be in gpu shared memory, we access it many times.
    """
    loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
@@ -126,19 +144,28 @@ def inline_reduce_prod(N, buf, pos, count):
 def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
    """
-    :param N: length of the buffer
+    Parameters
-    :param threadPos: index of executing thread
+    ----------
-    :param threadCount: number of executing threads
+    N
-    :param dtype: dtype of the softmax's output
+        Length of the buffer.
+    threadPos
+        Index of executing thread.
+    threadCount
+        Number of executing threads.
+    dtype
+        Dtype of the softmax's output.
    :Precondition: buf and buf2 contain two identical copies of the input
        to softmax
    :Postcondition: buf contains the softmax, buf2 contains un-normalized
        softmax
-    :note: buf and buf2 should be in gpu shared memory, we access it many times
+    Notes
+    -----
+    buf and buf2 should be in gpu shared memory, we access it many times.
+    We use __i as an int variable in a loop.
-    :note2: We use __i as an int variable in a loop
    """
    return [
            # get max of buf (trashing all but buf[0])
@@ -169,31 +196,48 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
 def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
                               manner_fn, manner_init,
                               b='', stride_b='', load_b='', dtype='float32'):
-    """Return C++ code for a function that reduces a contiguous buffer.
+    """
+    Return C++ code for a function that reduces a contiguous buffer.
-    :param N: length of the buffer
-    :param buf: buffer pointer of size warpSize * sizeof(dtype)
+    Parameters
-    :param x: input data
+    ----------
-    :param stride_x: input data stride
+    N
-    :param load_x: wrapper to read from x
+        Length of the buffer.
-    :param pos: index of executing thread
+    buf
-    :param count: number of executing threads
+        Buffer pointer of size warpSize * sizeof(dtype).
-    :param b: Optional, pointer to the bias
+    x
-    :param stride_b: Optional, the stride of b if b is provided
+        Input data.
-    :param load_b: Optional, wrapper to read from b if b is provided
+    stride_x
-    :param dtype: Optional, the dtype of the output
+        Input data stride.
+    load_x
-    :param manner_fn: a function that accepts strings of arguments a
+        Wrapper to read from x.
-        and b, and returns c code for their reduction. (Example:
+    pos
-        return "%(a)s + %(b)s" for a sum reduction).
+        Index of executing thread.
-    :param manner_init: a function that accepts strings of arguments a
+    count
-        and return c code for its initialization
+        Number of executing threads.
+    b
+        Optional, pointer to the bias.
+    stride_b
+        Optional, the stride of b if b is provided.
+    load_b
+        Optional, wrapper to read from b if b is provided.
+    dtype
+        Optional, the dtype of the output.
+    manner_fn
+        A function that accepts strings of arguments a and b, and returns c code
+        for their reduction.
+        Example: return "%(a)s + %(b)s" for a sum reduction.
+    manner_init
+        A function that accepts strings of arguments a and returns c code for
+        its initialization.
    :postcondition:
-    This function leaves the answer in position 0 of the buffer.  The
+    This function leaves the answer in position 0 of the buffer. The rest of the
-    rest of the buffer is trashed by this function.
+    buffer is trashed by this function.
-    :note: buf should be in gpu shared memory, we access it many times.
+    Notes
+    -----
+    buf should be in gpu shared memory, we access it many times.
    """
    if b:
@@ -270,28 +314,47 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
                                dtype="float32"):
    """
-    :param N: length of the buffer, atleast waprSize(32).
+    Parameters
-    :param buf: a shared memory buffer of size warpSize * sizeof(dtype)
+    ----------
-    :param x: a ptr to the gpu memory where the row is stored
+    N
-    :param stride_x: the stride between each element in x
+        Length of the buffer, at least warpSize (32).
-    :param load_x: wrapper to read from x
+    buf
-    :param sm: a ptr to the gpu memory to store the result
+        A shared memory buffer of size warpSize * sizeof(dtype).
-    :param sm_stride: the stride between eash sm element
+    x
-    :param write_sm: wrapper before writing to sm
+        A ptr to the gpu memory where the row is stored.
-    :param threadPos: index of executing thread
+    stride_x
-    :param threadCount: number of executing threads
+        The stride between each element in x.
-    :param b: Optional, pointer to the bias
+    load_x
-    :param stride_b: Optional, the stride of b if b is provided
+        Wrapper to read from x.
-    :param load_b: Optional, wrapper to read from b if b is provided
+    sm
-    :param dtype: Optional, the dtype of the softmax's output if not float32
+        A ptr to the gpu memory to store the result.
+    sm_stride
+        The stride between each sm element.
+    write_sm
+        Wrapper before writing to sm.
+    threadPos
+        Index of executing thread.
+    threadCount
+        Number of executing threads.
+    b
+        Optional, pointer to the bias.
+    stride_b
+        Optional, the stride of b if b is provided.
+    load_b
+        Optional, wrapper to read from b if b is provided.
+    dtype
+        Optional, the dtype of the softmax's output if not float32.
    :Precondition: buf is empty
-    :Postcondition: buf[0] contains the softmax,
+    :Postcondition: buf[0] contains the softmax, buf2 contains un-normalized
-        buf2 contains un-normalized softmax
+        softmax
+    Notes
+    -----
+    buf should be in gpu shared memory, we access it many times.
-    :note: buf should be in gpu shared memory, we access it many times.
+    We use tx as an int variable in a loop.
-    :note2: We use tx as an int variable in a loop
    """
    ret = [
        # get max of buf (trashing all but buf[0])