提交 fd25c9c6 authored 作者: Iban Harlouchet's avatar Iban Harlouchet

numpydoc for theano/sandbox/cuda/kernel_codegen.py

上级 c489c64a
""" Helper routines for generating gpu kernels for nvcc.
""" """
Helper routines for generating gpu kernels for nvcc.
"""
def nvcc_kernel(name, params, body):
"""Return the c code of a kernel function.
:param params: the parameters to the function as one or more strings def nvcc_kernel(name, params, body):
"""
Return the c code of a kernel function.
:param body: the [nested] list of statements for the body of the Parameters
function. These will be separated by ';' characters. ----------
params
The parameters to the function as one or more strings.
body
The [nested] list of statements for the body of the
function. These will be separated by ';' characters.
""" """
paramstr = ', '.join(params) paramstr = ', '.join(params)
...@@ -29,7 +35,10 @@ def nvcc_kernel(name, params, body): ...@@ -29,7 +35,10 @@ def nvcc_kernel(name, params, body):
def code_version(version): def code_version(version):
"""decorator to support version-based cache mechanism""" """
Decorator to support version-based cache mechanism.
"""
if not isinstance(version, tuple): if not isinstance(version, tuple):
raise TypeError('version must be tuple', version) raise TypeError('version must be tuple', version)
...@@ -43,22 +52,31 @@ UNVERSIONED = () ...@@ -43,22 +52,31 @@ UNVERSIONED = ()
@code_version((1,)) @code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn): def inline_reduce(N, buf, pos, count, manner_fn):
"""Return C++ code for a function that reduces a contiguous buffer. """
Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer Parameters
:param pos: index of executing thread ----------
:param count: number of executing threads N
Length of the buffer.
:param manner_fn: a function that accepts strings of arguments a buf
Buffer pointer.
pos
Index of executing thread.
count
Number of executing threads.
manner_fn
A function that accepts strings of arguments a
and b, and returns c code for their reduction. (Example: and b, and returns c code for their reduction. (Example:
return "%(a)s + %(b)s" for a sum reduction). return "%(a)s + %(b)s" for a sum reduction).
:postcondition: :postcondition:
This function leaves the answer in position 0 of the buffer. The This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function. rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times. Notes
-----
buf should be in gpu shared memory, we access it many times.
""" """
loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf)) loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
...@@ -127,18 +145,26 @@ def inline_reduce_prod(N, buf, pos, count): ...@@ -127,18 +145,26 @@ def inline_reduce_prod(N, buf, pos, count):
def inline_softmax(N, buf, buf2, threadPos, threadCount): def inline_softmax(N, buf, buf2, threadPos, threadCount):
""" """
:param N: length of the buffer Parameters
:param threadPos: index of executing thread ----------
:param threadCount: number of executing threads N
Length of the buffer.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
:Precondition: buf and buf2 contain two identical copies of the input :Precondition: buf and buf2 contain two identical copies of the input
to softmax to softmax
:Postcondition: buf contains the softmax, buf2 contains un-normalized :Postcondition: buf contains the softmax, buf2 contains un-normalized
softmax softmax
:note: buf and buf2 should be in gpu shared memory, we access it many times Notes
-----
buf and buf2 should be in gpu shared memory, we access it many times.
We use __i as an int variable in a loop.
:note2: We use __i as an int variable in a loop
""" """
return [ return [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
...@@ -169,26 +195,38 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount): ...@@ -169,26 +195,38 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
manner_fn, manner_init, manner_fn, manner_init,
b='', stride_b=''): b='', stride_b=''):
"""Return C++ code for a function that reduces a contiguous buffer. """
Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(float) Parameters
:param pos: index of executing thread ----------
:param count: number of executing threads N
:param b: Optional, pointer to the bias Length of the buffer.
:param stride_b: Optional, the stride of b if b is provided buf
Buffer pointer of size warpSize * sizeof(float).
:param manner_fn: a function that accepts strings of arguments a pos
Index of executing thread.
count
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
manner_fn
A function that accepts strings of arguments a
and b, and returns c code for their reduction. (Example: and b, and returns c code for their reduction. (Example:
return "%(a)s + %(b)s" for a sum reduction). return "%(a)s + %(b)s" for a sum reduction).
:param manner_init: a function that accepts strings of arguments a manner_init
and return c code for its initialization A function that accepts strings of arguments a
and returns c code for its initialization.
:postcondition: :postcondition:
This function leaves the answer in position 0 of the buffer. The This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function. rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times. Notes
-----
buf should be in gpu shared memory, we access it many times.
""" """
if b: if b:
...@@ -263,24 +301,39 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, ...@@ -263,24 +301,39 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
b='', stride_b=''): b='', stride_b=''):
""" """
:param N: length of the buffer, atleast waprSize(32). Parameters
:param buf: a shared memory buffer of size warpSize * sizeof(float) ----------
:param x: a ptr to the gpu memory where the row is stored N
:param stride_x: the stride between each element in x Length of the buffer, at least warpSize (32).
:param sm: a ptr to the gpu memory to store the result buf
:param sm_stride: the stride between eash sm element A shared memory buffer of size warpSize * sizeof(float).
:param threadPos: index of executing thread x
:param threadCount: number of executing threads A ptr to the gpu memory where the row is stored.
:param b: Optional, pointer to the bias stride_x
:param stride_b: Optional, the stride of b if b is provided The stride between each element in x.
sm
A ptr to the gpu memory to store the result.
sm_stride
The stride between each sm element.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
:Precondition: buf is empty :Precondition: buf is empty
:Postcondition: buf[0] contains the softmax, :Postcondition: buf[0] contains the softmax,
buf2 contains un-normalized softmax buf2 contains un-normalized softmax
:note: buf should be in gpu shared memory, we access it many times. Notes
-----
buf should be in gpu shared memory, we access it many times.
We use tx as an int variable in a loop.
:note2: We use tx as an int variable in a loop
""" """
ret = [ ret = [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论