提交 f772ce53 authored 作者: Iban Harlouchet's avatar Iban Harlouchet

numpydoc for theano/sandbox/gpuarray/kernel_codegen.py

上级 d438c2d0
""" Helper routines for generating gpu kernels for nvcc.
""" """
Helper routines for generating gpu kernels for nvcc.
def nvcc_kernel(name, params, body): """
"""Return the c code of a kernel function.
:param params: the parameters to the function as one or more strings def nvcc_kernel(name, params, body):
"""
Return the c code of a kernel function.
:param body: the [nested] list of statements for the body of the Parameters
function. These will be separated by ';' characters. ----------
params
The parameters to the function as one or more strings.
body
The [nested] list of statements for the body of the function.
These will be separated by ';' characters.
""" """
paramstr = ', '.join(params) paramstr = ', '.join(params)
...@@ -28,7 +34,10 @@ def nvcc_kernel(name, params, body): ...@@ -28,7 +34,10 @@ def nvcc_kernel(name, params, body):
def code_version(version): def code_version(version):
"""decorator to support version-based cache mechanism""" """
Decorator to support version-based cache mechanism.
"""
if not isinstance(version, tuple): if not isinstance(version, tuple):
raise TypeError('version must be tuple', version) raise TypeError('version must be tuple', version)
...@@ -42,22 +51,31 @@ UNVERSIONED = () ...@@ -42,22 +51,31 @@ UNVERSIONED = ()
@code_version((1,)) @code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn): def inline_reduce(N, buf, pos, count, manner_fn):
"""Return C++ code for a function that reduces a contiguous buffer. """
Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer Parameters
:param pos: index of executing thread ----------
:param count: number of executing threads N
Length of the buffer.
:param manner_fn: a function that accepts strings of arguments a buf
and b, and returns c code for their reduction. (Example: Buffer pointer.
return "%(a)s + %(b)s" for a sum reduction). pos
Index of executing thread.
count
Number of executing threads.
manner_fn
A function that accepts strings of arguments a and b, and returns c code
for their reduction.
Example: return "%(a)s + %(b)s" for a sum reduction.
:postcondition: :postcondition:
This function leaves the answer in position 0 of the buffer. The This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function. rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times. Notes
-----
buf should be in gpu shared memory, we access it many times.
""" """
loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf)) loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
...@@ -126,19 +144,28 @@ def inline_reduce_prod(N, buf, pos, count): ...@@ -126,19 +144,28 @@ def inline_reduce_prod(N, buf, pos, count):
def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
""" """
:param N: length of the buffer Parameters
:param threadPos: index of executing thread ----------
:param threadCount: number of executing threads N
:param dtype: dtype of the softmax's output Length of the buffer.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
dtype
Dtype of the softmax's output.
:Precondition: buf and buf2 contain two identical copies of the input :Precondition: buf and buf2 contain two identical copies of the input
to softmax to softmax
:Postcondition: buf contains the softmax, buf2 contains un-normalized :Postcondition: buf contains the softmax, buf2 contains un-normalized
softmax softmax
:note: buf and buf2 should be in gpu shared memory, we access it many times Notes
-----
buf and buf2 should be in gpu shared memory, we access it many times.
We use __i as an int variable in a loop.
:note2: We use __i as an int variable in a loop
""" """
return [ return [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
...@@ -169,31 +196,48 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -169,31 +196,48 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_fn, manner_init, manner_fn, manner_init,
b='', stride_b='', load_b='', dtype='float32'): b='', stride_b='', load_b='', dtype='float32'):
"""Return C++ code for a function that reduces a contiguous buffer. """
Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(dtype) Parameters
:param x: input data ----------
:param stride_x: input data stride N
:param load_x: wrapper to read from x Length of the buffer.
:param pos: index of executing thread buf
:param count: number of executing threads Buffer pointer of size warpSize * sizeof(dtype).
:param b: Optional, pointer to the bias x
:param stride_b: Optional, the stride of b if b is provided Input data.
:param load_b: Optional, wrapper to read from b if b is provided stride_x
:param dtype: Optional, the dtype of the output Input data stride.
load_x
:param manner_fn: a function that accepts strings of arguments a Wrapper to read from x.
and b, and returns c code for their reduction. (Example: pos
return "%(a)s + %(b)s" for a sum reduction). Index of executing thread.
:param manner_init: a function that accepts strings of arguments a count
and return c code for its initialization Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
manner_fn
A function that accepts strings of arguments a and b, and returns c code
for their reduction.
Example: return "%(a)s + %(b)s" for a sum reduction.
manner_init
A function that accepts strings of arguments a and returns c code for its
initialization.
:postcondition: :postcondition:
This function leaves the answer in position 0 of the buffer. The This function leaves the answer in position 0 of the buffer. The rest of the
rest of the buffer is trashed by this function. buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times. Notes
-----
buf should be in gpu shared memory, we access it many times.
""" """
if b: if b:
...@@ -270,28 +314,47 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -270,28 +314,47 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
dtype="float32"): dtype="float32"):
""" """
:param N: length of the buffer, at least warpSize (32). Parameters
:param buf: a shared memory buffer of size warpSize * sizeof(dtype) ----------
:param x: a ptr to the gpu memory where the row is stored N
:param stride_x: the stride between each element in x Length of the buffer, at least warpSize (32).
:param load_x: wrapper to read from x buf
:param sm: a ptr to the gpu memory to store the result A shared memory buffer of size warpSize * sizeof(dtype).
:param sm_stride: the stride between each sm element x
:param write_sm: wrapper before writing to sm A ptr to the gpu memory where the row is stored.
:param threadPos: index of executing thread stride_x
:param threadCount: number of executing threads The stride between each element in x.
:param b: Optional, pointer to the bias load_x
:param stride_b: Optional, the stride of b if b is provided Wrapper to read from x.
:param load_b: Optional, wrapper to read from b if b is provided sm
:param dtype: Optional, the dtype of the softmax's output if not float32 A ptr to the gpu memory to store the result.
sm_stride
The stride between each sm element.
write_sm
Wrapper before writing to sm.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the softmax's output if not float32.
:Precondition: buf is empty :Precondition: buf is empty
:Postcondition: buf[0] contains the softmax, :Postcondition: buf[0] contains the softmax, buf2 contains un-normalized
buf2 contains un-normalized softmax softmax
Notes
-----
buf should be in gpu shared memory, we access it many times.
:note: buf should be in gpu shared memory, we access it many times. We use tx as an int variable in a loop.
:note2: We use tx as an int variable in a loop
""" """
ret = [ ret = [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论