Commit 931f4e93 authored by: abergeron

Merge pull request #3297 from harlouci/numpydoc_sandbox_2

Numpydoc sandbox 2
......@@ -12,7 +12,10 @@ from theano.sandbox.cuda import CudaNdarrayType, GpuOp
class GpuConv3D(GpuOp):
""" GPU implementation of Conv3D """
"""
GPU implementation of Conv3D.
"""
def __eq__(self, other):
return type(self) == type(other)
......@@ -25,10 +28,18 @@ class GpuConv3D(GpuOp):
def make_node(self, V, W, b, d):
"""
:param V: Visible unit, input
:param W: Weights, filter
:param b: bias
:param d: strides when moving the filter over the input
Parameters
----------
V
Visible unit, input.
W
Weights, filter.
b
Bias.
d
Strides when moving the filter over the input.
"""
V_ = as_cuda_ndarray_variable(V)
W_ = as_cuda_ndarray_variable(W)
......
......@@ -15,14 +15,25 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
class GpuConvGrad3D(GpuOp):
""" GPU version of gradient of ConvGrad3D with respect to W """
"""
GPU version of gradient of ConvGrad3D with respect to W.
"""
def make_node(self, V, d, WShape, dCdH):
"""
:param V: visible
:param d: strides
:param WShape: shapes of the weights -> shape of this op output
:param dCdH: other input with what V will be convolved.
Parameters
----------
V
Visible.
d
Strides.
WShape
Shapes of the weights -> shape of this op output.
dCdH
Other input with what V will be convolved.
"""
V_ = as_cuda_ndarray_variable(V)
d_ = T.as_tensor_variable(d)
......
......@@ -16,7 +16,11 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
class GpuConvTransp3D(GpuOp):
""" The gpu version of ConvTransp3D """
"""
The gpu version of ConvTransp3D.
"""
def __eq__(self, other):
return type(self) == type(other)
......
......@@ -94,12 +94,14 @@ cuda_enabled = False
# Code factorized within a function so that it may be called from multiple
# places (which is not currently the case, but may be useful in the future).
def set_cuda_disabled():
"""Function used to disable cuda.
"""
Function used to disable cuda.
A warning is displayed, so that the user is aware that cuda-based code is
not going to work.
Note that there is no point calling this function from outside of
`cuda.__init__`, since it has no effect once the module is loaded.
"""
global cuda_available, cuda_warning_is_displayed
cuda_available = False
......@@ -116,8 +118,9 @@ libcuda_ndarray_so = os.path.join(cuda_ndarray_loc,
def try_import():
"""
load the cuda_ndarray module if present and up to date
return True if loaded correctly, otherwise return False
Load the cuda_ndarray module if present and up to date.
Return True if loaded correctly, otherwise return False.
"""
cuda_files = (
'cuda_ndarray.cu',
......@@ -219,6 +222,7 @@ if cuda_available:
def ok():
"""
Check if an existing library exists and can be read.
"""
try:
open(libcuda_ndarray_so).close()
......@@ -266,6 +270,7 @@ class GpuOp(theano.gof.Op):
It is defined in __init__.py so that it exists even when `cuda_available`
is False (this is necessary to avoid breaking the test suite).
"""
def make_thunk(self, node, storage_map, compute_map, no_recycling):
......@@ -331,18 +336,23 @@ def use(device,
test_driver=True):
"""
Error and warning about CUDA should be displayed only when this
function is called. We need to be able to load this module only
function is called. We need to be able to load this module only
to check if it is available!
:param device: string "cpu", "gpu", "gpuN" (N is the device number to use)
:param force: Will always raise an exception if we can't use the gpu.
:param default_to_move_computation_to_gpu: If gpu init succeeded, enable by
default optimizations to move
computations to the gpu
:param move_shared_float32_to_gpu: If gpu init succeeded, put new shared
variables in float32 on the gpu.
:param enable_cuda: If the gpu is correctly enabled,
set the variable cuda_enabled to True.
Parameters
----------
device : string
"cpu", "gpu", "gpuN" (N is the device number to use).
force
Will always raise an exception if we can't use the gpu.
default_to_move_computation_to_gpu
If gpu init succeeded, enable by default optimizations to move
computations to the gpu.
move_shared_float32_to_gpu
If gpu init succeeded, put new shared variables in float32 on the gpu.
enable_cuda
If the gpu is correctly enabled, set the variable cuda_enabled to True.
"""
global cuda_enabled, cuda_initialization_error_message
if force and not cuda_available and device.startswith('gpu'):
......@@ -480,7 +490,7 @@ use.device_number = None
def unuse():
"""
This undo what was done by the call to
This undoes what was done by the call to
use('gpu[0-9]', default_to_move_computation_to_gpu=True,
move_shared_float32_to_gpu=True,
......@@ -488,7 +498,9 @@ def unuse():
This is used in Pylearn2 tests to enable/disable the GPU when needed.
After this call, the rest of Theano think the GPU shouldn't be used by default.
After this call, the rest of Theano thinks the GPU shouldn't be used by
default.
"""
global cuda_enabled
cuda_enabled = False
......@@ -502,9 +514,11 @@ def unuse():
def handle_shared_float32(tf):
"""Set the default shared type for float32 tensor to CudaNdarrayType
"""
Set the default shared type for float32 tensor to CudaNdarrayType.
This function is intended to be called from use(gpu_index), not directly.
"""
if tf:
theano.compile.shared_constructor(float32_shared_constructor)
......
"""This file implement 3 different version of the elemwise op on the
"""
This file implements 3 different versions of the elemwise op on the
gpu. Only NaiveAlgo is used and it is not very naive now.
The elemwise functions are also used with scalar operations! So it can happen
......@@ -40,12 +41,25 @@ def get_str_list_logical_scalar(node, value_str='ii_i%i_value',
class SupportCodeError(Exception):
"""It is currently not possible to auto-generate a GPU implementation for
"""
It is currently not possible to auto-generate a GPU implementation for
an elementwise Op with c_support_code_apply().
But we support Op.c_support_code."""
But we support Op.c_support_code.
"""
class NaiveAlgo(object):
"""
Parameters
----------
scalar_op
The scalar operation to execute on each element.
sync
If True, will wait after the kernel launch and check for error call.
"""
verbose = 0 # 1, 2 or 3 for more verbose output.
@property
......@@ -57,10 +71,6 @@ class NaiveAlgo(object):
return ver
def __init__(self, scalar_op, sync=True, inplace_pattern=None):
"""
:param scalar_op: the scalar operation to execute on each element.
:param sync: if True, will wait after the kernel launch and check for error call.
"""
if inplace_pattern is None:
inplace_pattern = {}
try:
......@@ -154,8 +164,10 @@ class NaiveAlgo(object):
return sio.getvalue()
def c_src_kernel_tiling(self, node, nodename):
""" The kernel applies to problems with <= 5 dimensions """
"""
The kernel applies to problems with <= 5 dimensions.
"""
# The kernel is intended to be structured roughly like this:
"""
static __global__ void kernel()
......@@ -278,8 +290,10 @@ class NaiveAlgo(object):
return sio.getvalue()
def c_src_kernel_tiling_less_registers(self, node, nodename):
""" The kernel applies to problems with <= 5 dimensions """
"""
The kernel applies to problems with <= 5 dimensions.
"""
nd = node.outputs[0].type.ndim
n_in = len(node.inputs)
n_out = len(node.outputs)
......@@ -1049,12 +1063,16 @@ class ErfinvGPU(Erfinv):
"""
Provides a c-code implementation of the inverse error function for GPU.
Note: We do not add this c_code to theano.scalar.basic_scipy.Erfinv, as we
Notes
-----
We do not add this c_code to theano.scalar.basic_scipy.Erfinv, as we
currently rely on Nvidia's cublas library to provide the erfinv
c-implementation (which requires different c_headers). As it stands,
theano.scalar.basic_scipy.Erfinv does not have c_code as scipy does not
export the required C function
export the required C function.
"""
def c_headers(self):
return ['math_functions.h', 'cublas_v2.h']
......@@ -1070,14 +1088,19 @@ erfinv_gpu = ErfinvGPU(upgrade_to_float_no_complex, name='erfinv_gpu')
class ErfcxGPU(Erfinv):
"""
Provides a c-code implementation of the scaled complementary error function for GPU.
Provides a c-code implementation of the scaled complementary error function
for GPU.
Note: We do not add this c_code to theano.scalar.basic_scipy.Erfcx, as we
Notes
-----
We do not add this c_code to theano.scalar.basic_scipy.Erfcx, as we
currently rely on Nvidia's cublas library to provide the erfcx
c-implementation (which requires different c_headers). As it stands,
theano.scalar.basic_scipy.Erfcx does not have c_code as scipy does not
export the required C function
export the required C function.
"""
def c_headers(self):
return ['math_functions.h', 'cublas_v2.h']
......@@ -1088,4 +1111,4 @@ class ErfcxGPU(Erfinv):
raise NotImplementedError('type not supported', type)
return "%(z)s = erfcx(%(x)s);" % locals()
erfcx_gpu = ErfcxGPU(upgrade_to_float_no_complex, name='erfcx_gpu')
\ No newline at end of file
erfcx_gpu = ErfcxGPU(upgrade_to_float_no_complex, name='erfcx_gpu')
......@@ -13,13 +13,19 @@ if cuda_available:
class GpuCumsum(CumsumOp, GpuOp):
"""
Parameters
----------
axis
Can not be None. If you want the array flatten, do it before.
"""
SUPPORTED_NDIMS = 3
__props__ = ('axis', 'max_threads_dim0', 'max_grid_size1', 'max_grid_size2')
def __init__(self, axis):
"""
``axis`` can not be None. If you want the array flatten, do it before.
"""
self.axis = axis
self.max_threads_dim0 = None
self.max_grid_size1 = None
......@@ -415,11 +421,13 @@ class GpuCumsum(CumsumOp, GpuOp):
def values_eq_approx_high_tol(a, b):
"""This fct is needed to don't have DebugMode raise useless
"""
This function is needed so that DebugMode does not raise useless
errors due to rounding errors.
This happens with big input sizes due to a change in the order of
operations.
"""
rtol = None
if a.size > 100000:
......
......@@ -171,10 +171,11 @@ class CuIFFTOp(ScikitsCudaOp):
def to_complex_gpuarray(x, copyif=False):
"""
adapted version of theano.misc.pycuda_utils.to_gpuarray that takes
Adapted version of theano.misc.pycuda_utils.to_gpuarray that takes
an array with an extra trailing dimension of length 2 for
real/imaginary parts, and turns it into a complex64 PyCUDA
GPUArray.
"""
if not isinstance(x, CudaNdarray):
raise ValueError("We can transfer only CudaNdarray "
......@@ -213,7 +214,8 @@ def bptrs(a):
"""
Pointer array when input represents a batch of matrices.
taken from scikits.cuda tests/test_cublas.py
Taken from scikits.cuda tests/test_cublas.py.
"""
return pycuda.gpuarray.arange(a.ptr, a.ptr + a.shape[0] * a.strides[0],
a.strides[0], dtype=cublas.ctypes.c_void_p)
......@@ -222,8 +224,9 @@ def bptrs(a):
def sc_complex_dot_batched(bx_gpu, by_gpu, bc_gpu, transa='N', transb='N',
handle=None):
"""
uses cublasCgemmBatched to compute a bunch of complex dot products
in parallel
Uses cublasCgemmBatched to compute a bunch of complex dot products
in parallel.
"""
if handle is None:
handle = scikits.cuda.misc._global_cublas_handle
......@@ -292,7 +295,9 @@ class BatchedComplexDotOp(ScikitsCudaOp):
"""
This version uses cublasCgemmBatched under the hood, instead of
doing multiple cublasCgemm calls.
"""
def make_node(self, inp1, inp2):
inp1 = basic_ops.gpu_contiguous(
basic_ops.as_cuda_ndarray_variable(inp1))
......@@ -355,10 +360,15 @@ batched_complex_dot = BatchedComplexDotOp()
def mult_and_reduce(input_fft_v, filters_fft_v, input_shape=None,
filter_shape=None):
"""
input_fft_v is (b, ic, i0, i1//2 + 1, 2)
filters_fft_v is (oc, ic, i0, i1//2 + 1, 2)
"""
Parameters
----------
input_fft_v
It's (b, ic, i0, i1//2 + 1, 2).
filters_fft_v
It's (oc, ic, i0, i1//2 + 1, 2).
"""
if input_shape is None:
input_shape = input_fft_v.shape # symbolic
......@@ -405,16 +415,19 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
On valid mode the filters must be smaller than the input.
input: (b, ic, i0, i1)
filters: (oc, ic, f0, f1)
border_mode: 'valid' of 'full'
Parameters
----------
input
(b, ic, i0, i1).
filters
(oc, ic, f0, f1).
border_mode : {'valid', 'full'}
pad_last_dim
Unconditionally pad the last dimension of the input
to turn it from odd to even. Will strip the
padding before returning the result.
pad_last_dim: Unconditionally pad the last dimension of the input
to to turn it from odd to even. Will strip the
padding before returning the result.
"""
# use symbolic shapes to compute shape info at runtime if not specified
if image_shape is None:
image_shape = input.shape
......@@ -546,16 +559,19 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
On valid mode the filters must be smaller than the input.
input: (b, ic, i0, i1, i2)
filters: (oc, ic, f0, f1, i2)
border_mode: 'valid' of 'full'
Parameters
----------
input
(b, ic, i0, i1, i2).
filters
(oc, ic, f0, f1, i2).
border_mode : {'valid', 'full'}.
pad_last_dim
Unconditionally pad the last dimension of the input
to turn it from odd to even. Will strip the
padding before returning the result.
pad_last_dim: Unconditionally pad the last dimension of the input
to to turn it from odd to even. Will strip the
padding before returning the result.
"""
# use symbolic shapes to compute shape info at runtime if not specified
if image_shape is None:
image_shape = input.shape
......@@ -670,5 +686,3 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
# output should now be the result of a batched valid convolution
# of the input with the filters.
return basic_ops.as_cuda_ndarray_variable(output)
""" Helper routines for generating gpu kernels for nvcc.
"""
Helper routines for generating gpu kernels for nvcc.
"""
def nvcc_kernel(name, params, body):
"""Return the c code of a kernel function.
:param params: the parameters to the function as one or more strings
def nvcc_kernel(name, params, body):
"""
Return the c code of a kernel function.
:param body: the [nested] list of statements for the body of the
function. These will be separated by ';' characters.
Parameters
----------
params
The parameters to the function as one or more strings.
body
The [nested] list of statements for the body of the
function. These will be separated by ';' characters.
"""
paramstr = ', '.join(params)
......@@ -29,7 +35,10 @@ def nvcc_kernel(name, params, body):
def code_version(version):
"""decorator to support version-based cache mechanism"""
"""
Decorator to support version-based cache mechanism.
"""
if not isinstance(version, tuple):
raise TypeError('version must be tuple', version)
......@@ -43,22 +52,31 @@ UNVERSIONED = ()
@code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn):
"""Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer
:param pos: index of executing thread
:param count: number of executing threads
:param manner_fn: a function that accepts strings of arguments a
"""
Return C++ code for a function that reduces a contiguous buffer.
Parameters
----------
N
Length of the buffer.
buf
Buffer pointer.
pos
Index of executing thread.
count
Number of executing threads.
manner_fn
A function that accepts strings of arguments a
and b, and returns c code for their reduction. (Example:
return "%(a)s + %(b)s" for a sum reduction).
:postcondition:
This function leaves the answer in position 0 of the buffer. The
This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times.
Notes
-----
buf should be in gpu shared memory, we access it many times.
"""
loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
......@@ -127,18 +145,26 @@ def inline_reduce_prod(N, buf, pos, count):
def inline_softmax(N, buf, buf2, threadPos, threadCount):
"""
:param N: length of the buffer
:param threadPos: index of executing thread
:param threadCount: number of executing threads
Parameters
----------
N
Length of the buffer.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
:Precondition: buf and buf2 contain two identical copies of the input
to softmax
:Postcondition: buf contains the softmax, buf2 contains un-normalized
softmax
:note: buf and buf2 should be in gpu shared memory, we access it many times
Notes
-----
buf and buf2 should be in gpu shared memory, we access it many times.
We use __i as an int variable in a loop.
:note2: We use __i as an int variable in a loop
"""
return [
# get max of buf (trashing all but buf[0])
......@@ -169,26 +195,38 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
manner_fn, manner_init,
b='', stride_b=''):
"""Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(float)
:param pos: index of executing thread
:param count: number of executing threads
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:param manner_fn: a function that accepts strings of arguments a
"""
Return C++ code for a function that reduces a contiguous buffer.
Parameters
----------
N
Length of the buffer.
buf
Buffer pointer of size warpSize * sizeof(float).
pos
Index of executing thread.
count
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
manner_fn
A function that accepts strings of arguments a
and b, and returns c code for their reduction. (Example:
return "%(a)s + %(b)s" for a sum reduction).
:param manner_init: a function that accepts strings of arguments a
and return c code for its initialization
manner_init
A function that accepts strings of arguments a
and return c code for its initialization.
:postcondition:
This function leaves the answer in position 0 of the buffer. The
This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times.
Notes
-----
buf should be in gpu shared memory, we access it many times.
"""
if b:
......@@ -263,24 +301,39 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
b='', stride_b=''):
"""
:param N: length of the buffer, atleast waprSize(32).
:param buf: a shared memory buffer of size warpSize * sizeof(float)
:param x: a ptr to the gpu memory where the row is stored
:param stride_x: the stride between each element in x
:param sm: a ptr to the gpu memory to store the result
:param sm_stride: the stride between eash sm element
:param threadPos: index of executing thread
:param threadCount: number of executing threads
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
Parameters
----------
N
Length of the buffer, at least warpSize(32).
buf
A shared memory buffer of size warpSize * sizeof(float).
x
A ptr to the gpu memory where the row is stored.
stride_x
The stride between each element in x.
sm
A ptr to the gpu memory to store the result.
sm_stride
The stride between each sm element.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
:Precondition: buf is empty
:Postcondition: buf[0] contains the softmax,
buf2 contains un-normalized softmax
:note: buf should be in gpu shared memory, we access it many times.
Notes
-----
buf should be in gpu shared memory, we access it many times.
We use tx as an int variable in a loop.
:note2: We use tx as an int variable in a loop
"""
ret = [
# get max of buf (trashing all but buf[0])
......
......@@ -13,7 +13,9 @@ from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
"""
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
"""
nin = 3
nout = 3
......@@ -224,7 +226,9 @@ gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1Ho
class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
"""
Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
"""
nin = 3
nout = 1
"""Gradient wrt x of the CrossentropySoftmax1Hot Op"""
......@@ -393,7 +397,9 @@ gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasD
class GpuSoftmax(GpuOp):
"""
Implement Softmax on the gpu.
"""
def __eq__(self, other):
return type(self) == type(other)
......@@ -555,7 +561,9 @@ gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias(GpuOp):
"""
Implement SoftmaxWithBias on the gpu.
"""
nin = 2
nout = 1
......
......@@ -85,7 +85,10 @@ nvcc_version = None
def is_nvcc_available():
"""Return True iff the nvcc compiler is found."""
"""
Return True iff the nvcc compiler is found.
"""
def set_version():
p_out = output_subprocess_Popen([nvcc_path, '--version'])
ver_line = decode(p_out[0]).strip().split('\n')[-1]
......@@ -150,6 +153,7 @@ class NVCC_compiler(Compiler):
"""
These args will be received by compile_str() in the preargs parameter.
They will also be included in the "hard" part of the key module.
"""
flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
if config.nvcc.fastmath:
......@@ -209,33 +213,47 @@ class NVCC_compiler(Compiler):
module_name, src_code,
location=None, include_dirs=[], lib_dirs=[], libs=[], preargs=[],
rpaths=rpath_defaults, py_module=True, hide_symbols=True):
""":param module_name: string (this has been embedded in the src_code
:param src_code: a complete c or c++ source listing for the module
:param location: a pre-existing filesystem directory where the
cpp file and .so will be written
:param include_dirs: a list of include directory names
(each gets prefixed with -I)
:param lib_dirs: a list of library search path directory names
(each gets prefixed with -L)
:param libs: a list of libraries to link with
(each gets prefixed with -l)
:param preargs: a list of extra compiler arguments
:param rpaths: list of rpaths to use with Xlinker.
Defaults to `rpath_defaults`.
:param py_module: if False, compile to a shared library, but
do not import as a Python module.
:param hide_symbols: if True (the default), hide all symbols
from the library symbol table unless explicitely exported.
"""
:returns: dynamically-imported python module of the compiled code.
Parameters
----------
module_name: str
This has been embedded in the src_code.
src_code
A complete c or c++ source listing for the module.
location
A pre-existing filesystem directory where the
cpp file and .so will be written.
include_dirs
A list of include directory names (each gets prefixed with -I).
lib_dirs
A list of library search path directory names (each gets
prefixed with -L).
libs
A list of libraries to link with (each gets prefixed with -l).
preargs
A list of extra compiler arguments.
rpaths
List of rpaths to use with Xlinker. Defaults to `rpath_defaults`.
py_module
If False, compile to a shared library, but
do not import as a Python module.
hide_symbols
If True (the default), hide all symbols from the library symbol
table unless explicitly exported.
Returns
-------
module
Dynamically-imported python module of the compiled code.
(unless py_module is False, in that case returns None.)
:note 1: On Windows 7 with nvcc 3.1 we need to compile in the
real directory Otherwise nvcc never finish.
Notes
-----
On Windows 7 with nvcc 3.1 we need to compile in the real directory.
Otherwise, nvcc never finishes.
"""
rpaths = list(rpaths)
if sys.platform == "win32":
......
......@@ -141,7 +141,9 @@ class InputToGpuOptimizer(Optimizer):
Transfer the input of a graph to the gpu if it is necessary.
It should make this part of the optimizer faster; we will need only 1
pass on the fgraph.
"""
def __init__(self):
Optimizer.__init__(self)
......@@ -208,7 +210,10 @@ def dtype_in_elemwise_supported(op):
Return True if the Elemwise op is supported on the gpu.
Return False otherwise.
:note: We need to check inside the Composite op.
Notes
-----
We need to check inside the Composite op.
"""
def get_all_basic_scalar(composite_op):
l = []
......@@ -231,8 +236,10 @@ def dtype_in_elemwise_supported(op):
@register_opt()
@local_optimizer([tensor.Elemwise])
def local_gpu_elemwise_0(node):
"""elemwise(..., host_from_gpu, ...)
-> host_from_gpu(elemwise(gpu_from_host, ..., gpu_from_host)
"""
Elemwise(..., host_from_gpu, ...)
-> host_from_gpu(elemwise(gpu_from_host, ..., gpu_from_host)
"""
if (isinstance(node.op, tensor.Elemwise) and
dtype_in_elemwise_supported(node.op)):
......@@ -294,6 +301,7 @@ def local_gpu_elemwise_0(node):
def local_gpu_elemwise_1(node):
"""
gpu_from_host(Elemwise)) -> GpuElemwise(gpu_from_host(...))
"""
if isinstance(node.op, GpuFromHost):
host_i, = node.inputs
......@@ -350,6 +358,7 @@ def local_gpu_dimshuffle_0(node):
"""
dimshuffle(host_from_gpu()) -> host_from_gpu(gpu_dimshuffle)
gpu_from_host(dimshuffle) -> gpu_dimshuffle(gpu_from_host)
"""
if isinstance(node.op, tensor.DimShuffle):
input, = node.inputs
......@@ -375,6 +384,7 @@ def local_gpu_specifyShape_0(node):
"""
specify_shape(host_from_gpu()) -> host_from_gpu(specify_shape)
gpu_from_host(specify_shape) -> specify_shape(gpu_from_host)
"""
if isinstance(node.op, tensor.SpecifyShape):
input = node.inputs[0]
......@@ -403,11 +413,11 @@ def local_gpu_dot_to_dot22(node):
transforming the vector into a matrix, apply gpudot22 and reshaping
the output.
A more suitable solution would be to use the right cublas call
A more suitable solution would be to use the right cublas call.
This is needed in fast_compile
"""
This is needed in fast_compile.
"""
# In case the dot does an input upcast, we must check that we can
# make it run on the gpu.
if isinstance(node.op, GpuFromHost):
......@@ -482,10 +492,11 @@ theano.compile.optdb.register('assert_no_cpu_op', assert_no_cpu_op, 49.2)
@register_opt()
@local_optimizer([theano.ifelse.IfElse, gpu_from_host])
def local_gpu_lazy_ifelse(node):
"""
"""
gpu_from_host(ifelse) -> gpu_ifelse(gpu_from_host)
ifelse(host_from_gpu) -> host_from_gpu(ifelse)
"""
if isinstance(node.op, theano.ifelse.IfElse) and not node.op.gpu:
gpu_ifelse = theano.ifelse.IfElse(node.op.n_outs, gpu=True)
......@@ -554,6 +565,7 @@ def local_gpu_dot22(node):
gpu_from_host(dot22) -> gpudot(gpu_from_host)
dot(host_from_gpu) -> host_from_gpu(gpudot22)
"""
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
......@@ -577,6 +589,7 @@ def local_gpu_dot22scalar(node):
gpu_from_host(dot22scalar) -> gpudot(gpu_from_host)
dot(host_from_gpu) -> host_from_gpu(gpudot22scalar)
"""
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
......@@ -602,7 +615,9 @@ def local_gpu_dot22scalar(node):
def local_gpu_solve(node):
"""
gpu_from_host(CpuSolve) -> GpuSolve(gpu_from_host)
CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)
"""
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
......@@ -627,6 +642,7 @@ def local_gpu_solve(node):
def local_gpu_gemv(node):
"""
gpu_from_host(gemv) -> gpu_gemv(gpu_from_host)
gemv(host_from_gpu) -> host_from_gpu(gpu_gemv)
"""
......@@ -665,6 +681,7 @@ def local_gpu_gemv(node):
def local_gpu_ger(node):
"""
gpu_from_host(ger) -> gpu_ger(gpu_from_host)
ger(host_from_gpu) -> host_from_gpu(gpu_ger)
"""
......@@ -706,6 +723,7 @@ def local_gpu_gemm(node):
gpu_from_host(gemm) -> gpu_gemm(gpu_from_host)
gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
"""
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
......@@ -1120,7 +1138,10 @@ def local_gpu_shape(node):
@register_opt()
@local_optimizer([tensor.Rebroadcast])
def local_gpu_rebroadcast(node):
'''rebroadcast(host_from_gpu(x)) -> host_from_gpu(rebroadcast(x))'''
"""
rebroadcast(host_from_gpu(x)) -> host_from_gpu(rebroadcast(x))
"""
if isinstance(node.op, tensor.Rebroadcast):
x, = node.inputs
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
......@@ -1342,7 +1363,8 @@ def local_conv_fft_full(node):
def values_eq_approx_high_tol(a, b):
"""This fct is needed to don't have DebugMode raise useless
"""
This function is needed so that DebugMode does not raise useless
errors due to rounding errors.
This happens as we reduce on the two last dimensions, so this
......@@ -1364,6 +1386,7 @@ def local_gpu_conv(node):
gpu_from_host(conv) -> gpu_conv(gpu_from_host)
conv(host_from_gpu) -> host_from_gpu(gpu_conv)
"""
def GpuConvOp_from_ConvOp(op):
logical_img_hw = None
......@@ -1534,7 +1557,10 @@ conv_groupopt.register('local_conv_gemm', local_conv_gemm, 30,
class LocalCudaMetaOptimizer(LocalMetaOptimizer):
"""Base class for CUDA-based LocalMetaOptimizers"""
"""
Base class for CUDA-based LocalMetaOptimizers.
"""
def time_call(self, fn):
# Override time_call() to do device synchronization
......@@ -1827,7 +1853,6 @@ def local_gpu_join(node):
by other opts, leaving us with
host_from_gpu(gpu_join)
For intermediate places in the graph not covered by the first opt, the
following could be useful:
......@@ -1911,8 +1936,12 @@ optdb.register('InplaceGpuBlasOpt',
def get_device_type_sizes():
"""
:return:(gpu ptr size, cpu ptr size, int sizes(gpu and cpu))
:return type: tuple
Returns
-------
tuple
(gpu ptr size, cpu ptr size, int sizes(gpu and cpu)).
"""
if hasattr(get_device_type_sizes, 'rval'):
return get_device_type_sizes.rval
......@@ -1941,7 +1970,7 @@ def get_device_type_sizes():
def max_inputs_to_GpuElemwise(node):
"""
return the maximum number of inputs this GpuElemwise Apply node can
Return the maximum number of inputs this GpuElemwise Apply node can
accept.
This is needed as currently there is a limit of 256 bytes of
......@@ -1950,8 +1979,8 @@ def max_inputs_to_GpuElemwise(node):
2.x (not used).
This measures the number of parameters we put in our GPU function and
computes the maximum number of inputs that respect the 256 byte
limit.
computes the maximum number of inputs that respect the 256 byte limit.
"""
type_sizes = get_device_type_sizes()
int_size = type_sizes['int_size']
......@@ -1986,6 +2015,7 @@ def split_huge_add_or_mul(node):
This should not happen for other GpuElemwise ops, as only the fusion
optimization can generate ops with too many inputs, and it checks for that.
"""
if node.op.scalar_op in (scal.add, scal.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(node)
......@@ -2135,6 +2165,7 @@ def local_gpu_eye(node):
gpu_from_host(eye) -> gpueye(gpu_from_host)
eye(host_from_gpu) -> host_from_gpu(gpueye)
"""
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
......@@ -2167,10 +2198,11 @@ def safe_to_cpu(x):
def gpu_safe_new(x, tag=''):
"""
Internal function that constructs a new variable from x with the same
type, but with a different name ( old name + tag). This function is used
type, but with a different name (old name + tag). This function is used
by gradient, or the R-op to construct new variables for the inputs of
the inner graph such that there is no interference between the original
graph and the newly constructed graph.
"""
if hasattr(x, 'name') and x.name is not None:
nw_name = x.name + tag
......@@ -2188,8 +2220,9 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
"""
Different interface to clone, that allows you to pass inputs.
Compared to clone, this method always replaces the inputs with
new variables of the same type, and returns those ( in the same
new variables of the same type, and returns those (in the same
order as the original inputs).
"""
if tag is None:
tag = ''
......@@ -2217,7 +2250,9 @@ def tensor_to_cuda(x):
def local_gpu_extract_diagonal(node):
"""
extract_diagonal(host_from_gpu()) -> host_from_gpu(extract_diagonal)
gpu_from_host(extract_diagonal) -> extract_diagonal(gpu_from_host)
"""
if (isinstance(node.op, nlinalg.ExtractDiag) and
isinstance(node.inputs[0].type,
......@@ -2249,9 +2284,10 @@ def typeConstructor(broadcastable, dtype):
def gpuScanOptimization(node):
"""
scan(host_from_gpu) -> host_from_gpu(GPUscan)
gpu_from_host(scan) -> GPUscan(gpu_from_host)
"""
"""
# gpu_from_host(scan) -> GPUscan(gpu_from_host)
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
......
"""
Define CURAND_RandomStreams - backed by CURAND
Define CURAND_RandomStreams - backed by CURAND.
"""
__authors__ = "James Bergstra"
......@@ -20,7 +21,8 @@ config = theano.config
class CURAND_Base(GpuOp):
""" Base class for a random number generator implemented in CURAND.
"""
Base class for a random number generator implemented in CURAND.
The random number generator itself is an opaque reference managed by
CURAND. This Op uses a generic-typed shared variable to point to a CObject
......@@ -30,18 +32,23 @@ class CURAND_Base(GpuOp):
The actual random number generator is allocated from the seed, on the first
call to allocate random numbers (see c_code).
:note:
One caveat is that the random number state is simply not serializable.
Consequently, attempts to serialize functions compiled with these
random numbers will fail.
Parameters
----------
output_type
A theano type (e.g. tensor.fvector).
seed: int
destructive
True or False (on the generator)
Notes
-----
One caveat is that the random number state is simply not serializable.
Consequently, attempts to serialize functions compiled with these
random numbers will fail.
"""
def __init__(self, output_type, seed, destructive):
"""
output_type: a theano type (e.g. tensor.fvector)
seed: integer
destructive: True or False (on the generator)
"""
theano.gof.Op.__init__(self)
self.destructive = destructive
self.seed = seed
......@@ -51,11 +58,17 @@ class CURAND_Base(GpuOp):
assert output_type.dtype == "float32"
def as_destructive(self):
    """
    Return a destructive version of self.

    Returns
    -------
    instance of the same class
        A new Op with the same output type and seed, but built with
        destructive=True.
    """
    return self.__class__(self.output_type, self.seed, destructive=True)
def _config(self):
"""Return a tuple of attributes that define the Op"""
"""
Return a tuple of attributes that define the Op.
"""
return (
self.destructive,
self.output_type,
......@@ -81,7 +94,7 @@ class CURAND_Base(GpuOp):
"""
Return a symbolic sample from generator.
cls dictates the random variable (e.g. uniform, normal)
cls dictates the random variable (e.g. uniform, normal).
"""
v_size = theano.tensor.as_tensor_variable(size)
......@@ -237,8 +250,11 @@ class CURAND_Base(GpuOp):
class CURAND_Normal(CURAND_Base):
"""Op to draw normal numbers using CURAND
"""
Op to draw normal numbers using CURAND.
"""
def _curand_call_str(self, **kwargs):
return """curandGenerateNormal(*gen,
CudaNdarray_DEV_DATA(%(o_sample)s),
......@@ -248,8 +264,11 @@ class CURAND_Normal(CURAND_Base):
class CURAND_Uniform(CURAND_Base):
"""Op to draw uniform numbers using CURAND
"""
Op to draw uniform numbers using CURAND.
"""
def _curand_call_str(self, **kwargs):
return """ curandGenerateUniform(*gen,
CudaNdarray_DEV_DATA(%(o_sample)s),
......@@ -262,24 +281,31 @@ class CURAND_RandomStreams(object):
RandomStreams instance that creates CURAND-based random variables.
One caveat is that generators are not serializable.
Parameters
----------
seed : int
"""
def __init__(self, seed):
    """
    Parameters
    ----------
    seed : int
        Starting seed; `next_seed` derives a distinct seed for each
        random variable from it.
    """
    self._start_seed = seed
    self._cur_seed = seed
    self._has_lost_states = False  # True if self.state_updates incomplete
    self.state_updates = []
def updates(self):
    """
    Return the list of all (old, new) generator update pairs created by
    this instance.

    A shallow copy is returned, so mutating the result does not affect
    the internal ``state_updates`` list.
    """
    return list(self.state_updates)
def next_seed(self):
    """
    Return a unique seed for initializing a random variable.

    Each call increments the internal counter, so successive calls
    return distinct consecutive values.
    """
    self._cur_seed += 1
    return self._cur_seed - 1
......@@ -295,6 +321,7 @@ class CURAND_RandomStreams(object):
dtype=config.floatX):
"""
Return symbolic tensor of uniform numbers.
"""
if isinstance(size, tuple):
msg = "size must be a tuple of int or a Theano variable"
......@@ -321,8 +348,12 @@ class CURAND_RandomStreams(object):
"""
Return symbolic tensor of normally-distributed numbers.
:param: size: Can be a list of integer or Theano variable(ex: the shape
Parameters
----------
size
Can be a list of integer or Theano variable (ex: the shape
of other Theano Variable)
"""
if isinstance(size, tuple):
msg = "size must be a tuple of int or a Theano variable"
......
"""Provide CudaNdarrayType
"""
Provide CudaNdarrayType.
"""
from __future__ import print_function
import os
......@@ -31,36 +33,47 @@ class CudaNdarrayType(Type):
dtype = 'float32'
Variable = None
""" This will be set to the Variable type corresponding to this class.
"""
This will be set to the Variable type corresponding to this class.
That variable type is `CudaNdarrayVariable` defined in the
``var.py`` file beside this one.
:note: The var file depends on the file basic_ops.py, which
depends on this file. A cyclic dependency is avoided by not
hardcoding ``Variable = CudaNdarrayVariable``.
Notes
-----
The var file depends on the file basic_ops.py, which depends on this file.
A cyclic dependency is avoided by not hardcoding
``Variable = CudaNdarrayVariable``.
"""
Constant = None
""" This will be set to `CudaNdarrayConstant` defined in ``var.py``
"""
This will be set to `CudaNdarrayConstant` defined in ``var.py``.
:note:
Notes
-----
The var file depends on the file basic_ops.py, which depends on this file.
A cyclic dependency is avoided by not hardcoding this class.
"""
SharedVariable = None
""" This will be set to `CudaNdarraySharedVariable` defined in ``var.py``
"""
This will be set to `CudaNdarraySharedVariable` defined in ``var.py``.
:note:
Notes
-----
The var file depends on the file basic_ops.py, which depends on this file.
A cyclic dependency is avoided by not hardcoding this class.
"""
if cuda is not None:
value_zeros = staticmethod(cuda.CudaNdarray.zeros)
"""
Create an CudaNdarray full of 0 values
Create a CudaNdarray full of 0 values.
"""
def __init__(self, broadcastable, name=None, dtype=None):
......@@ -120,11 +133,13 @@ class CudaNdarrayType(Type):
data)
def filter_variable(self, other, allow_convert=True):
"""Convert a Variable into a CudaNdarrayType, if compatible.
"""
Convert a Variable into a CudaNdarrayType, if compatible.
This Variable should either already be a CudaNdarrayType, or be
a TensorType. It has to have the right number of dimensions,
broadcastable pattern, and dtype.
"""
if hasattr(other, '_as_CudaNdarrayVariable'):
other = other._as_CudaNdarrayVariable()
......@@ -209,10 +224,12 @@ class CudaNdarrayType(Type):
)
def dtype_specs(self):
"""Return a tuple (python type, c type, numpy typenum) that
corresponds to self.dtype.
"""
Return a tuple (python type, c type, numpy typenum) that corresponds
to self.dtype.
This function is used internally as part of C code generation.
"""
# TODO: add more type correspondances for e.g. int32, int64, float32,
# complex64, etc.
......@@ -236,7 +253,10 @@ class CudaNdarrayType(Type):
self.__class__.__name__, self.dtype))
def __eq__(self, other):
    """
    Compare True iff other is the same kind of CudaNdarrayType
    (same class and same broadcastable pattern).
    """
    return (type(self) == type(other) and
            other.broadcastable == self.broadcastable)
......@@ -248,12 +268,16 @@ class CudaNdarrayType(Type):
return theano.tensor.patternbroadcast(var, self.broadcastable)
def __hash__(self):
    """
    Hash consistently with ``__eq__``: equal hashes for the same kinds
    of CudaNdarrayType (same class and broadcastable pattern).
    """
    return hash(type(self)) ^ hash(self.broadcastable)
ndim = property(lambda self: len(self.broadcastable),
doc="number of dimensions")
"""Number of dimensions
"""
Number of dimensions.
This read-only property is the preferred way to get the number of
dimensions of a `CudaNdarrayType`.
......@@ -261,12 +285,14 @@ class CudaNdarrayType(Type):
"""
def make_variable(self, name=None):
    """
    Return a `TensorVariable` of this type.

    Parameters
    ----------
    name : str
        A pretty name to identify this `Variable` when printing and
        debugging.
    """
    return self.Variable(self, name=name)
......@@ -381,7 +407,9 @@ class CudaNdarrayType(Type):
return sio.getvalue()
def c_extract_out(self, name, sub, check_input=True, check_broadcast=True):
""" To allow the hack to skip check_broadcast.
"""
To allow the hack to skip check_broadcast.
"""
return """
if (py_%(name)s == Py_None)
......@@ -411,7 +439,10 @@ class CudaNdarrayType(Type):
""" % locals()
def c_sync(self, name, sub):
"""Override `CLinkerOp.c_sync` """
"""
Override `CLinkerOp.c_sync`.
"""
return """
//std::cerr << "sync\\n";
if (NULL == %(name)s) {
......@@ -433,11 +464,17 @@ class CudaNdarrayType(Type):
""" % locals()
def c_headers(self):
    """
    Override `CLinkerOp.c_headers`.

    Returns
    -------
    list of str
        Header files required by the generated C code.
    """
    return ['cuda_ndarray.cuh']
def c_header_dirs(self):
"""Override `CLinkerOp.c_headers` """
"""
Override `CLinkerOp.c_headers`.
"""
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
......
......@@ -19,15 +19,18 @@ except ImportError:
class _operators(tensor.basic._tensor_py_operators):
"""Define a few properties and conversion methods for CudaNdarray Variables.
"""
Define a few properties and conversion methods for CudaNdarray Variables.
The default implementation of arithemetic operators is to build graphs of TensorType
variables.
The default implementation of arithmetic operators is to build graphs of
TensorType variables.
The optimization pass (specialization) will insert pure GPU implementations.
This approach relieves the Cuda-Ops of having to deal with input argument checking and
gradients.
This approach relieves the Cuda-Ops of having to deal with input argument
checking and gradients.
"""
def _as_TensorVariable(self):
    # Wrap self in HostFromGpu so this CudaNdarray variable can be used
    # where a host TensorType variable is expected.
    return HostFromGpu()(self)
def _as_CudaNdarrayVariable(self):
......@@ -63,7 +66,8 @@ CudaNdarrayType.Constant = CudaNdarrayConstant
class CudaNdarraySharedVariable(_operators, SharedVariable):
"""
Shared Variable interface to CUDA-allocated arrays
Shared Variable interface to CUDA-allocated arrays.
"""
get_value_return_ndarray = True
......@@ -72,20 +76,23 @@ class CudaNdarraySharedVariable(_operators, SharedVariable):
"""
Return the value of this SharedVariable's internal array.
:param borrow:
permit the return of internal storage, when used in conjunction with
``return_internal_type=True``
:param return_internal_type:
True to return the internal ``cuda_ndarray`` instance rather than a ``numpy.ndarray``
(Default False)
Parameters
----------
borrow
Permit the return of internal storage, when used in conjunction with
``return_internal_type=True``.
return_internal_type
True to return the internal ``cuda_ndarray`` instance rather than a
``numpy.ndarray`` (Default False).
By default ``get_value()`` copies from the GPU to a ``numpy.ndarray`` and returns that
host-allocated array.
By default ``get_value()`` copies from the GPU to a ``numpy.ndarray``
and returns that host-allocated array.
``get_value(False,True)`` will return a GPU-allocated copy of the original GPU array.
``get_value(False,True)`` will return a GPU-allocated copy of the
original GPU array.
``get_value(True,True)`` will return the original GPU-allocated array without any
copying.
``get_value(True,True)`` will return the original GPU-allocated array
without any copying.
"""
if return_internal_type or not self.get_value_return_ndarray:
......@@ -101,33 +108,39 @@ class CudaNdarraySharedVariable(_operators, SharedVariable):
"""
Assign `value` to the GPU-allocated array.
:param borrow: ``True`` permits reusing `value` itself, ``False`` requires that this function
copies `value` into internal storage.
:note:
Parameters
----------
borrow : bool
``True`` permits reusing `value` itself, ``False`` requires that
this function copies `value` into internal storage.
Prior to Theano 0.3.1, set_value did not work in-place on the GPU. This meant that sometimes,
GPU memory for the new value would be allocated before the old memory was released. If you're
running near the limits of GPU memory, this could cause you to run out of GPU memory.
Notes
-----
Prior to Theano 0.3.1, set_value did not work in-place on the GPU. This
meant that sometimes, GPU memory for the new value would be allocated
before the old memory was released. If you're running near the limits of
GPU memory, this could cause you to run out of GPU memory.
Beginning with Theano 0.3.1, set_value will work in-place on the GPU, if the following conditions
are met:
Beginning with Theano 0.3.1, set_value will work in-place on the GPU, if
the following conditions are met:
* The destination on the GPU must be c_contiguous.
* The source is on the CPU.
* The old value must have the same dtype as the new value (which is a given for now,
since only float32 is supported).
* The old value must have the same dtype as the new value (which is
a given for now, since only float32 is supported).
* The old and new value must have the same shape.
* The old value is being completely replaced by the new value (not partially modified,
e.g. by replacing some subtensor of it).
* You change the value of the shared variable via set_value, not via the .value
accessors. You should not use the .value accessors anyway, since they will soon be
deprecated and removed.
* The old value is being completely replaced by the new value (not
partially modified, e.g. by replacing some subtensor of it).
* You change the value of the shared variable via set_value, not via
the .value accessors. You should not use the .value accessors
anyway, since they will soon be deprecated and removed.
It is also worth mentioning that, for efficient transfer to the GPU,
Theano will make the new data ``c_contiguous``. This can require an
extra copy of the data on the host.
It is also worth mentioning that, for efficient transfer to the GPU, Theano will make the new data
``c_contiguous``. This can require an extra copy of the data on the host.
The inplace on gpu memory work when borrow is either True or False.
The in-place update on GPU memory works when borrow is either True or False.
"""
if not borrow:
# TODO: check for cuda_ndarray type
......@@ -147,8 +160,10 @@ CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable
def cuda_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False, broadcastable=None):
"""SharedVariable Constructor for CudaNdarrayType"""
"""
SharedVariable Constructor for CudaNdarrayType.
"""
# THIS CONSTRUCTOR TRIES TO CAST VALUE TO A FLOAT32, WHICH THEN GOES ONTO THE CARD
# SO INT shared vars, float64 shared vars, etc. all end up on the card.
# THIS IS NOT THE DEFAULT BEHAVIOUR THAT WE WANT.
......@@ -179,7 +194,11 @@ def cuda_shared_constructor(value, name=None, strict=False,
def float32_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False, broadcastable=None):
"""SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or CudaNdarray"""
"""
SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or
CudaNdarray.
"""
if theano.sandbox.cuda.use.device_number is None:
theano.sandbox.cuda.use("gpu",
force=True,
......
......@@ -47,7 +47,9 @@ def as_gpuarray(x):
class Kernel(object):
"""
This class groups together all the attributes of a gpu kernel.
"""
def __init__(self, code, params, name, flags,
codevar=None, binvar=None, objvar=None):
self.code = code
......@@ -113,9 +115,9 @@ class Kernel(object):
class GpuKernelBase(object):
def gpu_kernels(self, node, name):
    """
    This is the method to override. This should return an iterable of
    Kernel objects that describe the kernels this op will need.

    Raises
    ------
    MethodNotDefined
        Always, in this base implementation; subclasses must override.
    """
    raise MethodNotDefined('gpu_kernels')
......@@ -552,13 +554,20 @@ cuda_from_gpu = CudaFromGpu()
class GpuAlloc(HideC, Alloc):
"""
Parameters
----------
memset_0
Optimization flag only. If True, the value is always 0, so the C code
calls memset, which is faster.
"""
__props__ = ('memset_0',)
_f16_ok = True
def __init__(self, memset_0=False):
    """
    Parameters
    ----------
    memset_0 : bool
        Optimization flag only. If True, the fill value is always 0, so
        the C code can call memset, which is faster.
    """
    self.memset_0 = memset_0
def __str__(self):
......@@ -777,7 +786,9 @@ class GpuContiguous(Op):
"""
Always return a c contiguous output. Copy the input only if it is
not already c contiguous.
"""
__props__ = ()
view_map = {0: [0]}
_f16_ok = True
......@@ -835,7 +846,9 @@ gpu_contiguous = GpuContiguous()
class GpuReshape(HideC, tensor.Reshape):
"""
Implement Reshape on the gpu.
"""
_f16_ok = True
# __hash__, __eq__, __str__ come from tensor.Reshape
......@@ -951,6 +964,7 @@ class GpuReshape(HideC, tensor.Reshape):
class GpuJoin(HideC, Join):
_f16_ok = True
def make_node(self, axis, *tensors):
......
......@@ -16,6 +16,7 @@ class NVCC_compiler(NVCC_base):
"""
Re-implementation of compile_args that does not create an
additional context on the GPU.
"""
flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
if config.nvcc.fastmath:
......
......@@ -12,7 +12,48 @@ from .basic_ops import as_gpuarray_variable
class GpuConv(gof.Op):
"""
Implement the batched and stacked 2d convolution on the gpu.
Parameters
----------
version
Each version of c_code implements many kernels for the convolution.
By default we try to guess the best one. You can force one version with
this parameter. This parameter is used by the tests.
direction_hint
'forward', 'bprop weights' or 'bprop inputs'. Serves as a hint for graph
optimizers replacing GpuConv by other implementations. If the GpuConv is
inserted automatically, we take its value from ConvOp.
verbose
For values of 1, 2 and 3, print more information during the execution of
the convolution. Mostly used for optimization or debugging.
kshp
The size of the kernel. If provided, can generate faster code. If the
GpuConv op is automatically inserted, we take its value automatically
from the Conv op.
imshp
The size of the image. Not used for code generation but allows to select
an experimental new version in another repo.
max_threads_dim0
The maximum number of threads for the block size dimensions 0
(blockDim.x) used by the GPU function.
nkern
The number of kernels. Not used for this op, but can be used by graph
optimizers to select a more optimal convolution implementation. If the
GpuConv op is inserted automatically, we take its value from the Conv
op.
bsize
The batch size. Not used for this op, but can be used by graph
optimizers to select a more optimal convolution implementation. If the
GpuConv op is inserted automatically, we take its value from the Conv
op.
fft_opt
Deactivate fft_opt optimization at the op level when set to False. Note
that by default fft optimizations aren't enabled.
See :ref:`convolution documentation <libdoc_tensor_nnet_conv>` to enable
them.
"""
@staticmethod
def logical_output_shape_2d(imshp, kshp, mode):
if mode == 'valid':
......@@ -35,43 +76,6 @@ class GpuConv(gof.Op):
nkern=None,
bsize=None,
fft_opt=True):
"""
:param version: each version of c_code implements many kernels for the
convolution. By default we try to guess the best one.
You can force one version with this parameter. This
parameter is used by the tests.
:param direction_hint: 'forward', 'bprop weights' or 'bprop inputs'.
Serves as a hint for graph optimizers replacing
GpuConv by other implementations. If the GpuConv is
inserted automatically, we take its value from ConvOp.
:param verbose: for value of 1,2 and 3. Print more information during
the execution of the convolution. Mostly used for
optimization or debugging.
:param kshp: The size of the kernel. If provided, can generate
faster code. If the GpuConv op is automatically
inserted,
we take its value automatically from the Conv op.
:param imshp: The size of the image. Not used for code generation but
allows to select an experimental new version in another
repo.
:param max_threads_dim0: The maximum number of threads for the
block size dimensions 0 (blockDim.x) used by the
GPU function.
:param nkern: The number of kernels. Not used for this op, but can be
used by graph optimizers to select a more optimal
convolution implementation. If the GpuConv op is inserted
automatically, we take its value from the Conv op.
:param bsize: The batch size. Not used for this op, but can be
used by graph optimizers to select a more optimal
convolution implementation. If the GpuConv op is inserted
automatically, we take its value from the Conv op.
:param fft_opt: deactivate fft_opt optimization at the op level when
set to False. Note that by default fft optimization
aren't enabled. See
:ref:`convolution documentation <libdoc_tensor_nnet_conv>`
to enable them.
"""
self.border_mode = border_mode
self.subsample = subsample
if logical_img_hw is not None:
......@@ -169,7 +173,10 @@ class GpuConv(gof.Op):
return gof.Apply(self, [img, kern], [out])
def flops(self, inputs, outputs):
""" Useful with the hack in profilemode to print the MFlops"""
"""
Useful with the hack in profilemode to print the MFlops.
"""
images, kerns = inputs
out, = outputs
assert images[1] == kerns[1]
......
......@@ -160,6 +160,7 @@ def c_set_tensor4d(var, desc, err, fail):
class DnnBase(COp):
"""
Creates a handle for cudnn and pulls in the cudnn libraries and headers.
"""
# dnn does not know about broadcasting, so we do not need to assert
# the input broadcasting pattern.
......@@ -230,11 +231,11 @@ class DnnVersion(Op):
def version():
"""return the current cuDNN version we compile with.
"""
Return the current cuDNN version we compile with.
This return a tuple with the header version and the library
version we link with. For older cudnn version without version
information, we return -1.
This returns a tuple with the header version and the library version we link
with. For older cudnn version without version information, we return -1.
"""
if not dnn_available():
......@@ -252,12 +253,14 @@ version.v = None
class GpuDnnConvDesc(Op):
"""This Op builds a convolution descriptor for use in the other
convolution operations.
"""
This Op builds a convolution descriptor for use in the other convolution
operations.
see the doc of :func:`dnn_conv` for a description of the parameters
See the doc of :func:`dnn_conv` for a description of the parameters
"""
__props__ = ('border_mode', 'subsample', 'conv_mode')
def c_headers(self):
......@@ -408,17 +411,21 @@ class GpuDnnConv(DnnBase, COp):
"""
The forward convolution.
:param image:
:param kernel:
:param descr: the convolution descriptor
Parameters
----------
image
kernel
descr
The convolution descriptor.
workmem
Either 'none', 'small' or 'large'. Default is the value of
:attr:`config.dnn.conv.workmem`.
"""
__props__ = ('workmem', 'inplace')
def __init__(self, workmem=None, inplace=False):
"""
:param workmem: either 'none', 'small' or 'large'. Default is
the value of :attr:`config.dnn.conv.workmem`.
"""
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)")
if workmem is None:
......@@ -489,8 +496,9 @@ class GpuDnnConv(DnnBase, COp):
def get_out_shape(ishape, kshape, border_mode, subsample):
"""
This function computes the output shape for a convolution with
the specified parameters. `ishape` and `kshape` can be symbolic
the specified parameters. `ishape` and `kshape` can be symbolic
or scalar.
"""
b = ishape[0] # Number of inputs
h = ishape[2] # Height of input feature maps
......@@ -524,11 +532,15 @@ class GpuDnnConvGradW(DnnBase, COp):
"""
The convolution gradient with respect to the weights.
:param image:
:param kernel:
:param descr: the convolution descriptor
Parameters
----------
image
kernel
descr
The convolution descriptor.
"""
__props__ = ('inplace',)
def __init__(self, inplace=False):
......@@ -596,11 +608,15 @@ class GpuDnnConvGradI(DnnBase):
"""
The convolution gradient with respect to the inputs.
:param image:
:param kernel:
:param descr: the convolution descriptor
Parameters
----------
image
kernel
descr
The convolution descriptor.
"""
__props__ = ('inplace',)
def __init__(self, inplace=False):
......@@ -667,14 +683,22 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
The memory layout to use is 'bc01', that is 'batch', 'channel',
'first dim', 'second dim' in that order.
:param img: images to do the convolution over
:param kerns: convolution filters
:param border_mode: one of 'valid', 'full'; additionally, the padding size
could be directly specified by an integer or a pair of integers
:param subsample: perform subsampling of the output (default: (1, 1))
:param conv_mode: perform convolution (kernels flipped) or cross-correlation.
One of 'conv', 'cross'. (default: 'conv')
:param direction_hint: Used by graph optimizers to change algorithm choice.
Parameters
----------
img
Images to do the convolution over.
kerns
Convolution filters.
border_mode
One of 'valid', 'full'; additionally, the padding size
could be directly specified by an integer or a pair of integers.
subsample
Perform subsampling of the output (default: (1, 1)).
conv_mode
Perform convolution (kernels flipped) or cross-correlation.
One of 'conv', 'cross' (default: 'conv').
direction_hint
Used by graph optimizers to change algorithm choice.
By default, GpuDnnConv will be used to carry out the convolution.
If border_mode is 'valid', subsample is (1,1) and direction_hint is
'bprop weights', it will use GpuDnnConvGradW.
......@@ -682,15 +706,15 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
*not* 'forward!', it will use GpuDnnConvGradI.
This parameter is used internally by graph optimizers and may be
removed at any time without a deprecation period. You have been warned.
:param workmem: Specify the amount of working memory allowed.
More memory is usually faster. One of 'none', 'small' or
'large'. (default is None which takes its value from
:attr:`config.dnn.conv.workmem`)
workmem
Specify the amount of working memory allowed. More memory is usually
faster. One of 'none', 'small' or 'large' (default is None which takes
its value from :attr:`config.dnn.conv.workmem`).
.. warning:: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
work with this Op.
:warning: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
work with this Op.
"""
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
if (border_mode == 'valid' and subsample == (1, 1) and
......@@ -749,14 +773,21 @@ class GpuDnnPoolDesc(Op):
This Op builds a pooling descriptor for use in the other
pooling operations.
:param ws: windows size
:param stride: (dx, dy)
:param mode: 'max', 'average_inc_pad' or 'average_exc_pad'
The old deprecated name 'average' correspond to 'average_inc_pad'
:param pad: (padX, padY) padding information.
Parameters
----------
ws
Windows size.
stride
(dx, dy).
mode : {'max', 'average_inc_pad', 'average_exc_pad'}
The old deprecated name 'average' corresponds to 'average_inc_pad'.
pad
(padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
"""
__props__ = ('ws', 'stride', 'mode', 'pad')
def c_headers(self):
......@@ -859,9 +890,15 @@ class GpuDnnPool(DnnBase):
"""
Pooling.
:param img: the image 4d tensor.
:param desc: the pooling descriptor.
Parameters
----------
img
The image 4d tensor.
desc
The pooling descriptor.
"""
__props__ = ()
def make_node(self, img, desc):
......@@ -1029,11 +1066,19 @@ class GpuDnnPoolGrad(DnnBase):
"""
The pooling gradient.
:param inp: the input of the pooling.
:param out: the output of the pooling in the forward.
:param inp_grad: same size as out, but is the corresponding gradient information.
:param desc: The pooling descriptor.
Parameters
----------
inp
The input of the pooling.
out
The output of the pooling in the forward.
inp_grad
Same size as out, but is the corresponding gradient information.
desc
The pooling descriptor.
"""
__props__ = ()
def make_node(self, inp, out, inp_grad, desc):
......@@ -1209,19 +1254,28 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
The memory layout to use is 'bc01', that is 'batch', 'channel',
'first dim', 'second dim' in that order.
:param img: images to do the pooling over
:param ws: subsampling window size
:param stride: subsampling stride (default: (1, 1))
:param mode: one of 'max', 'average_inc_pad' or 'average_exc_pad
(default: 'max')
:param pad: (padX, padY) padding information.
Parameters
----------
img
Images to do the pooling over.
ws
Subsampling window size.
stride
Subsampling stride (default: (1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad'}
pad
(padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
:warning: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
work with this Op.
:note: This Op implements the ignore_border=True of max_pool_2d.
.. warning:: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
work with this Op.
Notes
-----
This Op implements the ignore_border=True of max_pool_2d.
"""
img = gpu_contiguous(img)
desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
......@@ -1232,12 +1286,18 @@ class GpuDnnSoftmaxBase(DnnBase):
"""
Op for the cuDNN Softmax.
:param tensor_format: Whether the data format is 'bc01' or 'b01c'.
:param algo: 'fast' or 'accurate' indicating whether computations should be
Parameters
----------
tensor_format
Whether the data format is 'bc01' or 'b01c'.
algo
'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively.
:param mode: 'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per
mode
'instance' or 'channel' indicating whether the softmax should be
computed per image across 'c01' or per spatial location '01' per
image across 'c'.
"""
__props__ = ('tensor_format', 'mode', 'algo')
......@@ -1381,13 +1441,18 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
"""
Op for the cuDNN Softmax.
:param tensor_format: Whether the data format is 'bc01' or 'b01c'.
:param algo: 'fast' or 'accurate' indicating whether computations should be
tensor_format
Whether the data format is 'bc01' or 'b01c'.
algo
'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively.
:param mode: 'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per
mode
'instance' or 'channel' indicating whether the softmax should be
computed per image across 'c01' or per spatial location '01' per
image across 'c'.
"""
direction = 'forward'
softmax_inputs = ['softmax_input']
......@@ -1442,12 +1507,18 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
"""
Op for the cuDNN SoftmaxGrad.
:param tensor_format: Whether the data format is 'bc01' or 'b01c'.
:param algo: 'fast' or 'accurate' indicating whether computations should be
Parameters
----------
tensor_format
Whether the data format is 'bc01' or 'b01c'.
algo
'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively.
:param mode: 'instance' or 'channel' indicating whether the softmax should
mode
'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per
image across 'c'.
"""
direction = 'backward'
softmax_inputs = ['softmax_gout', 'softmax_input']
......@@ -1713,7 +1784,10 @@ def local_softmax_dnn(node):
class NoCuDNNRaise(Optimizer):
def apply(self, fgraph):
""" Raise a RuntimeError if cudnn can't be used"""
"""
Raise a RuntimeError if cudnn can't be used.
"""
if not dnn_available():
# Make an assert error as we want Theano to fail, not
# just skip this optimization.
......
......@@ -470,7 +470,8 @@ class GpuElemwise(HideC, Elemwise):
class SupportCodeError(Exception):
"""
We do not support certain things (such as the C++ complex struct)
We do not support certain things (such as the C++ complex struct).
"""
......@@ -571,14 +572,22 @@ class GpuDimShuffle(HideC, DimShuffle):
class GpuCAReduceCuda(HideC, CAReduceDtype):
"""GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
The dimensions along which to reduce is specified by the
`reduce_mask` that you pass to the constructor. The `reduce_mask`
is a tuple of booleans (actually integers 0 or 1) that specify for
each input dimension, whether to reduce it (1) or not (0).
For example, when scalar_op is a theano.scalar.basic.Add instance:
"""
GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
Parameters
----------
reduce_mask
The dimensions along which to reduce. The `reduce_mask` is a tuple of
booleans (actually integers 0 or 1) that specify for each input
dimension, whether to reduce it (1) or not (0).
pre_scalar_op
If present, must be a scalar op with only 1 input. We will execute it
on the input value before reduction.
Examples
--------
When scalar_op is a theano.scalar.basic.Add instance:
- reduce_mask == (1,) sums a vector to a scalar
......@@ -588,8 +597,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
- reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor.
:note: any reduce_mask of all zeros is a sort of 'copy', and may
be removed during graph optimization
Notes
-----
Any reduce_mask of all zeros is a sort of 'copy', and may be removed during
graph optimization.
This Op is a work in progress.
......@@ -602,9 +613,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
GPUs are not especially well-suited to reduction operations so it is
quite possible that the GPU might be slower for some cases.
pre_scalar_op: if present, must be a scalar op with only 1
input. We will execute it on the input value before reduction.
"""
_f16_ok = True
def __init__(self, scalar_op, axis=None,
......@@ -690,9 +700,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
raise MethodNotDefined("")
def supports_c_code(self, inputs):
""" Returns True if the current op and reduce pattern
has functioning C code """
"""
Returns True if the current op and reduce pattern has functioning C code.
"""
# If we don't even have the right method, we certainly
# don't support the C code
# (This is the test that used to be implemented by
......@@ -871,9 +882,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
return sio.getvalue()
def _makecall(self, node, name, x, z, fail, pattern=None):
"""Return a string for making a kernel call.
"""
Return a string for making a kernel call.
The return value looks something like:
The return value looks something like:
.. code-block:: c
......@@ -972,7 +984,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
def _k_decl(self, node, nodename, pattern=None,
ndim=None, reduce_mask=None):
"""Return a string to declare a kernel function
"""
Return a string to declare a kernel function.
The result will look something like this:
......@@ -989,8 +1002,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(out_dtype)s * Z,
const int sZ0)
Since the nodename is unique, we don't need to put the name
of the scalar_op in here.
Since the nodename is unique, we don't need to put the name
of the scalar_op in here.
"""
in_dtype = "npy_" + node.inputs[0].dtype
......@@ -1057,6 +1070,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
Otherwise, check that the scalar op is maximum or minimum
and return first_item. It should be the first element of the reduction.
As the maximum and minimum of the same value don't change, this work.
"""
if hasattr(self.scalar_op, 'identity'):
return str(self.scalar_op.identity)
......@@ -1084,15 +1098,28 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
def _assign_reduce(self, node, name, left, right, sub, pre):
"""
node: the node argument to this op's c_code
name: the name argument to this op's c_code
left: a C code string identifying an lvalue
right: a C code string identifying an expression
sub: the sub argument to this op's c_code
pre: If True, we will add the pre_scalar_op.c_code
returns C code to reduce left and right, assigning the
result to left."""
Parameters
----------
node
The node argument to this op's c_code.
name
The name argument to this op's c_code.
left
A C code string identifying an lvalue.
right
A C code string identifying an expression.
sub
The sub argument to this op's c_code.
pre
If True, we will add the pre_scalar_op.c_code.
Returns
-------
str
C code to reduce left and right, assigning the result to left.
"""
x, = node.inputs
in_dtype = x.dtype
......@@ -1125,8 +1152,11 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
"""
WRITEME
node, name, sub: these should be passed through from the original
call to c_code
Parameters
----------
node, name, sub
These should be passed through from the original call to c_code.
"""
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
......@@ -1274,9 +1304,11 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
"""
WRITEME
IG: I believe, based on how this is called in c_code, that it
is for the case where we are reducing on all axes and x is
C contiguous.
"""
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
......@@ -1366,8 +1398,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
"""
:param N: the number of 1 in the pattern N=1 -> 01, N=2 -> 011 N=3 ->0111
Work for N=1,2,3
Parameters
----------
N
The number of 1s in the pattern: N=1 -> 01, N=2 -> 011, N=3 -> 0111.
Works for N=1, 2, 3.
"""
assert N in [1, 2, 3]
......@@ -2552,11 +2589,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
"""CAReduce that reuse the python code from gpuarray.
"""
CAReduce that reuse the python code from gpuarray.
Too slow for now as it only have a python interface.
"""
def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
if not hasattr(scalar_op, 'identity'):
raise ValueError("No identity on scalar op")
......
""" Helper routines for generating gpu kernels for nvcc.
"""
Helper routines for generating gpu kernels for nvcc.
def nvcc_kernel(name, params, body):
"""Return the c code of a kernel function.
"""
:param params: the parameters to the function as one or more strings
def nvcc_kernel(name, params, body):
"""
Return the c code of a kernel function.
:param body: the [nested] list of statements for the body of the
function. These will be separated by ';' characters.
Parameters
----------
params
The parameters to the function as one or more strings.
body
The [nested] list of statements for the body of the function.
These will be separated by ';' characters.
"""
paramstr = ', '.join(params)
......@@ -28,7 +34,10 @@ def nvcc_kernel(name, params, body):
def code_version(version):
"""decorator to support version-based cache mechanism"""
"""
Decorator to support version-based cache mechanism.
"""
if not isinstance(version, tuple):
raise TypeError('version must be tuple', version)
......@@ -42,22 +51,31 @@ UNVERSIONED = ()
@code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn):
"""Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer
:param pos: index of executing thread
:param count: number of executing threads
:param manner_fn: a function that accepts strings of arguments a
and b, and returns c code for their reduction. (Example:
return "%(a)s + %(b)s" for a sum reduction).
"""
Return C++ code for a function that reduces a contiguous buffer.
Parameters
----------
N
Length of the buffer.
buf
Buffer pointer.
pos
Index of executing thread.
count
Number of executing threads.
manner_fn
A function that accepts strings of arguments a and b, and returns C code
for their reduction.
Example: return "%(a)s + %(b)s" for a sum reduction.
:postcondition:
This function leaves the answer in position 0 of the buffer. The
This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times.
Notes
-----
buf should be in gpu shared memory, we access it many times.
"""
loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
......@@ -126,19 +144,28 @@ def inline_reduce_prod(N, buf, pos, count):
def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
"""
:param N: length of the buffer
:param threadPos: index of executing thread
:param threadCount: number of executing threads
:param dtype: dtype of the softmax's output
Parameters
----------
N
Length of the buffer.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
dtype
Dtype of the softmax's output.
:Precondition: buf and buf2 contain two identical copies of the input
to softmax
:Postcondition: buf contains the softmax, buf2 contains un-normalized
softmax
:note: buf and buf2 should be in gpu shared memory, we access it many times
Notes
-----
Precondition: buf and buf2 contain two identical copies of the input
to softmax.
Postcondition: buf contains the softmax, buf2 contains the un-normalized
softmax.
buf and buf2 should be in gpu shared memory, we access it many times.
We use __i as an int variable in a loop.
:note2: We use __i as an int variable in a loop
"""
return [
# get max of buf (trashing all but buf[0])
......@@ -169,31 +196,48 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_fn, manner_init,
b='', stride_b='', load_b='', dtype='float32'):
"""Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(dtype)
:param x: input data
:param stride_x: input data stride
:param load_x: wrapper to read from x
:param pos: index of executing thread
:param count: number of executing threads
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:param load_b: Optional, wrapper to read from b if b is provided
:param dtype: Optional, the dtype of the output
:param manner_fn: a function that accepts strings of arguments a
and b, and returns c code for their reduction. (Example:
return "%(a)s + %(b)s" for a sum reduction).
:param manner_init: a function that accepts strings of arguments a
and return c code for its initialization
"""
Return C++ code for a function that reduces a contiguous buffer.
Parameters
----------
N
Length of the buffer.
buf
Buffer pointer of size warpSize * sizeof(dtype).
x
Input data.
stride_x
Input data stride.
load_x
Wrapper to read from x.
pos
Index of executing thread.
count
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
manner_fn
A function that accepts strings of arguments a and b, and returns C code
for their reduction.
Example: return "%(a)s + %(b)s" for a sum reduction.
manner_init
A function that accepts strings of arguments a and return c code for its
initialization.
:postcondition:
This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function.
This function leaves the answer in position 0 of the buffer. The rest of the
buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times.
Notes
-----
buf should be in gpu shared memory, we access it many times.
"""
if b:
......@@ -270,28 +314,47 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
dtype="float32"):
"""
:param N: length of the buffer, atleast waprSize(32).
:param buf: a shared memory buffer of size warpSize * sizeof(dtype)
:param x: a ptr to the gpu memory where the row is stored
:param stride_x: the stride between each element in x
:param load_x: wrapper to read from x
:param sm: a ptr to the gpu memory to store the result
:param sm_stride: the stride between eash sm element
:param write_sm: wrapper before writing to sm
:param threadPos: index of executing thread
:param threadCount: number of executing threads
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:param load_b: Optional, wrapper to read from b if b is provided
:param dtype: Optional, the dtype of the softmax's output if not float32
Parameters
----------
N
Length of the buffer, at least warpSize (32).
buf
A shared memory buffer of size warpSize * sizeof(dtype).
x
A ptr to the gpu memory where the row is stored.
stride_x
The stride between each element in x.
load_x
Wrapper to read from x.
sm
A ptr to the gpu memory to store the result.
sm_stride
The stride between each sm element.
write_sm
Wrapper before writing to sm.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the softmax's output if not float32.
:Precondition: buf is empty
:Postcondition: buf[0] contains the softmax,
buf2 contains un-normalized softmax
:Postcondition: buf[0] contains the softmax, buf2 contains un-normalized
softmax
Notes
-----
Precondition: buf is empty.
buf should be in gpu shared memory, we access it many times.
:note: buf should be in gpu shared memory, we access it many times.
We use tx as an int variable in a loop.
:note2: We use tx as an int variable in a loop
"""
ret = [
# get max of buf (trashing all but buf[0])
......
......@@ -22,7 +22,9 @@ from .fp16_help import work_dtype, load_w, write_w
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
"""
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
"""
nin = 3
nout = 3
__props__ = ()
......@@ -276,8 +278,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
"""
Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
Gradient wrt x of the CrossentropySoftmax1Hot Op
Gradient wrt x of the CrossentropySoftmax1Hot Op.
"""
nin = 3
nout = 1
__props__ = ()
......@@ -462,7 +466,9 @@ gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasD
class GpuSoftmax (Op):
"""
Implement Softmax on the gpu.
"""
__props__ = ()
_f16_ok = True
......@@ -651,7 +657,9 @@ gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (Op):
"""
Implement SoftmaxWithBias on the gpu.
"""
nin = 2
nout = 1
__props__ = ()
......
......@@ -89,7 +89,9 @@ def safe_to_cpu(x):
def op_lifter(OP, cuda_only=False):
"""
OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)
"""
def f(maker):
def local_opt(node):
......@@ -122,7 +124,10 @@ def op_lifter(OP, cuda_only=False):
class InputToGpuOptimizer(Optimizer):
"Transfer the input to the gpu to start the rolling wave."
"""
Transfer the input to the gpu to start the rolling wave.
"""
def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate())
......@@ -173,6 +178,7 @@ def local_gpuaalloc2(node):
Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...)
Moves an alloc that is an input to join to the gpu.
"""
if (isinstance(node.op, tensor.Alloc) and
all(c != 'output' and
......@@ -654,6 +660,7 @@ def local_gpu_conv(node):
gpu_from_host(conv) -> gpu_conv(gpu_from_host)
conv(host_from_gpu) -> host_from_gpu(gpu_conv)
"""
def GpuConvOp_from_ConvOp(op):
logical_img_hw = None
......@@ -698,7 +705,8 @@ def local_gpu_conv(node):
return ret
def values_eq_approx(a, b):
"""This fct is needed to don't have DebugMode raise useless
"""
This function is needed so that DebugMode does not raise useless
errors due to rounding error.
This happens as we reduce on the two last dimensions, so this
......@@ -736,7 +744,10 @@ register_opt()(conv_groupopt)
@register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda])
def local_gpu_elemwise_careduce(node):
""" Merge some GpuCAReduceCuda and GPUElemwise"""
"""
Merge some GpuCAReduceCuda and GPUElemwise.
"""
if (isinstance(node.op, GpuCAReduceCuda) and
node.op.pre_scalar_op is None and
node.inputs[0].owner and
......@@ -767,10 +778,11 @@ def tensor_to_gpu(x):
def gpu_safe_new(x, tag=''):
"""
Internal function that constructs a new variable from x with the same
type, but with a different name ( old name + tag). This function is used
type, but with a different name (old name + tag). This function is used
by gradient, or the R-op to construct new variables for the inputs of
the inner graph such that there is no interference between the original
graph and the newly constructed graph.
"""
if hasattr(x, 'name') and x.name is not None:
nw_name = x.name + tag
......@@ -788,8 +800,9 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
"""
Different interface to clone, that allows you to pass inputs.
Compared to clone, this method always replaces the inputs with
new variables of the same type, and returns those ( in the same
new variables of the same type, and returns those (in the same
order as the original inputs).
"""
if tag is None:
tag = ''
......
......@@ -163,12 +163,16 @@ class GpuIncSubtensor(IncSubtensor):
"""
Implement IncSubtensor on the gpu.
Note: The optimization to make this inplace is in tensor/opt.
The same optimization handles IncSubtensor and GpuIncSubtensor.
This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
The helper methods like do_type_checking, copy_of_x, etc. specialize
the c_code for this Op.
Notes
-----
The optimization to make this inplace is in tensor/opt.
The same optimization handles IncSubtensor and GpuIncSubtensor.
This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
The helper methods like do_type_checking, copy_of_x, etc. specialize
the c_code for this Op.
"""
@property
def _f16_ok(self):
return self.iadd_node.op._f16_ok
......@@ -256,8 +260,10 @@ class GpuIncSubtensor(IncSubtensor):
return d
def do_type_checking(self, node):
""" Should raise NotImplementedError if c_code does not support
"""
Should raise NotImplementedError if c_code does not support
the types involved in this node.
"""
if not isinstance(node.inputs[0].type, GpuArrayType):
......@@ -265,13 +271,22 @@ class GpuIncSubtensor(IncSubtensor):
def copy_of_x(self, x):
"""
:param x: a string giving the name of a C variable
pointing to an array
:return: C code expression to make a copy of x
Parameters
----------
x
A string giving the name of a C variable pointing to an array.
Returns
-------
str
C code expression to make a copy of x.
Notes
-----
Base class uses `PyArrayObject *`, subclasses may override for
different types of arrays.
Base class uses `PyArrayObject *`, subclasses may override for
different types of arrays.
"""
return """pygpu_copy(%(x)s, GA_ANY_ORDER)""" % locals()
......@@ -279,13 +294,18 @@ class GpuIncSubtensor(IncSubtensor):
return "PyGpuArrayObject* zview = NULL;"
def make_view_array(self, x, view_ndim):
"""//TODO
:param x: a string identifying an array to be viewed
:param view_ndim: a string specifying the number of dimensions
to have in the view
"""
//TODO
Parameters
----------
x
A string identifying an array to be viewed.
view_ndim
A string specifying the number of dimensions to have in the view.
This doesn't need to actually set up the view with the
right indexing; we'll do that manually later.
"""
ret = """
size_t dims[%(view_ndim)s];
......@@ -305,18 +325,29 @@ class GpuIncSubtensor(IncSubtensor):
return ret
def get_helper_c_code_args(self):
""" Return a dictionary of arguments to use with helper_c_code"""
"""
Return a dictionary of arguments to use with helper_c_code.
"""
return {'c_prefix': 'PyGpuArray',
'strides_mul': 1
}
def copy_into(self, view, source):
"""
view: string, C code expression for an array
source: string, C code expression for an array
returns a C code expression to copy source into view, and
return 0 on success
Parameters
----------
view : string
C code expression for an array.
source : string
C code expression for an array.
Returns
-------
str
C code expression to copy source into view, and 0 on success.
"""
return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()
......@@ -365,7 +396,9 @@ class GpuIncSubtensor(IncSubtensor):
class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
"""
Implement AdvancedIncSubtensor1 on the gpu.
"""
def make_node(self, x, y, ilist):
x_ = as_gpuarray_variable(x)
y_ = as_gpuarray_variable(y)
......@@ -454,9 +487,12 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
"""Implement AdvancedIncSubtensor1 on the gpu, but use function
"""
Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent.
"""
_f16_ok = True
def make_node(self, x, y, ilist):
......
......@@ -217,10 +217,12 @@ class GpuArrayType(Type):
return (hash(self.typecode) ^ hash(self.broadcastable))
def dtype_specs(self):
"""Return a tuple (python type, c type, numpy typenum) that corresponds
"""
Return a tuple (python type, c type, numpy typenum) that corresponds
to self.dtype.
This function is used internally as part of C code generation.
"""
# TODO: add more type correspondances for e.g. int32, int64, float32,
# complex64, etc.
......@@ -386,7 +388,10 @@ GpuArrayType.SharedVariable = GpuArraySharedVariable
def gpuarray_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False,
broadcastable=None):
"""SharedVariable constructor for GpuArrayType"""
"""
SharedVariable constructor for GpuArrayType.
"""
if not isinstance(value, (numpy.ndarray, pygpu.gpuarray.GpuArray)):
raise TypeError('ndarray or GpuArray required')
......
......@@ -64,7 +64,7 @@ except ImportError:
class Hint(Op):
"""
Provide arbitrary information to the optimizer
Provide arbitrary information to the optimizer.
These ops are removed from the graph during canonicalization
in order to not interfere with other optimizations.
......@@ -122,7 +122,7 @@ def remove_hint_nodes(node):
class HintsFeature(object):
"""
FunctionGraph Feature to track matrix properties
FunctionGraph Feature to track matrix properties.
This is a similar feature to variable 'tags'. In fact, tags are one way
to provide hints.
......@@ -209,8 +209,12 @@ class HintsFeature(object):
class HintsOptimizer(Optimizer):
"""Optimizer that serves to add HintsFeature as an fgraph feature.
"""
Optimizer that serves to add HintsFeature as an fgraph feature.
"""
def __init__(self):
Optimizer.__init__(self)
......@@ -231,6 +235,7 @@ def psd(v):
"""
Apply a hint that the variable `v` is positive semi-definite, i.e.
it is a symmetric matrix and :math:`x^T A x \ge 0` for any vector x.
"""
return Hint(psd=True, symmetric=True)(v)
......@@ -294,6 +299,7 @@ def tag_solve_triangular(node):
"""
If a general solve() is applied to the output of a cholesky op, then
replace it with a triangular solve.
"""
if node.op == solve:
if node.op.A_structure == 'general':
......@@ -396,12 +402,13 @@ def spectral_radius_bound(X, log2_exponent):
Returns upper bound on the largest eigenvalue of square symmetric matrix X.
log2_exponent must be a positive-valued integer. The larger it is, the
slower and tighter the bound. Values up to 5 should usually suffice. The
slower and tighter the bound. Values up to 5 should usually suffice. The
algorithm works by multiplying X by itself this many times.
From V.Pan, 1990. "Estimating the Extremal Eigenvalues of a Symmetric
Matrix", Computers Math Applic. Vol 20 n. 2 pp 17-22.
Rq: an efficient algorithm, not used here, is defined in this paper.
"""
if X.type.ndim != 2:
raise TypeError('spectral_radius_bound requires a matrix argument', X)
......
"""
This module provides the Scan Op
This module provides the Scan Op.
Scanning is a general form of recurrence, which can be used for looping.
The idea is that you *scan* a function along some input sequence, producing
an output at each time-step that can be seen (but not modified) by the
function at the next time-step. (Technically, the function can see the
function at the next time-step. Technically, the function can see the
previous K time-steps of your outputs and L time steps (from the past and
future) of your inputs.
......@@ -26,6 +26,7 @@ the symbolic graph.
The Scan Op should typically be used by calling any of the following
functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``,
``foldr()``.
"""
......
"""
This module provides the Scan Op
This module provides the Scan Op.
Scanning is a general form of recurrence, which can be used for looping.
The idea is that you *scan* a function along some input sequence, producing
an output at each time-step that can be seen (but not modified) by the
function at the next time-step. (Technically, the function can see the
function at the next time-step. Technically, the function can see the
previous K time-steps of your outputs and L time steps (from past and
future) of your inputs.
......@@ -32,6 +32,7 @@ host at each step
The Scan Op should typically be used by calling any of the following
functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``,
``foldr()``.
"""
__docformat__ = 'restructedtext en'
__authors__ = ("Razvan Pascanu "
......@@ -76,7 +77,9 @@ def scan(fn,
This function constructs and applies a Scan op to the provided
arguments.
:param fn:
Parameters
----------
fn
``fn`` is a function that describes the operations involved in one
step of ``scan``. ``fn`` should construct variables describing the
output of one iteration step. It should expect as input theano
......@@ -167,7 +170,7 @@ def scan(fn,
number of steps ) is still required even though a condition is
passed (and it is used to allocate memory if needed).
:param sequences:
sequences
``sequences`` is the list of Theano variables or dictionaries
describing the sequences ``scan`` has to iterate over. If a
sequence is given as wrapped in a dictionary, then a set of optional
......@@ -185,8 +188,7 @@ def scan(fn,
Any Theano variable in the list ``sequences`` is automatically
wrapped into a dictionary where ``taps`` is set to ``[0]``
:param outputs_info:
outputs_info
``outputs_info`` is the list of Theano variables or dictionaries
describing the initial state of the outputs computed
recurrently. When this initial states are given as dictionary
......@@ -243,15 +245,13 @@ def scan(fn,
raised (because there is no convention on how scan should map
the provided information to the outputs of ``fn``)
:param non_sequences:
non_sequences
``non_sequences`` is the list of arguments that are passed to
``fn`` at each steps. One can opt to exclude variable
used in ``fn`` from this list as long as they are part of the
computational graph, though for clarity we encourage not to do so.
:param n_steps:
n_steps
``n_steps`` is the number of steps to iterate given as an int
or Theano scalar. If any of the input sequences do not have
enough elements, scan will raise an error. If the *value is 0* the
......@@ -261,8 +261,7 @@ def scan(fn,
in time. If n_steps is not provided, ``scan`` will figure
out the amount of steps it should run given its input sequences.
:param truncate_gradient:
truncate_gradient
``truncate_gradient`` is the number of steps to use in truncated
BPTT. If you compute gradients through a scan op, they are
computed using backpropagation through time. By providing a
......@@ -270,16 +269,14 @@ def scan(fn,
of classical BPTT, where you go for only ``truncate_gradient``
number of steps back in time.
:param go_backwards:
go_backwards
``go_backwards`` is a flag indicating if ``scan`` should go
backwards through the sequences. If you think of each sequence
as indexed by time, making this flag True would mean that
``scan`` goes back in time, namely that for any sequence it
starts from the end and goes towards 0.
:param name:
name
When profiling ``scan``, it is crucial to provide a name for any
instance of ``scan``. The profiler will produce an overall
profile of your code as well as profiles for the computation of
......@@ -287,7 +284,7 @@ def scan(fn,
appears in those profiles and can greatly help to disambiguate
information.
:param mode:
mode
It is recommended to leave this argument to None, especially
when profiling ``scan`` (otherwise the results are not going to
be accurate). If you prefer the computations of one step of
......@@ -296,7 +293,7 @@ def scan(fn,
loop are done (see ``theano.function`` for details about
possible values and their meaning).
:param profile:
profile
Flag or string. If true, or different from the empty string, a
profile object will be created and attached to the inner graph of
scan. In case ``profile`` is True, the profile object will have the
......@@ -305,18 +302,21 @@ def scan(fn,
inner graph with the new cvm linker ( with default modes,
other linkers this argument is useless)
:rtype: tuple
:return: tuple of the form (outputs, updates); ``outputs`` is either a
Theano variable or a list of Theano variables representing the
outputs of ``scan`` (in the same order as in
``outputs_info``). ``updates`` is a subclass of dictionary
specifying the
update rules for all shared variables used in scan
This dictionary should be passed to ``theano.function`` when
you compile your function. The change compared to a normal
dictionary is that we validate that keys are SharedVariable
and addition of those dictionary are validated to be consistent.
"""
Returns
-------
tuple
Tuple of the form (outputs, updates); ``outputs`` is either a
Theano variable or a list of Theano variables representing the
outputs of ``scan`` (in the same order as in
``outputs_info``). ``updates`` is a subclass of dictionary
specifying the
update rules for all shared variables used in scan
This dictionary should be passed to ``theano.function`` when
you compile your function. The change compared to a normal
dictionary is that we validate that keys are SharedVariable
and addition of those dictionary are validated to be consistent.
"""
# Note : see the internal documentation of the scan op for naming
# conventions and all other details
if options is None:
......@@ -544,6 +544,7 @@ def one_step_scan(fn,
truncate_gradient):
"""
This function is evaluated if `n_steps` evaluates to either 1 or -1.
"""
# 1. Grab slices of sequences
inputs_slices = [input[0] for input in inputs]
......
"""
This module provides the Scan Op
This module provides the Scan Op.
See scan.py for details on scan.
See scan.py for details on scan
"""
from __future__ import print_function
......@@ -157,25 +158,32 @@ class ScanOp(PureOp):
def make_thunk(self, node, storage_map, compute_map, no_recycling):
"""
:param node: the Apply node returned by the ``make_node`` function
of the scan op class
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to
reuse memory allocated by a previous call.
Parameters
----------
node
The Apply node returned by the ``make_node`` function of the scan
op class.
storage_map
dict variable -> one-element-list where a computed value for this
variable may be found.
compute_map
dict variable -> one-element-list where a boolean value will be
found. The boolean indicates whether the variable's storage_map
container contains a valid value (True) or if it has not been
computed yet (False).
no_recycling
List of variables for which it is forbidden to reuse memory
allocated by a previous call.
Notes
-----
If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
"""
# 1. Collect all memory buffers
node_input_storage = [storage_map[r] for r in node.inputs]
......
"""
This module provides utility functions for the Scan Op
This module provides utility functions for the Scan Op.
See scan.py for details on scan.
See scan.py for details on scan
"""
from __future__ import print_function
__docformat__ = 'restructedtext en'
......@@ -41,8 +42,11 @@ def expand(tensor_var, size):
``tensor_var``, namely:
rval[:d1] = tensor_var
:param tensor_var: Theano tensor variable
:param size: int
Parameters
----------
tensor_var : Theano tensor variable.
size : int
"""
# Corner case that I might use in an optimization
if size == 0:
......@@ -57,7 +61,8 @@ def expand(tensor_var, size):
def to_list(ls):
"""
Converts ``ls`` to list if it is a tuple, or wraps ``ls`` into a list if
it is not a list already
it is not a list already.
"""
if isinstance(ls, (list, tuple)):
return list(ls)
......@@ -70,7 +75,9 @@ class until(object):
Theano can end on a condition. In order to differentiate this condition
from the other outputs of scan, this class is used to wrap the condition
around it.
"""
def __init__(self, condition):
self.condition = tensor.as_tensor_variable(condition)
assert self.condition.ndim == 0
......@@ -78,10 +85,12 @@ class until(object):
def get_updates_and_outputs(ls):
"""
Parses the list ``ls`` into outputs and updates. The semantics
of ``ls`` is defined by the constructive function of scan.
Parses the list ``ls`` into outputs and updates.
The semantics of ``ls`` is defined by the constructive function of scan.
The elements of ``ls`` are either a list of expressions representing the
outputs/states, a dictionary of updates or a condition.
"""
def is_list_outputs(elem):
if (isinstance(elem, (list, tuple)) and
......@@ -150,23 +159,23 @@ def get_updates_and_outputs(ls):
def clone(output, replace=None, strict=True, share_inputs=True):
"""
Function that allows replacing subgraphs of a computational
graph. It returns a copy of the initial subgraph with the corresponding
Function that allows replacing subgraphs of a computational graph.
It returns a copy of the initial subgraph with the corresponding
substitutions.
:type output: Theano Variables (or Theano expressions)
:param outputs: Theano expression that represents the computational
graph
:type replace: dict
:param replace: dictionary describing which subgraphs should be
replaced by what
Parameters
----------
output : Theano Variables (or Theano expressions)
Theano expression that represents the computational graph.
replace : dict
Dictionary describing which subgraphs should be replaced by what.
share_inputs : bool
If True, use the same inputs (and shared variables) as the original
graph. If False, clone them. Note that cloned shared variables still
use the same underlying storage, so they will always have the same
value.
:type share_inputs: bool
:param share_inputs: If True, use the same inputs (and shared variables)
as the original graph. If False, clone them. Note that cloned
shared variables still use the same underlying storage, so they
will always have the same value.
"""
inps, outs, other_stuff = rebuild_collect_shared(output,
[],
......@@ -189,6 +198,7 @@ def canonical_arguments(sequences,
Mainly it makes sure that arguments are given as lists of dictionaries,
and that the different fields of of a dictionary are set to default
value if the user has not provided any.
"""
states_info = to_list(outputs_info)
parameters = [tensor.as_tensor_variable(x) for x in to_list(non_sequences)]
......@@ -303,13 +313,14 @@ def canonical_arguments(sequences,
def infer_shape(outs, inputs, input_shapes):
'''
"""
Compute the shape of the outputs given the shape of the inputs
of a theano graph.
We do it this way to avoid compiling the inner function just to get
the shape. Changes to ShapeFeature could require changes in this function.
'''
We do it this way to avoid compiling the inner function just to get the
shape. Changes to ShapeFeature could require changes in this function.
"""
# We use a ShapeFeature because it has all the necessary logic
# inside. We don't use the full ShapeFeature interface, but we
# let it initialize itself with an empty fgraph, otherwise we will
......@@ -326,9 +337,10 @@ def infer_shape(outs, inputs, input_shapes):
shape_feature.set_shape(inp, inp_shp)
def local_traverse(out):
'''
"""
Go back in the graph, from out, adding computable shapes to shape_of.
'''
"""
if out in shape_feature.shape_of:
# Its shape is already known
......@@ -358,14 +370,17 @@ def allocate_memory(T, y_info, y):
"""
Allocates memory for an output of scan.
:param T: scalar
Variable representing the number of steps scan will run
:param y_info: dict
Parameters
----------
T : scalar
Variable representing the number of steps scan will run.
y_info : dict
Dictionary describing the output (more specifically describing shape
information for the output
:param y: Tensor variable
information for the output).
y : Tensor variable
Expression describing the computation resulting in out entry of y.
It can be used to infer the shape of y
It can be used to infer the shape of y.
"""
if 'shape' in y_info:
return tensor.zeros([T, ] + list(y_info['shape']),
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论