提交 931f4e93 authored 作者: abergeron's avatar abergeron

Merge pull request #3297 from harlouci/numpydoc_sandbox_2

Numpydoc sandbox 2
...@@ -12,7 +12,10 @@ from theano.sandbox.cuda import CudaNdarrayType, GpuOp ...@@ -12,7 +12,10 @@ from theano.sandbox.cuda import CudaNdarrayType, GpuOp
class GpuConv3D(GpuOp): class GpuConv3D(GpuOp):
""" GPU implementation of Conv3D """ """
GPU implementation of Conv3D.
"""
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
...@@ -25,10 +28,18 @@ class GpuConv3D(GpuOp): ...@@ -25,10 +28,18 @@ class GpuConv3D(GpuOp):
def make_node(self, V, W, b, d): def make_node(self, V, W, b, d):
""" """
:param V: Visible unit, input
:param W: Weights, filter Parameters
:param b: bias ----------
:param d: strides when moving the filter over the input V
Visible unit, input.
W
Weights, filter.
b
Bias.
d
Strides when moving the filter over the input.
""" """
V_ = as_cuda_ndarray_variable(V) V_ = as_cuda_ndarray_variable(V)
W_ = as_cuda_ndarray_variable(W) W_ = as_cuda_ndarray_variable(W)
......
...@@ -15,14 +15,25 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu, ...@@ -15,14 +15,25 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
class GpuConvGrad3D(GpuOp): class GpuConvGrad3D(GpuOp):
""" GPU version of gradient of ConvGrad3D with respect to W """ """
GPU version of gradient of ConvGrad3D with respect to W.
"""
def make_node(self, V, d, WShape, dCdH): def make_node(self, V, d, WShape, dCdH):
""" """
:param V: visible
:param d: strides Parameters
:param WShape: shapes of the weights -> shape of this op output ----------
:param dCdH: other input with what V will be convolved. V
Visible.
d
Strides.
WShape
Shapes of the weights -> shape of this op output.
dCdH
Other input with which V will be convolved.
""" """
V_ = as_cuda_ndarray_variable(V) V_ = as_cuda_ndarray_variable(V)
d_ = T.as_tensor_variable(d) d_ = T.as_tensor_variable(d)
......
...@@ -16,7 +16,11 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu, ...@@ -16,7 +16,11 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
class GpuConvTransp3D(GpuOp): class GpuConvTransp3D(GpuOp):
""" The gpu version of ConvTransp3D """ """
The gpu version of ConvTransp3D.
"""
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
......
...@@ -94,12 +94,14 @@ cuda_enabled = False ...@@ -94,12 +94,14 @@ cuda_enabled = False
# Code factorized within a function so that it may be called from multiple # Code factorized within a function so that it may be called from multiple
# places (which is not currently the case, but may be useful in the future). # places (which is not currently the case, but may be useful in the future).
def set_cuda_disabled(): def set_cuda_disabled():
"""Function used to disable cuda. """
Function used to disable cuda.
A warning is displayed, so that the user is aware that cuda-based code is A warning is displayed, so that the user is aware that cuda-based code is
not going to work. not going to work.
Note that there is no point calling this function from outside of Note that there is no point calling this function from outside of
`cuda.__init__`, since it has no effect once the module is loaded. `cuda.__init__`, since it has no effect once the module is loaded.
""" """
global cuda_available, cuda_warning_is_displayed global cuda_available, cuda_warning_is_displayed
cuda_available = False cuda_available = False
...@@ -116,8 +118,9 @@ libcuda_ndarray_so = os.path.join(cuda_ndarray_loc, ...@@ -116,8 +118,9 @@ libcuda_ndarray_so = os.path.join(cuda_ndarray_loc,
def try_import(): def try_import():
""" """
load the cuda_ndarray module if present and up to date Load the cuda_ndarray module if present and up to date.
return True if loaded correctly, otherwise return False Return True if loaded correctly, otherwise return False.
""" """
cuda_files = ( cuda_files = (
'cuda_ndarray.cu', 'cuda_ndarray.cu',
...@@ -219,6 +222,7 @@ if cuda_available: ...@@ -219,6 +222,7 @@ if cuda_available:
def ok(): def ok():
""" """
Check if an existing library exists and can be read. Check if an existing library exists and can be read.
""" """
try: try:
open(libcuda_ndarray_so).close() open(libcuda_ndarray_so).close()
...@@ -266,6 +270,7 @@ class GpuOp(theano.gof.Op): ...@@ -266,6 +270,7 @@ class GpuOp(theano.gof.Op):
It is defined in __init__.py so that it exists even when `cuda_available` It is defined in __init__.py so that it exists even when `cuda_available`
is False (this is necessary to avoid breaking the test suite). is False (this is necessary to avoid breaking the test suite).
""" """
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling):
...@@ -331,18 +336,23 @@ def use(device, ...@@ -331,18 +336,23 @@ def use(device,
test_driver=True): test_driver=True):
""" """
Error and warning about CUDA should be displayed only when this Error and warning about CUDA should be displayed only when this
function is called. We need to be able to load this module only function is called. We need to be able to load this module only
to check if it is available! to check if it is available!
:param device: string "cpu", "gpu", "gpuN" (N is the device number to use) Parameters
:param force: Will always raise an exception if we can't use the gpu. ----------
:param default_to_move_computation_to_gpu: If gpu init succeeded, enable by device : string
default optimizations to move "cpu", "gpu", "gpuN" (N is the device number to use).
computations to the gpu force
:param move_shared_float32_to_gpu: If gpu init succeeded, put new shared Will always raise an exception if we can't use the gpu.
variables in float32 on the gpu. default_to_move_computation_to_gpu
:param enable_cuda: If the gpu is correctly enabled, If gpu init succeeded, enable by default optimizations to move
set the variable cuda_enabled to True. computations to the gpu.
move_shared_float32_to_gpu
If gpu init succeeded, put new shared variables in float32 on the gpu.
enable_cuda
If the gpu is correctly enabled, set the variable cuda_enabled to True.
""" """
global cuda_enabled, cuda_initialization_error_message global cuda_enabled, cuda_initialization_error_message
if force and not cuda_available and device.startswith('gpu'): if force and not cuda_available and device.startswith('gpu'):
...@@ -480,7 +490,7 @@ use.device_number = None ...@@ -480,7 +490,7 @@ use.device_number = None
def unuse(): def unuse():
""" """
This undo what was done by the call to This undoes what was done by the call to
use('gpu[0-9]', default_to_move_computation_to_gpu=True, use('gpu[0-9]', default_to_move_computation_to_gpu=True,
move_shared_float32_to_gpu=True, move_shared_float32_to_gpu=True,
...@@ -488,7 +498,9 @@ def unuse(): ...@@ -488,7 +498,9 @@ def unuse():
This is used in Pylearn2 tests to enable/disable the GPU when needed. This is used in Pylearn2 tests to enable/disable the GPU when needed.
After this call, the rest of Theano think the GPU shouldn't be used by default. After this call, the rest of Theano thinks the GPU shouldn't be used by
default.
""" """
global cuda_enabled global cuda_enabled
cuda_enabled = False cuda_enabled = False
...@@ -502,9 +514,11 @@ def unuse(): ...@@ -502,9 +514,11 @@ def unuse():
def handle_shared_float32(tf): def handle_shared_float32(tf):
"""Set the default shared type for float32 tensor to CudaNdarrayType """
Set the default shared type for float32 tensor to CudaNdarrayType.
This function is intended to be called from use(gpu_index), not directly. This function is intended to be called from use(gpu_index), not directly.
""" """
if tf: if tf:
theano.compile.shared_constructor(float32_shared_constructor) theano.compile.shared_constructor(float32_shared_constructor)
......
"""This file implement 3 different version of the elemwise op on the """
This file implements 3 different versions of the elemwise op on the
gpu. Only NaiveAlgo is used and it is not very naive now. gpu. Only NaiveAlgo is used and it is not very naive now.
The elemwise fct are also used with scalar operation! So it can happen The elemwise fct are also used with scalar operation! So it can happen
...@@ -40,12 +41,25 @@ def get_str_list_logical_scalar(node, value_str='ii_i%i_value', ...@@ -40,12 +41,25 @@ def get_str_list_logical_scalar(node, value_str='ii_i%i_value',
class SupportCodeError(Exception): class SupportCodeError(Exception):
"""It is currently not possible to auto-generate a GPU implementation for """
It is currently not possible to auto-generate a GPU implementation for
an elementwise Op with c_support_code_apply(). an elementwise Op with c_support_code_apply().
But we support Op.c_support_code.""" But we support Op.c_support_code.
"""
class NaiveAlgo(object): class NaiveAlgo(object):
"""
Parameters
----------
scalar_op
The scalar operation to execute on each element.
sync
If True, will wait after the kernel launch and check for error call.
"""
verbose = 0 # 1, 2 or 3 for more verbose output. verbose = 0 # 1, 2 or 3 for more verbose output.
@property @property
...@@ -57,10 +71,6 @@ class NaiveAlgo(object): ...@@ -57,10 +71,6 @@ class NaiveAlgo(object):
return ver return ver
def __init__(self, scalar_op, sync=True, inplace_pattern=None): def __init__(self, scalar_op, sync=True, inplace_pattern=None):
"""
:param scalar_op: the scalar operation to execute on each element.
:param sync: if True, will wait after the kernel launch and check for error call.
"""
if inplace_pattern is None: if inplace_pattern is None:
inplace_pattern = {} inplace_pattern = {}
try: try:
...@@ -154,8 +164,10 @@ class NaiveAlgo(object): ...@@ -154,8 +164,10 @@ class NaiveAlgo(object):
return sio.getvalue() return sio.getvalue()
def c_src_kernel_tiling(self, node, nodename): def c_src_kernel_tiling(self, node, nodename):
""" The kernel applies to problems with <= 5 dimensions """ """
The kernel applies to problems with <= 5 dimensions.
"""
# The kernel is intended to be structured roughly like this: # The kernel is intended to be structured roughly like this:
""" """
static __global__ void kernel() static __global__ void kernel()
...@@ -278,8 +290,10 @@ class NaiveAlgo(object): ...@@ -278,8 +290,10 @@ class NaiveAlgo(object):
return sio.getvalue() return sio.getvalue()
def c_src_kernel_tiling_less_registers(self, node, nodename): def c_src_kernel_tiling_less_registers(self, node, nodename):
""" The kernel applies to problems with <= 5 dimensions """ """
The kernel applies to problems with <= 5 dimensions.
"""
nd = node.outputs[0].type.ndim nd = node.outputs[0].type.ndim
n_in = len(node.inputs) n_in = len(node.inputs)
n_out = len(node.outputs) n_out = len(node.outputs)
...@@ -1049,12 +1063,16 @@ class ErfinvGPU(Erfinv): ...@@ -1049,12 +1063,16 @@ class ErfinvGPU(Erfinv):
""" """
Provides a c-code implementation of the inverse error function for GPU. Provides a c-code implementation of the inverse error function for GPU.
Note: We do not add this c_code to theano.scalar.basic_scipy.Erfinv, as we Notes
-----
We do not add this c_code to theano.scalar.basic_scipy.Erfinv, as we
currently rely on Nvidia's cublas library to provide the erfinv currently rely on Nvidia's cublas library to provide the erfinv
c-implementation (which requires different c_headers). As it stands, c-implementation (which requires different c_headers). As it stands,
theano.scalar.basic_scipy.Erfinv does not have c_code as scipy does not theano.scalar.basic_scipy.Erfinv does not have c_code as scipy does not
export the required C function export the required C function.
""" """
def c_headers(self): def c_headers(self):
return ['math_functions.h', 'cublas_v2.h'] return ['math_functions.h', 'cublas_v2.h']
...@@ -1070,14 +1088,19 @@ erfinv_gpu = ErfinvGPU(upgrade_to_float_no_complex, name='erfinv_gpu') ...@@ -1070,14 +1088,19 @@ erfinv_gpu = ErfinvGPU(upgrade_to_float_no_complex, name='erfinv_gpu')
class ErfcxGPU(Erfinv): class ErfcxGPU(Erfinv):
""" """
Provides a c-code implementation of the scaled complementary error function for GPU. Provides a c-code implementation of the scaled complementary error function
for GPU.
Note: We do not add this c_code to theano.scalar.basic_scipy.Erfcx, as we Notes
-----
We do not add this c_code to theano.scalar.basic_scipy.Erfcx, as we
currently rely on Nvidia's cublas library to provide the erfcx currently rely on Nvidia's cublas library to provide the erfcx
c-implementation (which requires different c_headers). As it stands, c-implementation (which requires different c_headers). As it stands,
theano.scalar.basic_scipy.Erfcx does not have c_code as scipy does not theano.scalar.basic_scipy.Erfcx does not have c_code as scipy does not
export the required C function export the required C function.
""" """
def c_headers(self): def c_headers(self):
return ['math_functions.h', 'cublas_v2.h'] return ['math_functions.h', 'cublas_v2.h']
...@@ -1088,4 +1111,4 @@ class ErfcxGPU(Erfinv): ...@@ -1088,4 +1111,4 @@ class ErfcxGPU(Erfinv):
raise NotImplementedError('type not supported', type) raise NotImplementedError('type not supported', type)
return "%(z)s = erfcx(%(x)s);" % locals() return "%(z)s = erfcx(%(x)s);" % locals()
erfcx_gpu = ErfcxGPU(upgrade_to_float_no_complex, name='erfcx_gpu') erfcx_gpu = ErfcxGPU(upgrade_to_float_no_complex, name='erfcx_gpu')
\ No newline at end of file
...@@ -13,13 +13,19 @@ if cuda_available: ...@@ -13,13 +13,19 @@ if cuda_available:
class GpuCumsum(CumsumOp, GpuOp): class GpuCumsum(CumsumOp, GpuOp):
"""
Parameters
----------
axis
Cannot be None. If you want the array flattened, flatten it beforehand.
"""
SUPPORTED_NDIMS = 3 SUPPORTED_NDIMS = 3
__props__ = ('axis', 'max_threads_dim0', 'max_grid_size1', 'max_grid_size2') __props__ = ('axis', 'max_threads_dim0', 'max_grid_size1', 'max_grid_size2')
def __init__(self, axis): def __init__(self, axis):
"""
``axis`` can not be None. If you want the array flatten, do it before.
"""
self.axis = axis self.axis = axis
self.max_threads_dim0 = None self.max_threads_dim0 = None
self.max_grid_size1 = None self.max_grid_size1 = None
...@@ -415,11 +421,13 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -415,11 +421,13 @@ class GpuCumsum(CumsumOp, GpuOp):
def values_eq_approx_high_tol(a, b): def values_eq_approx_high_tol(a, b):
"""This fct is needed to don't have DebugMode raise useless """
This fct is needed to prevent DebugMode from raising a useless
error due to rounding error. error due to rounding error.
This happen with big input size due to change in the order of This happen with big input size due to change in the order of
operation. operation.
""" """
rtol = None rtol = None
if a.size > 100000: if a.size > 100000:
......
...@@ -171,10 +171,11 @@ class CuIFFTOp(ScikitsCudaOp): ...@@ -171,10 +171,11 @@ class CuIFFTOp(ScikitsCudaOp):
def to_complex_gpuarray(x, copyif=False): def to_complex_gpuarray(x, copyif=False):
""" """
adapted version of theano.misc.pycuda_utils.to_gpuarray that takes Adapted version of theano.misc.pycuda_utils.to_gpuarray that takes
an array with an extra trailing dimension of length 2 for an array with an extra trailing dimension of length 2 for
real/imaginary parts, and turns it into a complex64 PyCUDA real/imaginary parts, and turns it into a complex64 PyCUDA
GPUArray. GPUArray.
""" """
if not isinstance(x, CudaNdarray): if not isinstance(x, CudaNdarray):
raise ValueError("We can transfer only CudaNdarray " raise ValueError("We can transfer only CudaNdarray "
...@@ -213,7 +214,8 @@ def bptrs(a): ...@@ -213,7 +214,8 @@ def bptrs(a):
""" """
Pointer array when input represents a batch of matrices. Pointer array when input represents a batch of matrices.
taken from scikits.cuda tests/test_cublas.py Taken from scikits.cuda tests/test_cublas.py.
""" """
return pycuda.gpuarray.arange(a.ptr, a.ptr + a.shape[0] * a.strides[0], return pycuda.gpuarray.arange(a.ptr, a.ptr + a.shape[0] * a.strides[0],
a.strides[0], dtype=cublas.ctypes.c_void_p) a.strides[0], dtype=cublas.ctypes.c_void_p)
...@@ -222,8 +224,9 @@ def bptrs(a): ...@@ -222,8 +224,9 @@ def bptrs(a):
def sc_complex_dot_batched(bx_gpu, by_gpu, bc_gpu, transa='N', transb='N', def sc_complex_dot_batched(bx_gpu, by_gpu, bc_gpu, transa='N', transb='N',
handle=None): handle=None):
""" """
uses cublasCgemmBatched to compute a bunch of complex dot products Uses cublasCgemmBatched to compute a bunch of complex dot products
in parallel in parallel.
""" """
if handle is None: if handle is None:
handle = scikits.cuda.misc._global_cublas_handle handle = scikits.cuda.misc._global_cublas_handle
...@@ -292,7 +295,9 @@ class BatchedComplexDotOp(ScikitsCudaOp): ...@@ -292,7 +295,9 @@ class BatchedComplexDotOp(ScikitsCudaOp):
""" """
This version uses cublasCgemmBatched under the hood, instead of This version uses cublasCgemmBatched under the hood, instead of
doing multiple cublasCgemm calls. doing multiple cublasCgemm calls.
""" """
def make_node(self, inp1, inp2): def make_node(self, inp1, inp2):
inp1 = basic_ops.gpu_contiguous( inp1 = basic_ops.gpu_contiguous(
basic_ops.as_cuda_ndarray_variable(inp1)) basic_ops.as_cuda_ndarray_variable(inp1))
...@@ -355,10 +360,15 @@ batched_complex_dot = BatchedComplexDotOp() ...@@ -355,10 +360,15 @@ batched_complex_dot = BatchedComplexDotOp()
def mult_and_reduce(input_fft_v, filters_fft_v, input_shape=None, def mult_and_reduce(input_fft_v, filters_fft_v, input_shape=None,
filter_shape=None): filter_shape=None):
""" """
input_fft_v is (b, ic, i0, i1//2 + 1, 2)
filters_fft_v is (oc, ic, i0, i1//2 + 1, 2)
"""
Parameters
----------
input_fft_v
It's (b, ic, i0, i1//2 + 1, 2).
filters_fft_v
It's (oc, ic, i0, i1//2 + 1, 2).
"""
if input_shape is None: if input_shape is None:
input_shape = input_fft_v.shape # symbolic input_shape = input_fft_v.shape # symbolic
...@@ -405,16 +415,19 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -405,16 +415,19 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
On valid mode the filters must be smaller than the input. On valid mode the filters must be smaller than the input.
input: (b, ic, i0, i1) Parameters
filters: (oc, ic, f0, f1) ----------
input
border_mode: 'valid' of 'full' (b, ic, i0, i1).
filters
(oc, ic, f0, f1).
border_mode : {'valid', 'full'}
pad_last_dim
Unconditionally pad the last dimension of the input
to turn it from odd to even. Will strip the
padding before returning the result.
pad_last_dim: Unconditionally pad the last dimension of the input
to to turn it from odd to even. Will strip the
padding before returning the result.
""" """
# use symbolic shapes to compute shape info at runtime if not specified # use symbolic shapes to compute shape info at runtime if not specified
if image_shape is None: if image_shape is None:
image_shape = input.shape image_shape = input.shape
...@@ -546,16 +559,19 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -546,16 +559,19 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
On valid mode the filters must be smaller than the input. On valid mode the filters must be smaller than the input.
input: (b, ic, i0, i1, i2) Parameters
filters: (oc, ic, f0, f1, i2) ----------
input
border_mode: 'valid' of 'full' (b, ic, i0, i1, i2).
filters
(oc, ic, f0, f1, f2).
border_mode : {'valid', 'full'}
pad_last_dim
Unconditionally pad the last dimension of the input
to turn it from odd to even. Will strip the
padding before returning the result.
pad_last_dim: Unconditionally pad the last dimension of the input
to to turn it from odd to even. Will strip the
padding before returning the result.
""" """
# use symbolic shapes to compute shape info at runtime if not specified # use symbolic shapes to compute shape info at runtime if not specified
if image_shape is None: if image_shape is None:
image_shape = input.shape image_shape = input.shape
...@@ -670,5 +686,3 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -670,5 +686,3 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
# output should now be the result of a batched valid convolution # output should now be the result of a batched valid convolution
# of the input with the filters. # of the input with the filters.
return basic_ops.as_cuda_ndarray_variable(output) return basic_ops.as_cuda_ndarray_variable(output)
""" Helper routines for generating gpu kernels for nvcc.
""" """
Helper routines for generating gpu kernels for nvcc.
"""
def nvcc_kernel(name, params, body):
"""Return the c code of a kernel function.
:param params: the parameters to the function as one or more strings def nvcc_kernel(name, params, body):
"""
Return the c code of a kernel function.
:param body: the [nested] list of statements for the body of the Parameters
function. These will be separated by ';' characters. ----------
params
The parameters to the function as one or more strings.
body
The [nested] list of statements for the body of the
function. These will be separated by ';' characters.
""" """
paramstr = ', '.join(params) paramstr = ', '.join(params)
...@@ -29,7 +35,10 @@ def nvcc_kernel(name, params, body): ...@@ -29,7 +35,10 @@ def nvcc_kernel(name, params, body):
def code_version(version): def code_version(version):
"""decorator to support version-based cache mechanism""" """
Decorator to support version-based cache mechanism.
"""
if not isinstance(version, tuple): if not isinstance(version, tuple):
raise TypeError('version must be tuple', version) raise TypeError('version must be tuple', version)
...@@ -43,22 +52,31 @@ UNVERSIONED = () ...@@ -43,22 +52,31 @@ UNVERSIONED = ()
@code_version((1,)) @code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn): def inline_reduce(N, buf, pos, count, manner_fn):
"""Return C++ code for a function that reduces a contiguous buffer. """
Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer Parameters
:param pos: index of executing thread ----------
:param count: number of executing threads N
Length of the buffer.
:param manner_fn: a function that accepts strings of arguments a buf
Buffer pointer.
pos
Index of executing thread.
count
Number of executing threads.
manner_fn
A function that accepts strings of arguments a
and b, and returns c code for their reduction. (Example: and b, and returns c code for their reduction. (Example:
return "%(a)s + %(b)s" for a sum reduction). return "%(a)s + %(b)s" for a sum reduction).
:postcondition: :postcondition:
This function leaves the answer in position 0 of the buffer. The This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function. rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times. Notes
-----
buf should be in gpu shared memory, we access it many times.
""" """
loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf)) loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
...@@ -127,18 +145,26 @@ def inline_reduce_prod(N, buf, pos, count): ...@@ -127,18 +145,26 @@ def inline_reduce_prod(N, buf, pos, count):
def inline_softmax(N, buf, buf2, threadPos, threadCount): def inline_softmax(N, buf, buf2, threadPos, threadCount):
""" """
:param N: length of the buffer Parameters
:param threadPos: index of executing thread ----------
:param threadCount: number of executing threads N
Length of the buffer.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
:Precondition: buf and buf2 contain two identical copies of the input :Precondition: buf and buf2 contain two identical copies of the input
to softmax to softmax
:Postcondition: buf contains the softmax, buf2 contains un-normalized :Postcondition: buf contains the softmax, buf2 contains un-normalized
softmax softmax
:note: buf and buf2 should be in gpu shared memory, we access it many times Notes
-----
buf and buf2 should be in gpu shared memory, we access it many times.
We use __i as an int variable in a loop.
:note2: We use __i as an int variable in a loop
""" """
return [ return [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
...@@ -169,26 +195,38 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount): ...@@ -169,26 +195,38 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
manner_fn, manner_init, manner_fn, manner_init,
b='', stride_b=''): b='', stride_b=''):
"""Return C++ code for a function that reduces a contiguous buffer. """
Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(float) Parameters
:param pos: index of executing thread ----------
:param count: number of executing threads N
:param b: Optional, pointer to the bias Length of the buffer.
:param stride_b: Optional, the stride of b if b is provided buf
Buffer pointer of size warpSize * sizeof(float).
:param manner_fn: a function that accepts strings of arguments a pos
Index of executing thread.
count
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
manner_fn
A function that accepts strings of arguments a
and b, and returns c code for their reduction. (Example: and b, and returns c code for their reduction. (Example:
return "%(a)s + %(b)s" for a sum reduction). return "%(a)s + %(b)s" for a sum reduction).
:param manner_init: a function that accepts strings of arguments a manner_init
and return c code for its initialization A function that accepts strings of arguments a
and return c code for its initialization.
:postcondition: :postcondition:
This function leaves the answer in position 0 of the buffer. The This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function. rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times. Notes
-----
buf should be in gpu shared memory, we access it many times.
""" """
if b: if b:
...@@ -263,24 +301,39 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, ...@@ -263,24 +301,39 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
b='', stride_b=''): b='', stride_b=''):
""" """
:param N: length of the buffer, atleast waprSize(32). Parameters
:param buf: a shared memory buffer of size warpSize * sizeof(float) ----------
:param x: a ptr to the gpu memory where the row is stored N
:param stride_x: the stride between each element in x Length of the buffer, at least warpSize (32).
:param sm: a ptr to the gpu memory to store the result buf
:param sm_stride: the stride between eash sm element A shared memory buffer of size warpSize * sizeof(float).
:param threadPos: index of executing thread x
:param threadCount: number of executing threads A ptr to the gpu memory where the row is stored.
:param b: Optional, pointer to the bias stride_x
:param stride_b: Optional, the stride of b if b is provided The stride between each element in x.
sm
A ptr to the gpu memory to store the result.
sm_stride
The stride between each sm element.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
:Precondition: buf is empty :Precondition: buf is empty
:Postcondition: buf[0] contains the softmax, :Postcondition: buf[0] contains the softmax,
buf2 contains un-normalized softmax buf2 contains un-normalized softmax
:note: buf should be in gpu shared memory, we access it many times. Notes
-----
buf should be in gpu shared memory, we access it many times.
We use tx as an int variable in a loop.
:note2: We use tx as an int variable in a loop
""" """
ret = [ ret = [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
......
...@@ -13,7 +13,9 @@ from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel, ...@@ -13,7 +13,9 @@ from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp): class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
""" """
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu. Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
""" """
nin = 3 nin = 3
nout = 3 nout = 3
...@@ -224,7 +226,9 @@ gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1Ho ...@@ -224,7 +226,9 @@ gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1Ho
class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp): class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
""" """
Implement CrossentropySoftmax1HotWithBiasDx on the gpu. Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
""" """
nin = 3 nin = 3
nout = 1 nout = 1
"""Gradient wrt x of the CrossentropySoftmax1Hot Op""" """Gradient wrt x of the CrossentropySoftmax1Hot Op"""
...@@ -393,7 +397,9 @@ gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasD ...@@ -393,7 +397,9 @@ gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasD
class GpuSoftmax(GpuOp): class GpuSoftmax(GpuOp):
""" """
Implement Softmax on the gpu. Implement Softmax on the gpu.
""" """
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
...@@ -555,7 +561,9 @@ gpu_softmax = GpuSoftmax() ...@@ -555,7 +561,9 @@ gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias(GpuOp): class GpuSoftmaxWithBias(GpuOp):
""" """
Implement SoftmaxWithBias on the gpu. Implement SoftmaxWithBias on the gpu.
""" """
nin = 2 nin = 2
nout = 1 nout = 1
......
...@@ -85,7 +85,10 @@ nvcc_version = None ...@@ -85,7 +85,10 @@ nvcc_version = None
def is_nvcc_available(): def is_nvcc_available():
"""Return True iff the nvcc compiler is found.""" """
Return True iff the nvcc compiler is found.
"""
def set_version(): def set_version():
p_out = output_subprocess_Popen([nvcc_path, '--version']) p_out = output_subprocess_Popen([nvcc_path, '--version'])
ver_line = decode(p_out[0]).strip().split('\n')[-1] ver_line = decode(p_out[0]).strip().split('\n')[-1]
...@@ -150,6 +153,7 @@ class NVCC_compiler(Compiler): ...@@ -150,6 +153,7 @@ class NVCC_compiler(Compiler):
""" """
These args will be received by compile_str() in the preargs parameter. These args will be received by compile_str() in the preargs parameter.
They will also be included in the "hard" part of the key module. They will also be included in the "hard" part of the key module.
""" """
flags = [flag for flag in config.nvcc.flags.split(' ') if flag] flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
if config.nvcc.fastmath: if config.nvcc.fastmath:
...@@ -209,33 +213,47 @@ class NVCC_compiler(Compiler): ...@@ -209,33 +213,47 @@ class NVCC_compiler(Compiler):
module_name, src_code, module_name, src_code,
location=None, include_dirs=[], lib_dirs=[], libs=[], preargs=[], location=None, include_dirs=[], lib_dirs=[], libs=[], preargs=[],
rpaths=rpath_defaults, py_module=True, hide_symbols=True): rpaths=rpath_defaults, py_module=True, hide_symbols=True):
""":param module_name: string (this has been embedded in the src_code """
:param src_code: a complete c or c++ source listing for the module
:param location: a pre-existing filesystem directory where the
cpp file and .so will be written
:param include_dirs: a list of include directory names
(each gets prefixed with -I)
:param lib_dirs: a list of library search path directory names
(each gets prefixed with -L)
:param libs: a list of libraries to link with
(each gets prefixed with -l)
:param preargs: a list of extra compiler arguments
:param rpaths: list of rpaths to use with Xlinker.
Defaults to `rpath_defaults`.
:param py_module: if False, compile to a shared library, but
do not import as a Python module.
:param hide_symbols: if True (the default), hide all symbols
from the library symbol table unless explicitely exported.
:returns: dynamically-imported python module of the compiled code. Parameters
----------
module_name: str
This has been embedded in the src_code.
src_code
A complete c or c++ source listing for the module.
location
A pre-existing filesystem directory where the
cpp file and .so will be written.
include_dirs
A list of include directory names (each gets prefixed with -I).
lib_dirs
A list of library search path directory names (each gets
prefixed with -L).
libs
A list of libraries to link with (each gets prefixed with -l).
preargs
A list of extra compiler arguments.
rpaths
List of rpaths to use with Xlinker. Defaults to `rpath_defaults`.
py_module
If False, compile to a shared library, but
do not import as a Python module.
hide_symbols
If True (the default), hide all symbols from the library symbol
table unless explicitly exported.
Returns
-------
module
Dynamically-imported python module of the compiled code.
(unless py_module is False, in that case returns None.) (unless py_module is False, in that case returns None.)
:note 1: On Windows 7 with nvcc 3.1 we need to compile in the Notes
real directory Otherwise nvcc never finish. -----
On Windows 7 with nvcc 3.1 we need to compile in the real directory,
otherwise nvcc never finishes.
""" """
rpaths = list(rpaths) rpaths = list(rpaths)
if sys.platform == "win32": if sys.platform == "win32":
......
...@@ -141,7 +141,9 @@ class InputToGpuOptimizer(Optimizer): ...@@ -141,7 +141,9 @@ class InputToGpuOptimizer(Optimizer):
Transfer the input of a graph to the gpu if it is necessary. Transfer the input of a graph to the gpu if it is necessary.
It should make this part of the optimizer faster, as we will need only 1 It should make this part of the optimizer faster, as we will need only 1
pass on the fgraph. pass on the fgraph.
""" """
def __init__(self): def __init__(self):
Optimizer.__init__(self) Optimizer.__init__(self)
...@@ -208,7 +210,10 @@ def dtype_in_elemwise_supported(op): ...@@ -208,7 +210,10 @@ def dtype_in_elemwise_supported(op):
Return True if the Elemwise op is supported on the gpu. Return True if the Elemwise op is supported on the gpu.
Return False otherwise. Return False otherwise.
:note: We need to check inside the Composite op. Notes
-----
We need to check inside the Composite op.
""" """
def get_all_basic_scalar(composite_op): def get_all_basic_scalar(composite_op):
l = [] l = []
...@@ -231,8 +236,10 @@ def dtype_in_elemwise_supported(op): ...@@ -231,8 +236,10 @@ def dtype_in_elemwise_supported(op):
@register_opt() @register_opt()
@local_optimizer([tensor.Elemwise]) @local_optimizer([tensor.Elemwise])
def local_gpu_elemwise_0(node): def local_gpu_elemwise_0(node):
"""elemwise(..., host_from_gpu, ...) """
-> host_from_gpu(elemwise(gpu_from_host, ..., gpu_from_host) Elemwise(..., host_from_gpu, ...)
-> host_from_gpu(elemwise(gpu_from_host, ..., gpu_from_host)
""" """
if (isinstance(node.op, tensor.Elemwise) and if (isinstance(node.op, tensor.Elemwise) and
dtype_in_elemwise_supported(node.op)): dtype_in_elemwise_supported(node.op)):
...@@ -294,6 +301,7 @@ def local_gpu_elemwise_0(node): ...@@ -294,6 +301,7 @@ def local_gpu_elemwise_0(node):
def local_gpu_elemwise_1(node): def local_gpu_elemwise_1(node):
""" """
gpu_from_host(Elemwise)) -> GpuElemwise(gpu_from_host(...)) gpu_from_host(Elemwise)) -> GpuElemwise(gpu_from_host(...))
""" """
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_i, = node.inputs host_i, = node.inputs
...@@ -350,6 +358,7 @@ def local_gpu_dimshuffle_0(node): ...@@ -350,6 +358,7 @@ def local_gpu_dimshuffle_0(node):
""" """
dimshuffle(host_from_gpu()) -> host_from_gpu(gpu_dimshuffle) dimshuffle(host_from_gpu()) -> host_from_gpu(gpu_dimshuffle)
gpu_from_host(dimshuffle) -> gpu_dimshuffle(gpu_from_host) gpu_from_host(dimshuffle) -> gpu_dimshuffle(gpu_from_host)
""" """
if isinstance(node.op, tensor.DimShuffle): if isinstance(node.op, tensor.DimShuffle):
input, = node.inputs input, = node.inputs
...@@ -375,6 +384,7 @@ def local_gpu_specifyShape_0(node): ...@@ -375,6 +384,7 @@ def local_gpu_specifyShape_0(node):
""" """
specify_shape(host_from_gpu()) -> host_from_gpu(specify_shape) specify_shape(host_from_gpu()) -> host_from_gpu(specify_shape)
gpu_from_host(specify_shape) -> specify_shape(gpu_from_host) gpu_from_host(specify_shape) -> specify_shape(gpu_from_host)
""" """
if isinstance(node.op, tensor.SpecifyShape): if isinstance(node.op, tensor.SpecifyShape):
input = node.inputs[0] input = node.inputs[0]
...@@ -403,11 +413,11 @@ def local_gpu_dot_to_dot22(node): ...@@ -403,11 +413,11 @@ def local_gpu_dot_to_dot22(node):
transforming the vector into a matrix, apply gpudot22 and reshaping transforming the vector into a matrix, apply gpudot22 and reshaping
the output. the output.
A more suitable solution would be to use the right cublas call A more suitable solution would be to use the right cublas call.
This is needed in fast_compile This is needed in fast_compile.
"""
"""
# In case the dot does input upcast, we must check that we can # In case the dot does input upcast, we must check that we can
# make it run on the gpu. # make it run on the gpu.
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
...@@ -482,10 +492,11 @@ theano.compile.optdb.register('assert_no_cpu_op', assert_no_cpu_op, 49.2) ...@@ -482,10 +492,11 @@ theano.compile.optdb.register('assert_no_cpu_op', assert_no_cpu_op, 49.2)
@register_opt() @register_opt()
@local_optimizer([theano.ifelse.IfElse, gpu_from_host]) @local_optimizer([theano.ifelse.IfElse, gpu_from_host])
def local_gpu_lazy_ifelse(node): def local_gpu_lazy_ifelse(node):
""" """
gpu_from_host(ifelse) -> gpu_ifelse(gpu_from_host) gpu_from_host(ifelse) -> gpu_ifelse(gpu_from_host)
ifelse(host_from_gpu) -> host_from_gpu(ifelse) ifelse(host_from_gpu) -> host_from_gpu(ifelse)
""" """
if isinstance(node.op, theano.ifelse.IfElse) and not node.op.gpu: if isinstance(node.op, theano.ifelse.IfElse) and not node.op.gpu:
gpu_ifelse = theano.ifelse.IfElse(node.op.n_outs, gpu=True) gpu_ifelse = theano.ifelse.IfElse(node.op.n_outs, gpu=True)
...@@ -554,6 +565,7 @@ def local_gpu_dot22(node): ...@@ -554,6 +565,7 @@ def local_gpu_dot22(node):
gpu_from_host(dot22) -> gpudot(gpu_from_host) gpu_from_host(dot22) -> gpudot(gpu_from_host)
dot(host_from_gpu) -> host_from_gpu(gpudot22) dot(host_from_gpu) -> host_from_gpu(gpudot22)
""" """
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -577,6 +589,7 @@ def local_gpu_dot22scalar(node): ...@@ -577,6 +589,7 @@ def local_gpu_dot22scalar(node):
gpu_from_host(dot22scalar) -> gpudot(gpu_from_host) gpu_from_host(dot22scalar) -> gpudot(gpu_from_host)
dot(host_from_gpu) -> host_from_gpu(gpudot22scalar) dot(host_from_gpu) -> host_from_gpu(gpudot22scalar)
""" """
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -602,7 +615,9 @@ def local_gpu_dot22scalar(node): ...@@ -602,7 +615,9 @@ def local_gpu_dot22scalar(node):
def local_gpu_solve(node): def local_gpu_solve(node):
""" """
gpu_from_host(CpuSolve) -> GpuSolve(gpu_from_host) gpu_from_host(CpuSolve) -> GpuSolve(gpu_from_host)
CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve) CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)
""" """
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -627,6 +642,7 @@ def local_gpu_solve(node): ...@@ -627,6 +642,7 @@ def local_gpu_solve(node):
def local_gpu_gemv(node): def local_gpu_gemv(node):
""" """
gpu_from_host(gemv) -> gpu_gemv(gpu_from_host) gpu_from_host(gemv) -> gpu_gemv(gpu_from_host)
gemv(host_from_gpu) -> host_from_gpu(gpu_gemv) gemv(host_from_gpu) -> host_from_gpu(gpu_gemv)
""" """
...@@ -665,6 +681,7 @@ def local_gpu_gemv(node): ...@@ -665,6 +681,7 @@ def local_gpu_gemv(node):
def local_gpu_ger(node): def local_gpu_ger(node):
""" """
gpu_from_host(ger) -> gpu_ger(gpu_from_host) gpu_from_host(ger) -> gpu_ger(gpu_from_host)
ger(host_from_gpu) -> host_from_gpu(gpu_ger) ger(host_from_gpu) -> host_from_gpu(gpu_ger)
""" """
...@@ -706,6 +723,7 @@ def local_gpu_gemm(node): ...@@ -706,6 +723,7 @@ def local_gpu_gemm(node):
gpu_from_host(gemm) -> gpu_gemm(gpu_from_host) gpu_from_host(gemm) -> gpu_gemm(gpu_from_host)
gemm(host_from_gpu) -> host_from_gpu(gpu_gemm) gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
""" """
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -1120,7 +1138,10 @@ def local_gpu_shape(node): ...@@ -1120,7 +1138,10 @@ def local_gpu_shape(node):
@register_opt() @register_opt()
@local_optimizer([tensor.Rebroadcast]) @local_optimizer([tensor.Rebroadcast])
def local_gpu_rebroadcast(node): def local_gpu_rebroadcast(node):
'''rebroadcast(host_from_gpu(x)) -> host_from_gpu(rebroadcast(x))''' """
rebroadcast(host_from_gpu(x)) -> host_from_gpu(rebroadcast(x))
"""
if isinstance(node.op, tensor.Rebroadcast): if isinstance(node.op, tensor.Rebroadcast):
x, = node.inputs x, = node.inputs
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
...@@ -1342,7 +1363,8 @@ def local_conv_fft_full(node): ...@@ -1342,7 +1363,8 @@ def local_conv_fft_full(node):
def values_eq_approx_high_tol(a, b): def values_eq_approx_high_tol(a, b):
"""This fct is needed to don't have DebugMode raise useless """
This fct is needed to don't have DebugMode raise useless
error due to rounding error. error due to rounding error.
This happens as we reduce on the last two dimensions, so this This happens as we reduce on the last two dimensions, so this
...@@ -1364,6 +1386,7 @@ def local_gpu_conv(node): ...@@ -1364,6 +1386,7 @@ def local_gpu_conv(node):
gpu_from_host(conv) -> gpu_conv(gpu_from_host) gpu_from_host(conv) -> gpu_conv(gpu_from_host)
conv(host_from_gpu) -> host_from_gpu(gpu_conv) conv(host_from_gpu) -> host_from_gpu(gpu_conv)
""" """
def GpuConvOp_from_ConvOp(op): def GpuConvOp_from_ConvOp(op):
logical_img_hw = None logical_img_hw = None
...@@ -1534,7 +1557,10 @@ conv_groupopt.register('local_conv_gemm', local_conv_gemm, 30, ...@@ -1534,7 +1557,10 @@ conv_groupopt.register('local_conv_gemm', local_conv_gemm, 30,
class LocalCudaMetaOptimizer(LocalMetaOptimizer): class LocalCudaMetaOptimizer(LocalMetaOptimizer):
"""Base class for CUDA-based LocalMetaOptimizers""" """
Base class for CUDA-based LocalMetaOptimizers.
"""
def time_call(self, fn): def time_call(self, fn):
# Override time_call() to do device synchronization # Override time_call() to do device synchronization
...@@ -1827,7 +1853,6 @@ def local_gpu_join(node): ...@@ -1827,7 +1853,6 @@ def local_gpu_join(node):
by other opts, leaving us with by other opts, leaving us with
host_from_gpu(gpu_join) host_from_gpu(gpu_join)
For intermediate places in the graph not covered by the first opt, the For intermediate places in the graph not covered by the first opt, the
following could be useful: following could be useful:
...@@ -1911,8 +1936,12 @@ optdb.register('InplaceGpuBlasOpt', ...@@ -1911,8 +1936,12 @@ optdb.register('InplaceGpuBlasOpt',
def get_device_type_sizes(): def get_device_type_sizes():
""" """
:return:(gpu ptr size, cpu ptr size, int sizes(gpu and cpu))
:return type: tuple Returns
-------
tuple
(gpu ptr size, cpu ptr size, int sizes(gpu and cpu)).
""" """
if hasattr(get_device_type_sizes, 'rval'): if hasattr(get_device_type_sizes, 'rval'):
return get_device_type_sizes.rval return get_device_type_sizes.rval
...@@ -1941,7 +1970,7 @@ def get_device_type_sizes(): ...@@ -1941,7 +1970,7 @@ def get_device_type_sizes():
def max_inputs_to_GpuElemwise(node): def max_inputs_to_GpuElemwise(node):
""" """
return the maximum number of inputs this GpuElemwise Apply node can Return the maximum number of inputs this GpuElemwise Apply node can
accept. accept.
This is needed as currently there is a limit of 256 bytes of This is needed as currently there is a limit of 256 bytes of
...@@ -1950,8 +1979,8 @@ def max_inputs_to_GpuElemwise(node): ...@@ -1950,8 +1979,8 @@ def max_inputs_to_GpuElemwise(node):
2.x (not used). 2.x (not used).
This measures the number of parameters we put in our GPU function and This measures the number of parameters we put in our GPU function and
computes the maximum number of inputs that respect the 256 byte computes the maximum number of inputs that respect the 256 byte limit.
limit.
""" """
type_sizes = get_device_type_sizes() type_sizes = get_device_type_sizes()
int_size = type_sizes['int_size'] int_size = type_sizes['int_size']
...@@ -1986,6 +2015,7 @@ def split_huge_add_or_mul(node): ...@@ -1986,6 +2015,7 @@ def split_huge_add_or_mul(node):
This should not happen for other GpuElemwise as there is only the fusion This should not happen for other GpuElemwise as there is only the fusion
that can generate an op with too many inputs, and it checks for that. that can generate an op with too many inputs, and it checks for that.
""" """
if node.op.scalar_op in (scal.add, scal.mul): if node.op.scalar_op in (scal.add, scal.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(node) max_nb_inputs = max_inputs_to_GpuElemwise(node)
...@@ -2135,6 +2165,7 @@ def local_gpu_eye(node): ...@@ -2135,6 +2165,7 @@ def local_gpu_eye(node):
gpu_from_host(eye) -> gpueye(gpu_from_host) gpu_from_host(eye) -> gpueye(gpu_from_host)
eye(host_from_gpu) -> host_from_gpu(gpueye) eye(host_from_gpu) -> host_from_gpu(gpueye)
""" """
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -2167,10 +2198,11 @@ def safe_to_cpu(x): ...@@ -2167,10 +2198,11 @@ def safe_to_cpu(x):
def gpu_safe_new(x, tag=''): def gpu_safe_new(x, tag=''):
""" """
Internal function that constructs a new variable from x with the same Internal function that constructs a new variable from x with the same
type, but with a different name ( old name + tag). This function is used type, but with a different name (old name + tag). This function is used
by gradient, or the R-op to construct new variables for the inputs of by gradient, or the R-op to construct new variables for the inputs of
the inner graph such that there is no interference between the original the inner graph such that there is no interference between the original
graph and the newly constructed graph. graph and the newly constructed graph.
""" """
if hasattr(x, 'name') and x.name is not None: if hasattr(x, 'name') and x.name is not None:
nw_name = x.name + tag nw_name = x.name + tag
...@@ -2188,8 +2220,9 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -2188,8 +2220,9 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
""" """
Different interface to clone, that allows you to pass inputs. Different interface to clone, that allows you to pass inputs.
Compared to clone, this method always replaces the inputs with Compared to clone, this method always replaces the inputs with
new variables of the same type, and returns those ( in the same new variables of the same type, and returns those (in the same
order as the original inputs). order as the original inputs).
""" """
if tag is None: if tag is None:
tag = '' tag = ''
...@@ -2217,7 +2250,9 @@ def tensor_to_cuda(x): ...@@ -2217,7 +2250,9 @@ def tensor_to_cuda(x):
def local_gpu_extract_diagonal(node): def local_gpu_extract_diagonal(node):
""" """
extract_diagonal(host_from_gpu()) -> host_from_gpu(extract_diagonal) extract_diagonal(host_from_gpu()) -> host_from_gpu(extract_diagonal)
gpu_from_host(extract_diagonal) -> extract_diagonal(gpu_from_host) gpu_from_host(extract_diagonal) -> extract_diagonal(gpu_from_host)
""" """
if (isinstance(node.op, nlinalg.ExtractDiag) and if (isinstance(node.op, nlinalg.ExtractDiag) and
isinstance(node.inputs[0].type, isinstance(node.inputs[0].type,
...@@ -2249,9 +2284,10 @@ def typeConstructor(broadcastable, dtype): ...@@ -2249,9 +2284,10 @@ def typeConstructor(broadcastable, dtype):
def gpuScanOptimization(node): def gpuScanOptimization(node):
""" """
scan(host_from_gpu) -> host_from_gpu(GPUscan) scan(host_from_gpu) -> host_from_gpu(GPUscan)
gpu_from_host(scan) -> GPUscan(gpu_from_host) gpu_from_host(scan) -> GPUscan(gpu_from_host)
"""
"""
# gpu_from_host(scan) -> GPUscan(gpu_from_host) # gpu_from_host(scan) -> GPUscan(gpu_from_host)
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
......
""" """
Define CURAND_RandomStreams - backed by CURAND Define CURAND_RandomStreams - backed by CURAND.
""" """
__authors__ = "James Bergstra" __authors__ = "James Bergstra"
...@@ -20,7 +21,8 @@ config = theano.config ...@@ -20,7 +21,8 @@ config = theano.config
class CURAND_Base(GpuOp): class CURAND_Base(GpuOp):
""" Base class for a random number generator implemented in CURAND. """
Base class for a random number generator implemented in CURAND.
The random number generator itself is an opaque reference managed by The random number generator itself is an opaque reference managed by
CURAND. This Op uses a generic-typed shared variable to point to a CObject CURAND. This Op uses a generic-typed shared variable to point to a CObject
...@@ -30,18 +32,23 @@ class CURAND_Base(GpuOp): ...@@ -30,18 +32,23 @@ class CURAND_Base(GpuOp):
The actual random number generator is allocated from the seed, on the first The actual random number generator is allocated from the seed, on the first
call to allocate random numbers (see c_code). call to allocate random numbers (see c_code).
:note: Parameters
One caveat is that the random number state is simply not serializable. ----------
Consequently, attempts to serialize functions compiled with these output_type
random numbers will fail. A theano type (e.g. tensor.fvector).
seed: int
destructive
True or False (on the generator)
Notes
-----
One caveat is that the random number state is simply not serializable.
Consequently, attempts to serialize functions compiled with these
random numbers will fail.
""" """
def __init__(self, output_type, seed, destructive): def __init__(self, output_type, seed, destructive):
"""
output_type: a theano type (e.g. tensor.fvector)
seed: integer
destructive: True or False (on the generator)
"""
theano.gof.Op.__init__(self) theano.gof.Op.__init__(self)
self.destructive = destructive self.destructive = destructive
self.seed = seed self.seed = seed
...@@ -51,11 +58,17 @@ class CURAND_Base(GpuOp): ...@@ -51,11 +58,17 @@ class CURAND_Base(GpuOp):
assert output_type.dtype == "float32" assert output_type.dtype == "float32"
def as_destructive(self): def as_destructive(self):
"""Return an destructive version of self""" """
Return a destructive version of self.
"""
return self.__class__(self.output_type, self.seed, destructive=True) return self.__class__(self.output_type, self.seed, destructive=True)
def _config(self): def _config(self):
"""Return a tuple of attributes that define the Op""" """
Return a tuple of attributes that define the Op.
"""
return ( return (
self.destructive, self.destructive,
self.output_type, self.output_type,
...@@ -81,7 +94,7 @@ class CURAND_Base(GpuOp): ...@@ -81,7 +94,7 @@ class CURAND_Base(GpuOp):
""" """
Return a symbolic sample from generator. Return a symbolic sample from generator.
cls dictates the random variable (e.g. uniform, normal) cls dictates the random variable (e.g. uniform, normal).
""" """
v_size = theano.tensor.as_tensor_variable(size) v_size = theano.tensor.as_tensor_variable(size)
...@@ -237,8 +250,11 @@ class CURAND_Base(GpuOp): ...@@ -237,8 +250,11 @@ class CURAND_Base(GpuOp):
class CURAND_Normal(CURAND_Base): class CURAND_Normal(CURAND_Base):
"""Op to draw normal numbers using CURAND
""" """
Op to draw normal numbers using CURAND.
"""
def _curand_call_str(self, **kwargs): def _curand_call_str(self, **kwargs):
return """curandGenerateNormal(*gen, return """curandGenerateNormal(*gen,
CudaNdarray_DEV_DATA(%(o_sample)s), CudaNdarray_DEV_DATA(%(o_sample)s),
...@@ -248,8 +264,11 @@ class CURAND_Normal(CURAND_Base): ...@@ -248,8 +264,11 @@ class CURAND_Normal(CURAND_Base):
class CURAND_Uniform(CURAND_Base): class CURAND_Uniform(CURAND_Base):
"""Op to draw uniform numbers using CURAND
""" """
Op to draw uniform numbers using CURAND.
"""
def _curand_call_str(self, **kwargs): def _curand_call_str(self, **kwargs):
return """ curandGenerateUniform(*gen, return """ curandGenerateUniform(*gen,
CudaNdarray_DEV_DATA(%(o_sample)s), CudaNdarray_DEV_DATA(%(o_sample)s),
...@@ -262,24 +281,31 @@ class CURAND_RandomStreams(object): ...@@ -262,24 +281,31 @@ class CURAND_RandomStreams(object):
RandomStreams instance that creates CURAND-based random variables. RandomStreams instance that creates CURAND-based random variables.
One caveat is that generators are not serializable. One caveat is that generators are not serializable.
Parameters
----------
seed : int
""" """
def __init__(self, seed): def __init__(self, seed):
""" seed: int
"""
self._start_seed = seed self._start_seed = seed
self._cur_seed = seed self._cur_seed = seed
self._has_lost_states = False # True if self.state_updates incomplete self._has_lost_states = False # True if self.state_updates incomplete
self.state_updates = [] self.state_updates = []
def updates(self): def updates(self):
"""List of all (old, new) generator update pairs created by this """
List of all (old, new) generator update pairs created by this
instance. instance.
""" """
return list(self.state_updates) return list(self.state_updates)
def next_seed(self): def next_seed(self):
"""Return a unique seed for initializing a random variable. """
Return a unique seed for initializing a random variable.
""" """
self._cur_seed += 1 self._cur_seed += 1
return self._cur_seed - 1 return self._cur_seed - 1
...@@ -295,6 +321,7 @@ class CURAND_RandomStreams(object): ...@@ -295,6 +321,7 @@ class CURAND_RandomStreams(object):
dtype=config.floatX): dtype=config.floatX):
""" """
Return symbolic tensor of uniform numbers. Return symbolic tensor of uniform numbers.
""" """
if isinstance(size, tuple): if isinstance(size, tuple):
msg = "size must be a tuple of int or a Theano variable" msg = "size must be a tuple of int or a Theano variable"
...@@ -321,8 +348,12 @@ class CURAND_RandomStreams(object): ...@@ -321,8 +348,12 @@ class CURAND_RandomStreams(object):
""" """
Return symbolic tensor of normally-distributed numbers. Return symbolic tensor of normally-distributed numbers.
:param: size: Can be a list of integer or Theano variable(ex: the shape Parameters
----------
size
Can be a list of integer or Theano variable (ex: the shape
of other Theano Variable) of other Theano Variable)
""" """
if isinstance(size, tuple): if isinstance(size, tuple):
msg = "size must be a tuple of int or a Theano variable" msg = "size must be a tuple of int or a Theano variable"
......
"""Provide CudaNdarrayType """
Provide CudaNdarrayType.
""" """
from __future__ import print_function from __future__ import print_function
import os import os
...@@ -31,36 +33,47 @@ class CudaNdarrayType(Type): ...@@ -31,36 +33,47 @@ class CudaNdarrayType(Type):
dtype = 'float32' dtype = 'float32'
Variable = None Variable = None
""" This will be set to the Variable type corresponding to this class. """
This will be set to the Variable type corresponding to this class.
That variable type is `CudaNdarrayVariable` defined in the That variable type is `CudaNdarrayVariable` defined in the
``var.py`` file beside this one. ``var.py`` file beside this one.
:note: The var file depends on the file basic_ops.py, which Notes
depends on this file. A cyclic dependency is avoided by not -----
hardcoding ``Variable = CudaNdarrayVariable``. The var file depends on the file basic_ops.py, which depends on this file.
A cyclic dependency is avoided by not hardcoding
``Variable = CudaNdarrayVariable``.
""" """
Constant = None Constant = None
""" This will be set to `CudaNdarrayConstant` defined in ``var.py`` """
This will be set to `CudaNdarrayConstant` defined in ``var.py``.
:note: Notes
-----
The var file depends on the file basic_ops.py, which depends on this file. The var file depends on the file basic_ops.py, which depends on this file.
A cyclic dependency is avoided by not hardcoding this class. A cyclic dependency is avoided by not hardcoding this class.
""" """
SharedVariable = None SharedVariable = None
""" This will be set to `CudaNdarraySharedVariable` defined in ``var.py`` """
This will be set to `CudaNdarraySharedVariable` defined in ``var.py``.
:note: Notes
-----
The var file depends on the file basic_ops.py, which depends on this file. The var file depends on the file basic_ops.py, which depends on this file.
A cyclic dependency is avoided by not hardcoding this class. A cyclic dependency is avoided by not hardcoding this class.
""" """
if cuda is not None: if cuda is not None:
value_zeros = staticmethod(cuda.CudaNdarray.zeros) value_zeros = staticmethod(cuda.CudaNdarray.zeros)
""" """
Create a CudaNdarray full of 0 values Create a CudaNdarray full of 0 values.
""" """
def __init__(self, broadcastable, name=None, dtype=None): def __init__(self, broadcastable, name=None, dtype=None):
...@@ -120,11 +133,13 @@ class CudaNdarrayType(Type): ...@@ -120,11 +133,13 @@ class CudaNdarrayType(Type):
data) data)
def filter_variable(self, other, allow_convert=True): def filter_variable(self, other, allow_convert=True):
"""Convert a Variable into a CudaNdarrayType, if compatible. """
Convert a Variable into a CudaNdarrayType, if compatible.
This Variable should either already be a CudaNdarrayType, or be This Variable should either already be a CudaNdarrayType, or be
a TensorType. It has to have the right number of dimensions, a TensorType. It has to have the right number of dimensions,
broadcastable pattern, and dtype. broadcastable pattern, and dtype.
""" """
if hasattr(other, '_as_CudaNdarrayVariable'): if hasattr(other, '_as_CudaNdarrayVariable'):
other = other._as_CudaNdarrayVariable() other = other._as_CudaNdarrayVariable()
...@@ -209,10 +224,12 @@ class CudaNdarrayType(Type): ...@@ -209,10 +224,12 @@ class CudaNdarrayType(Type):
) )
def dtype_specs(self): def dtype_specs(self):
"""Return a tuple (python type, c type, numpy typenum) that """
corresponds to self.dtype. Return a tuple (python type, c type, numpy typenum) that corresponds
to self.dtype.
This function is used internally as part of C code generation. This function is used internally as part of C code generation.
""" """
# TODO: add more type correspondences for e.g. int32, int64, float32, # TODO: add more type correspondences for e.g. int32, int64, float32,
# complex64, etc. # complex64, etc.
...@@ -236,7 +253,10 @@ class CudaNdarrayType(Type): ...@@ -236,7 +253,10 @@ class CudaNdarrayType(Type):
self.__class__.__name__, self.dtype)) self.__class__.__name__, self.dtype))
def __eq__(self, other): def __eq__(self, other):
"""Compare True iff other is the same kind of CudaNdarrayType""" """
Compare True iff other is the same kind of CudaNdarrayType.
"""
return (type(self) == type(other) and return (type(self) == type(other) and
other.broadcastable == self.broadcastable) other.broadcastable == self.broadcastable)
...@@ -248,12 +268,16 @@ class CudaNdarrayType(Type): ...@@ -248,12 +268,16 @@ class CudaNdarrayType(Type):
return theano.tensor.patternbroadcast(var, self.broadcastable) return theano.tensor.patternbroadcast(var, self.broadcastable)
def __hash__(self): def __hash__(self):
"""Hash equal for same kinds of CudaNdarrayType""" """
Hash equal for same kinds of CudaNdarrayType.
"""
return hash(type(self)) ^ hash(self.broadcastable) return hash(type(self)) ^ hash(self.broadcastable)
ndim = property(lambda self: len(self.broadcastable), ndim = property(lambda self: len(self.broadcastable),
doc="number of dimensions") doc="number of dimensions")
"""Number of dimensions """
Number of dimensions.
This read-only property is the preferred way to get the number of This read-only property is the preferred way to get the number of
dimensions of a `CudaNdarrayType`. dimensions of a `CudaNdarrayType`.
...@@ -261,12 +285,14 @@ class CudaNdarrayType(Type): ...@@ -261,12 +285,14 @@ class CudaNdarrayType(Type):
""" """
def make_variable(self, name=None): def make_variable(self, name=None):
"""Return a `TensorVariable` of this type """
Return a `TensorVariable` of this type.
:Parameters: Parameters
- `name`: str ----------
A pretty name to identify this `Variable` when printing and name : str
debugging A pretty name to identify this `Variable` when printing and
debugging.
""" """
return self.Variable(self, name=name) return self.Variable(self, name=name)
...@@ -381,7 +407,9 @@ class CudaNdarrayType(Type): ...@@ -381,7 +407,9 @@ class CudaNdarrayType(Type):
return sio.getvalue() return sio.getvalue()
def c_extract_out(self, name, sub, check_input=True, check_broadcast=True): def c_extract_out(self, name, sub, check_input=True, check_broadcast=True):
""" To allow the hack to skip check_broadcast. """
To allow the hack to skip check_broadcast.
""" """
return """ return """
if (py_%(name)s == Py_None) if (py_%(name)s == Py_None)
...@@ -411,7 +439,10 @@ class CudaNdarrayType(Type): ...@@ -411,7 +439,10 @@ class CudaNdarrayType(Type):
""" % locals() """ % locals()
def c_sync(self, name, sub): def c_sync(self, name, sub):
"""Override `CLinkerOp.c_sync` """ """
Override `CLinkerOp.c_sync`.
"""
return """ return """
//std::cerr << "sync\\n"; //std::cerr << "sync\\n";
if (NULL == %(name)s) { if (NULL == %(name)s) {
...@@ -433,11 +464,17 @@ class CudaNdarrayType(Type): ...@@ -433,11 +464,17 @@ class CudaNdarrayType(Type):
""" % locals() """ % locals()
def c_headers(self): def c_headers(self):
"""Override `CLinkerOp.c_headers` """ """
Override `CLinkerOp.c_headers`.
"""
return ['cuda_ndarray.cuh'] return ['cuda_ndarray.cuh']
def c_header_dirs(self): def c_header_dirs(self):
"""Override `CLinkerOp.c_headers` """ """
Override `CLinkerOp.c_headers`.
"""
ret = [os.path.dirname(cuda_ndarray.__file__)] ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root cuda_root = config.cuda.root
if cuda_root: if cuda_root:
......
...@@ -19,15 +19,18 @@ except ImportError: ...@@ -19,15 +19,18 @@ except ImportError:
class _operators(tensor.basic._tensor_py_operators): class _operators(tensor.basic._tensor_py_operators):
"""Define a few properties and conversion methods for CudaNdarray Variables. """
Define a few properties and conversion methods for CudaNdarray Variables.
The default implementation of arithmetic operators is to build graphs of TensorType The default implementation of arithmetic operators is to build graphs of
variables. TensorType variables.
The optimization pass (specialization) will insert pure GPU implementations. The optimization pass (specialization) will insert pure GPU implementations.
This approach relieves the Cuda-Ops of having to deal with input argument checking and This approach relieves the Cuda-Ops of having to deal with input argument
gradients. checking and gradients.
""" """
def _as_TensorVariable(self): def _as_TensorVariable(self):
return HostFromGpu()(self) return HostFromGpu()(self)
def _as_CudaNdarrayVariable(self): def _as_CudaNdarrayVariable(self):
...@@ -63,7 +66,8 @@ CudaNdarrayType.Constant = CudaNdarrayConstant ...@@ -63,7 +66,8 @@ CudaNdarrayType.Constant = CudaNdarrayConstant
class CudaNdarraySharedVariable(_operators, SharedVariable): class CudaNdarraySharedVariable(_operators, SharedVariable):
""" """
Shared Variable interface to CUDA-allocated arrays Shared Variable interface to CUDA-allocated arrays.
""" """
get_value_return_ndarray = True get_value_return_ndarray = True
...@@ -72,20 +76,23 @@ class CudaNdarraySharedVariable(_operators, SharedVariable): ...@@ -72,20 +76,23 @@ class CudaNdarraySharedVariable(_operators, SharedVariable):
""" """
Return the value of this SharedVariable's internal array. Return the value of this SharedVariable's internal array.
:param borrow: Parameters
permit the return of internal storage, when used in conjunction with ----------
``return_internal_type=True`` borrow
:param return_internal_type: Permit the return of internal storage, when used in conjunction with
True to return the internal ``cuda_ndarray`` instance rather than a ``numpy.ndarray`` ``return_internal_type=True``.
(Default False) return_internal_type
True to return the internal ``cuda_ndarray`` instance rather than a
``numpy.ndarray`` (Default False).
By default ``get_value()`` copies from the GPU to a ``numpy.ndarray`` and returns that By default ``get_value()`` copies from the GPU to a ``numpy.ndarray``
host-allocated array. and returns that host-allocated array.
``get_value(False,True)`` will return a GPU-allocated copy of the original GPU array. ``get_value(False,True)`` will return a GPU-allocated copy of the
original GPU array.
``get_value(True,True)`` will return the original GPU-allocated array without any ``get_value(True,True)`` will return the original GPU-allocated array
copying. without any copying.
""" """
if return_internal_type or not self.get_value_return_ndarray: if return_internal_type or not self.get_value_return_ndarray:
...@@ -101,33 +108,39 @@ class CudaNdarraySharedVariable(_operators, SharedVariable): ...@@ -101,33 +108,39 @@ class CudaNdarraySharedVariable(_operators, SharedVariable):
""" """
Assign `value` to the GPU-allocated array. Assign `value` to the GPU-allocated array.
:param borrow: ``True`` permits reusing `value` itself, ``False`` requires that this function Parameters
copies `value` into internal storage. ----------
borrow : bool
:note: ``True`` permits reusing `value` itself, ``False`` requires that
this function copies `value` into internal storage.
Prior to Theano 0.3.1, set_value did not work in-place on the GPU. This meant that sometimes, Notes
GPU memory for the new value would be allocated before the old memory was released. If you're -----
running near the limits of GPU memory, this could cause you to run out of GPU memory. Prior to Theano 0.3.1, set_value did not work in-place on the GPU. This
meant that sometimes, GPU memory for the new value would be allocated
before the old memory was released. If you're running near the limits of
GPU memory, this could cause you to run out of GPU memory.
Beginning with Theano 0.3.1, set_value will work in-place on the GPU, if the following conditions Beginning with Theano 0.3.1, set_value will work in-place on the GPU, if
are met: the following conditions are met:
* The destination on the GPU must be c_contiguous. * The destination on the GPU must be c_contiguous.
* The source is on the CPU. * The source is on the CPU.
* The old value must have the same dtype as the new value (which is a given for now, * The old value must have the same dtype as the new value (which is
since only float32 is supported). a given for now, since only float32 is supported).
* The old and new value must have the same shape. * The old and new value must have the same shape.
* The old value is being completely replaced by the new value (not partially modified, * The old value is being completely replaced by the new value (not
e.g. by replacing some subtensor of it). partially modified, e.g. by replacing some subtensor of it).
* You change the value of the shared variable via set_value, not via the .value * You change the value of the shared variable via set_value, not via
accessors. You should not use the .value accessors anyway, since they will soon be the .value accessors. You should not use the .value accessors
deprecated and removed. anyway, since they will soon be deprecated and removed.
It is also worth mentioning that, for efficient transfer to the GPU,
Theano will make the new data ``c_contiguous``. This can require an
extra copy of the data on the host.
It is also worth mentioning that, for efficient transfer to the GPU, Theano will make the new data The in-place update of GPU memory works when borrow is either True or False.
``c_contiguous``. This can require an extra copy of the data on the host.
The in-place update of GPU memory works when borrow is either True or False.
""" """
if not borrow: if not borrow:
# TODO: check for cuda_ndarray type # TODO: check for cuda_ndarray type
...@@ -147,8 +160,10 @@ CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable ...@@ -147,8 +160,10 @@ CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable
def cuda_shared_constructor(value, name=None, strict=False, def cuda_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False, broadcastable=None): allow_downcast=None, borrow=False, broadcastable=None):
"""SharedVariable Constructor for CudaNdarrayType""" """
SharedVariable Constructor for CudaNdarrayType.
"""
# THIS CONSTRUCTOR TRIES TO CAST VALUE TO A FLOAT32, WHICH THEN GOES ONTO THE CARD # THIS CONSTRUCTOR TRIES TO CAST VALUE TO A FLOAT32, WHICH THEN GOES ONTO THE CARD
# SO INT shared vars, float64 shared vars, etc. all end up on the card. # SO INT shared vars, float64 shared vars, etc. all end up on the card.
# THIS IS NOT THE DEFAULT BEHAVIOUR THAT WE WANT. # THIS IS NOT THE DEFAULT BEHAVIOUR THAT WE WANT.
...@@ -179,7 +194,11 @@ def cuda_shared_constructor(value, name=None, strict=False, ...@@ -179,7 +194,11 @@ def cuda_shared_constructor(value, name=None, strict=False,
def float32_shared_constructor(value, name=None, strict=False, def float32_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False, broadcastable=None): allow_downcast=None, borrow=False, broadcastable=None):
"""SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or CudaNdarray""" """
SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or
CudaNdarray.
"""
if theano.sandbox.cuda.use.device_number is None: if theano.sandbox.cuda.use.device_number is None:
theano.sandbox.cuda.use("gpu", theano.sandbox.cuda.use("gpu",
force=True, force=True,
......
...@@ -47,7 +47,9 @@ def as_gpuarray(x): ...@@ -47,7 +47,9 @@ def as_gpuarray(x):
class Kernel(object): class Kernel(object):
""" """
This class groups together all the attributes of a gpu kernel. This class groups together all the attributes of a gpu kernel.
""" """
def __init__(self, code, params, name, flags, def __init__(self, code, params, name, flags,
codevar=None, binvar=None, objvar=None): codevar=None, binvar=None, objvar=None):
self.code = code self.code = code
...@@ -113,9 +115,9 @@ class Kernel(object): ...@@ -113,9 +115,9 @@ class Kernel(object):
class GpuKernelBase(object): class GpuKernelBase(object):
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
""" """
This is the method to override. This should return an This is the method to override. This should return an iterable of Kernel
iterable of Kernel objects that describe the kernels this op objects that describe the kernels this op will need.
will need.
""" """
raise MethodNotDefined('gpu_kernels') raise MethodNotDefined('gpu_kernels')
...@@ -552,13 +554,20 @@ cuda_from_gpu = CudaFromGpu() ...@@ -552,13 +554,20 @@ cuda_from_gpu = CudaFromGpu()
class GpuAlloc(HideC, Alloc): class GpuAlloc(HideC, Alloc):
"""
Parameters
----------
memset_0
Only an optimization. If True, the
value is always 0, so the C code calls memset, which is faster.
"""
__props__ = ('memset_0',) __props__ = ('memset_0',)
_f16_ok = True _f16_ok = True
def __init__(self, memset_0=False): def __init__(self, memset_0=False):
"""memset_0 is only an optimized version. True, it mean the
value is always 0, so the c code call memset as it is faster.
"""
self.memset_0 = memset_0 self.memset_0 = memset_0
def __str__(self): def __str__(self):
...@@ -777,7 +786,9 @@ class GpuContiguous(Op): ...@@ -777,7 +786,9 @@ class GpuContiguous(Op):
""" """
Always return a c contiguous output. Copy the input only if it is Always return a c contiguous output. Copy the input only if it is
not already c contiguous. not already c contiguous.
""" """
__props__ = () __props__ = ()
view_map = {0: [0]} view_map = {0: [0]}
_f16_ok = True _f16_ok = True
...@@ -835,7 +846,9 @@ gpu_contiguous = GpuContiguous() ...@@ -835,7 +846,9 @@ gpu_contiguous = GpuContiguous()
class GpuReshape(HideC, tensor.Reshape): class GpuReshape(HideC, tensor.Reshape):
""" """
Implement Reshape on the gpu. Implement Reshape on the gpu.
""" """
_f16_ok = True _f16_ok = True
# __hash__, __eq__, __str__ come from tensor.Reshape # __hash__, __eq__, __str__ come from tensor.Reshape
...@@ -951,6 +964,7 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -951,6 +964,7 @@ class GpuReshape(HideC, tensor.Reshape):
class GpuJoin(HideC, Join): class GpuJoin(HideC, Join):
_f16_ok = True _f16_ok = True
def make_node(self, axis, *tensors): def make_node(self, axis, *tensors):
......
...@@ -16,6 +16,7 @@ class NVCC_compiler(NVCC_base): ...@@ -16,6 +16,7 @@ class NVCC_compiler(NVCC_base):
""" """
Re-implementation of compile_args that does not create an Re-implementation of compile_args that does not create an
additional context on the GPU. additional context on the GPU.
""" """
flags = [flag for flag in config.nvcc.flags.split(' ') if flag] flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
if config.nvcc.fastmath: if config.nvcc.fastmath:
......
...@@ -12,7 +12,48 @@ from .basic_ops import as_gpuarray_variable ...@@ -12,7 +12,48 @@ from .basic_ops import as_gpuarray_variable
class GpuConv(gof.Op): class GpuConv(gof.Op):
""" """
Implement the batched and stacked 2d convolution on the gpu. Implement the batched and stacked 2d convolution on the gpu.
Parameters
----------
version
Each version of c_code implements many kernels for the convolution.
By default we try to guess the best one. You can force one version with
this parameter. This parameter is used by the tests.
direction_hint
'forward', 'bprop weights' or 'bprop inputs'. Serves as a hint for graph
optimizers replacing GpuConv by other implementations. If the GpuConv is
inserted automatically, we take its value from ConvOp.
verbose
For value of 1,2 and 3. Print more information during the execution of
the convolution. Mostly used for optimization or debugging.
kshp
The size of the kernel. If provided, can generate faster code. If the
GpuConv op is automatically inserted, we take its value automatically
from the Conv op.
imshp
The size of the image. Not used for code generation but allows to select
an experimental new version in another repo.
max_threads_dim0
The maximum number of threads for the block size dimensions 0
(blockDim.x) used by the GPU function.
nkern
The number of kernels. Not used for this op, but can be used by graph
optimizers to select a more optimal convolution implementation. If the
GpuConv op is inserted automatically, we take its value from the Conv
op.
bsize
The batch size. Not used for this op, but can be used by graph
optimizers to select a more optimal convolution implementation. If the
GpuConv op is inserted automatically, we take its value from the Conv
op.
fft_opt
Deactivate fft_opt optimization at the op level when set to False. Note
that by default fft optimization aren't enabled.
See :ref:`convolution documentation <libdoc_tensor_nnet_conv>` to enable
them.
""" """
@staticmethod @staticmethod
def logical_output_shape_2d(imshp, kshp, mode): def logical_output_shape_2d(imshp, kshp, mode):
if mode == 'valid': if mode == 'valid':
...@@ -35,43 +76,6 @@ class GpuConv(gof.Op): ...@@ -35,43 +76,6 @@ class GpuConv(gof.Op):
nkern=None, nkern=None,
bsize=None, bsize=None,
fft_opt=True): fft_opt=True):
"""
:param version: each version of c_code implements many kernels for the
convolution. By default we try to guess the best one.
You can force one version with this parameter. This
parameter is used by the tests.
:param direction_hint: 'forward', 'bprop weights' or 'bprop inputs'.
Serves as a hint for graph optimizers replacing
GpuConv by other implementations. If the GpuConv is
inserted automatically, we take its value from ConvOp.
:param verbose: for value of 1,2 and 3. Print more information during
the execution of the convolution. Mostly used for
optimization or debugging.
:param kshp: The size of the kernel. If provided, can generate
faster code. If the GpuConv op is automatically
inserted,
we take its value automatically from the Conv op.
:param imshp: The size of the image. Not used for code generation but
allows to select an experimental new version in another
repo.
:param max_threads_dim0: The maximum number of threads for the
block size dimensions 0 (blockDim.x) used by the
GPU function.
:param nkern: The number of kernels. Not used for this op, but can be
used by graph optimizers to select a more optimal
convolution implementation. If the GpuConv op is inserted
automatically, we take its value from the Conv op.
:param bsize: The batch size. Not used for this op, but can be
used by graph optimizers to select a more optimal
convolution implementation. If the GpuConv op is inserted
automatically, we take its value from the Conv op.
:param fft_opt: deactivate fft_opt optimization at the op level when
set to False. Note that by default fft optimization
aren't enabled. See
:ref:`convolution documentation <libdoc_tensor_nnet_conv>`
to enable them.
"""
self.border_mode = border_mode self.border_mode = border_mode
self.subsample = subsample self.subsample = subsample
if logical_img_hw is not None: if logical_img_hw is not None:
...@@ -169,7 +173,10 @@ class GpuConv(gof.Op): ...@@ -169,7 +173,10 @@ class GpuConv(gof.Op):
return gof.Apply(self, [img, kern], [out]) return gof.Apply(self, [img, kern], [out])
def flops(self, inputs, outputs): def flops(self, inputs, outputs):
""" Useful with the hack in profilemode to print the MFlops""" """
Useful with the hack in profilemode to print the MFlops.
"""
images, kerns = inputs images, kerns = inputs
out, = outputs out, = outputs
assert images[1] == kerns[1] assert images[1] == kerns[1]
......
...@@ -470,7 +470,8 @@ class GpuElemwise(HideC, Elemwise): ...@@ -470,7 +470,8 @@ class GpuElemwise(HideC, Elemwise):
class SupportCodeError(Exception): class SupportCodeError(Exception):
""" """
We do not support certain things (such as the C++ complex struct) We do not support certain things (such as the C++ complex struct).
""" """
...@@ -571,14 +572,22 @@ class GpuDimShuffle(HideC, DimShuffle): ...@@ -571,14 +572,22 @@ class GpuDimShuffle(HideC, DimShuffle):
class GpuCAReduceCuda(HideC, CAReduceDtype): class GpuCAReduceCuda(HideC, CAReduceDtype):
"""GpuCAReduceCuda is a Reduction along some dimensions by a scalar op. """
GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
The dimensions along which to reduce is specified by the
`reduce_mask` that you pass to the constructor. The `reduce_mask` Parameters
is a tuple of booleans (actually integers 0 or 1) that specify for ----------
each input dimension, whether to reduce it (1) or not (0). reduce_mask
The dimensions along which to reduce. The `reduce_mask` is a tuple of
For example, when scalar_op is a theano.scalar.basic.Add instance: booleans (actually integers 0 or 1) that specify for each input
dimension, whether to reduce it (1) or not (0).
pre_scalar_op
If present, must be a scalar op with only 1 input. We will execute it
on the input value before reduction.
Examples
--------
When scalar_op is a theano.scalar.basic.Add instance:
- reduce_mask == (1,) sums a vector to a scalar - reduce_mask == (1,) sums a vector to a scalar
...@@ -588,8 +597,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -588,8 +597,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
- reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor. - reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor.
:note: any reduce_mask of all zeros is a sort of 'copy', and may Notes
be removed during graph optimization -----
Any reduce_mask of all zeros is a sort of 'copy', and may be removed during
graph optimization.
This Op is a work in progress. This Op is a work in progress.
...@@ -602,9 +613,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -602,9 +613,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
GPUs are not especially well-suited to reduction operations so it is GPUs are not especially well-suited to reduction operations so it is
quite possible that the GPU might be slower for some cases. quite possible that the GPU might be slower for some cases.
pre_scalar_op: if present, must be a scalar op with only 1
input. We will execute it on the input value before reduction.
""" """
_f16_ok = True _f16_ok = True
def __init__(self, scalar_op, axis=None, def __init__(self, scalar_op, axis=None,
...@@ -690,9 +700,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -690,9 +700,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
raise MethodNotDefined("") raise MethodNotDefined("")
def supports_c_code(self, inputs): def supports_c_code(self, inputs):
""" Returns True if the current op and reduce pattern """
has functioning C code """ Returns True if the current op and reduce pattern has functioning C code.
"""
# If we don't even have the right method, we certainly # If we don't even have the right method, we certainly
# don't support the C code # don't support the C code
# (This is the test that used to be implemented by # (This is the test that used to be implemented by
...@@ -871,9 +882,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -871,9 +882,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
return sio.getvalue() return sio.getvalue()
def _makecall(self, node, name, x, z, fail, pattern=None): def _makecall(self, node, name, x, z, fail, pattern=None):
"""Return a string for making a kernel call. """
Return a string for making a kernel call.
The return value looks something like: The return value looks something like:
.. code-block:: c .. code-block:: c
...@@ -972,7 +984,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -972,7 +984,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
def _k_decl(self, node, nodename, pattern=None, def _k_decl(self, node, nodename, pattern=None,
ndim=None, reduce_mask=None): ndim=None, reduce_mask=None):
"""Return a string to declare a kernel function """
Return a string to declare a kernel function.
The result will look something like this: The result will look something like this:
...@@ -989,8 +1002,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -989,8 +1002,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(out_dtype)s * Z, %(out_dtype)s * Z,
const int sZ0) const int sZ0)
Since the nodename is unique, we don't need to put the name Since the nodename is unique, we don't need to put the name
of the scalar_op in here. of the scalar_op in here.
""" """
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
...@@ -1057,6 +1070,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1057,6 +1070,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
Otherwise, check that the scalar op is maximum or minimum Otherwise, check that the scalar op is maximum or minimum
and return first_item. It should be the first element of the reduction. and return first_item. It should be the first element of the reduction.
As the maximum and minimum of the same value don't change, this works. As the maximum and minimum of the same value don't change, this works.
""" """
if hasattr(self.scalar_op, 'identity'): if hasattr(self.scalar_op, 'identity'):
return str(self.scalar_op.identity) return str(self.scalar_op.identity)
...@@ -1084,15 +1098,28 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1084,15 +1098,28 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
def _assign_reduce(self, node, name, left, right, sub, pre): def _assign_reduce(self, node, name, left, right, sub, pre):
""" """
node: the node argument to this op's c_code
name: the name argument to this op's c_code
left: a C code string identifying an lvalue
right: a C code string identifying an expression
sub: the sub argument to this op's c_code
pre: If True, we will add the pre_scalar_op.c_code
returns C code to reduce left and right, assigning the Parameters
result to left.""" ----------
node
The node argument to this op's c_code.
name
The name argument to this op's c_code.
left
A C code string identifying an lvalue.
right
A C code string identifying an expression.
sub
The sub argument to this op's c_code.
pre
If True, we will add the pre_scalar_op.c_code.
Returns
-------
str
C code to reduce left and right, assigning the result to left.
"""
x, = node.inputs x, = node.inputs
in_dtype = x.dtype in_dtype = x.dtype
...@@ -1125,8 +1152,11 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1125,8 +1152,11 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
""" """
WRITEME WRITEME
node, name, sub: these should be passed through from the original Parameters
call to c_code ----------
node, name, sub
These should be passed through from the original call to c_code.
""" """
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
...@@ -1274,9 +1304,11 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1274,9 +1304,11 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
def c_code_reduce_ccontig(self, sio, node, name, x, z, fail): def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
""" """
WRITEME WRITEME
IG: I believe, based on how this is called in c_code, that it IG: I believe, based on how this is called in c_code, that it
is for the case where we are reducing on all axes and x is is for the case where we are reducing on all axes and x is
C contiguous. C contiguous.
""" """
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
...@@ -1366,8 +1398,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1366,8 +1398,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
def c_code_reduce_01X(self, sio, node, name, x, z, fail, N): def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
""" """
:param N: the number of 1 in the pattern N=1 -> 01, N=2 -> 011 N=3 ->0111
Work for N=1,2,3 Parameters
----------
N
The number of 1s in the pattern: N=1 -> 01, N=2 -> 011, N=3 -> 0111.
Works for N=1,2,3.
""" """
assert N in [1, 2, 3] assert N in [1, 2, 3]
...@@ -2552,11 +2589,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2552,11 +2589,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
"""CAReduce that reuses the python code from gpuarray. """
CAReduce that reuses the python code from gpuarray.
Too slow for now as it only has a python interface. Too slow for now as it only has a python interface.
""" """
def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None): def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
if not hasattr(scalar_op, 'identity'): if not hasattr(scalar_op, 'identity'):
raise ValueError("No identity on scalar op") raise ValueError("No identity on scalar op")
......
""" Helper routines for generating gpu kernels for nvcc.
""" """
Helper routines for generating gpu kernels for nvcc.
def nvcc_kernel(name, params, body): """
"""Return the c code of a kernel function.
:param params: the parameters to the function as one or more strings def nvcc_kernel(name, params, body):
"""
Return the c code of a kernel function.
:param body: the [nested] list of statements for the body of the Parameters
function. These will be separated by ';' characters. ----------
params
The parameters to the function as one or more strings.
body
The [nested] list of statements for the body of the function.
These will be separated by ';' characters.
""" """
paramstr = ', '.join(params) paramstr = ', '.join(params)
...@@ -28,7 +34,10 @@ def nvcc_kernel(name, params, body): ...@@ -28,7 +34,10 @@ def nvcc_kernel(name, params, body):
def code_version(version): def code_version(version):
"""decorator to support version-based cache mechanism""" """
Decorator to support version-based cache mechanism.
"""
if not isinstance(version, tuple): if not isinstance(version, tuple):
raise TypeError('version must be tuple', version) raise TypeError('version must be tuple', version)
...@@ -42,22 +51,31 @@ UNVERSIONED = () ...@@ -42,22 +51,31 @@ UNVERSIONED = ()
@code_version((1,)) @code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn): def inline_reduce(N, buf, pos, count, manner_fn):
"""Return C++ code for a function that reduces a contiguous buffer. """
Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer Parameters
:param pos: index of executing thread ----------
:param count: number of executing threads N
Length of the buffer.
:param manner_fn: a function that accepts strings of arguments a buf
and b, and returns c code for their reduction. (Example: buffer pointer.
return "%(a)s + %(b)s" for a sum reduction). pos
Index of executing thread.
count
Number of executing threads.
manner_fn
A function that accepts strings of arguments a and b, and returns c code
for their reduction.
Example: return "%(a)s + %(b)s" for a sum reduction.
:postcondition: :postcondition:
This function leaves the answer in position 0 of the buffer. The This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function. rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times. Notes
-----
buf should be in gpu shared memory, we access it many times.
""" """
loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf)) loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
...@@ -126,19 +144,28 @@ def inline_reduce_prod(N, buf, pos, count): ...@@ -126,19 +144,28 @@ def inline_reduce_prod(N, buf, pos, count):
def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
""" """
:param N: length of the buffer Parameters
:param threadPos: index of executing thread ----------
:param threadCount: number of executing threads N
:param dtype: dtype of the softmax's output Length of the buffer.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
dtype
Dtype of the softmax's output.
:Precondition: buf and buf2 contain two identical copies of the input :Precondition: buf and buf2 contain two identical copies of the input
to softmax to softmax
:Postcondition: buf contains the softmax, buf2 contains un-normalized :Postcondition: buf contains the softmax, buf2 contains un-normalized
softmax softmax
:note: buf and buf2 should be in gpu shared memory, we access it many times Notes
-----
buf and buf2 should be in gpu shared memory, we access it many times.
We use __i as an int variable in a loop.
:note2: We use __i as an int variable in a loop
""" """
return [ return [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
...@@ -169,31 +196,48 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -169,31 +196,48 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_fn, manner_init, manner_fn, manner_init,
b='', stride_b='', load_b='', dtype='float32'): b='', stride_b='', load_b='', dtype='float32'):
"""Return C++ code for a function that reduces a contiguous buffer. """
Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(dtype) Parameters
:param x: input data ----------
:param stride_x: input data stride N
:param load_x: wrapper to read from x Length of the buffer.
:param pos: index of executing thread buf
:param count: number of executing threads Buffer pointer of size warpSize * sizeof(dtype).
:param b: Optional, pointer to the bias x
:param stride_b: Optional, the stride of b if b is provided Input data.
:param load_b: Optional, wrapper to read from b if b is provided stride_x
:param dtype: Optional, the dtype of the output Input data stride.
load_x
:param manner_fn: a function that accepts strings of arguments a Wrapper to read from x.
and b, and returns c code for their reduction. (Example: pos
return "%(a)s + %(b)s" for a sum reduction). Index of executing thread.
:param manner_init: a function that accepts strings of arguments a count
and return c code for its initialization Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
manner_fn
A function that accepts strings of arguments a and b, and returns c code
for their reduction.
Example: return "%(a)s + %(b)s" for a sum reduction.
manner_init
A function that accepts strings of arguments a and return c code for its
initialization.
:postcondition: :postcondition:
This function leaves the answer in position 0 of the buffer. The This function leaves the answer in position 0 of the buffer. The rest of the
rest of the buffer is trashed by this function. buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times. Notes
-----
buf should be in gpu shared memory, we access it many times.
""" """
if b: if b:
...@@ -270,28 +314,47 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -270,28 +314,47 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
dtype="float32"): dtype="float32"):
""" """
:param N: length of the buffer, at least warpSize (32). Parameters
:param buf: a shared memory buffer of size warpSize * sizeof(dtype) ----------
:param x: a ptr to the gpu memory where the row is stored N
:param stride_x: the stride between each element in x Length of the buffer, at least warpSize (32).
:param load_x: wrapper to read from x buf
:param sm: a ptr to the gpu memory to store the result A shared memory buffer of size warpSize * sizeof(dtype).
:param sm_stride: the stride between each sm element x
:param write_sm: wrapper before writing to sm A ptr to the gpu memory where the row is stored.
:param threadPos: index of executing thread stride_x
:param threadCount: number of executing threads The stride between each element in x.
:param b: Optional, pointer to the bias load_x
:param stride_b: Optional, the stride of b if b is provided Wrapper to read from x.
:param load_b: Optional, wrapper to read from b if b is provided sm
:param dtype: Optional, the dtype of the softmax's output if not float32 A ptr to the gpu memory to store the result.
sm_stride
The stride between each sm element.
write_sm
Wrapper before writing to sm.
threadPos
Index of executing thread.
threadCount
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the softmax's output if not float32.
:Precondition: buf is empty :Precondition: buf is empty
:Postcondition: buf[0] contains the softmax, :Postcondition: buf[0] contains the softmax, buf2 contains un-normalized
buf2 contains un-normalized softmax softmax
Notes
-----
buf should be in gpu shared memory, we access it many times.
:note: buf should be in gpu shared memory, we access it many times. We use tx as an int variable in a loop.
:note2: We use tx as an int variable in a loop
""" """
ret = [ ret = [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
......
...@@ -22,7 +22,9 @@ from .fp16_help import work_dtype, load_w, write_w ...@@ -22,7 +22,9 @@ from .fp16_help import work_dtype, load_w, write_w
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
""" """
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu. Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
""" """
nin = 3 nin = 3
nout = 3 nout = 3
__props__ = () __props__ = ()
...@@ -276,8 +278,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -276,8 +278,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
""" """
Implement CrossentropySoftmax1HotWithBiasDx on the gpu. Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
Gradient wrt x of the CrossentropySoftmax1Hot Op Gradient wrt x of the CrossentropySoftmax1Hot Op.
""" """
nin = 3 nin = 3
nout = 1 nout = 1
__props__ = () __props__ = ()
...@@ -462,7 +466,9 @@ gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasD ...@@ -462,7 +466,9 @@ gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasD
class GpuSoftmax (Op): class GpuSoftmax (Op):
""" """
Implement Softmax on the gpu. Implement Softmax on the gpu.
""" """
__props__ = () __props__ = ()
_f16_ok = True _f16_ok = True
...@@ -651,7 +657,9 @@ gpu_softmax = GpuSoftmax() ...@@ -651,7 +657,9 @@ gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (Op): class GpuSoftmaxWithBias (Op):
""" """
Implement SoftmaxWithBias on the gpu. Implement SoftmaxWithBias on the gpu.
""" """
nin = 2 nin = 2
nout = 1 nout = 1
__props__ = () __props__ = ()
......
...@@ -89,7 +89,9 @@ def safe_to_cpu(x): ...@@ -89,7 +89,9 @@ def safe_to_cpu(x):
def op_lifter(OP, cuda_only=False): def op_lifter(OP, cuda_only=False):
""" """
OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...)) OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...) gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)
""" """
def f(maker): def f(maker):
def local_opt(node): def local_opt(node):
...@@ -122,7 +124,10 @@ def op_lifter(OP, cuda_only=False): ...@@ -122,7 +124,10 @@ def op_lifter(OP, cuda_only=False):
class InputToGpuOptimizer(Optimizer): class InputToGpuOptimizer(Optimizer):
"Transfer the input to the gpu to start the rolling wave." """
Transfer the input to the gpu to start the rolling wave.
"""
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
...@@ -173,6 +178,7 @@ def local_gpuaalloc2(node): ...@@ -173,6 +178,7 @@ def local_gpuaalloc2(node):
Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...) Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...)
Moves an alloc that is an input to join to the gpu. Moves an alloc that is an input to join to the gpu.
""" """
if (isinstance(node.op, tensor.Alloc) and if (isinstance(node.op, tensor.Alloc) and
all(c != 'output' and all(c != 'output' and
...@@ -654,6 +660,7 @@ def local_gpu_conv(node): ...@@ -654,6 +660,7 @@ def local_gpu_conv(node):
gpu_from_host(conv) -> gpu_conv(gpu_from_host) gpu_from_host(conv) -> gpu_conv(gpu_from_host)
conv(host_from_gpu) -> host_from_gpu(gpu_conv) conv(host_from_gpu) -> host_from_gpu(gpu_conv)
""" """
def GpuConvOp_from_ConvOp(op): def GpuConvOp_from_ConvOp(op):
logical_img_hw = None logical_img_hw = None
...@@ -698,7 +705,8 @@ def local_gpu_conv(node): ...@@ -698,7 +705,8 @@ def local_gpu_conv(node):
return ret return ret
def values_eq_approx(a, b): def values_eq_approx(a, b):
"""This fct is needed to don't have DebugMode raise useless """
This fct is needed to don't have DebugMode raise useless
error due to rounding error. error due to rounding error.
This happen as We reduce on the two last dimensions, so this This happen as We reduce on the two last dimensions, so this
...@@ -736,7 +744,10 @@ register_opt()(conv_groupopt) ...@@ -736,7 +744,10 @@ register_opt()(conv_groupopt)
@register_opt("low_memory") @register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda]) @local_optimizer([GpuCAReduceCuda])
def local_gpu_elemwise_careduce(node): def local_gpu_elemwise_careduce(node):
""" Merge some GpuCAReduceCuda and GPUElemwise""" """
Merge some GpuCAReduceCuda and GPUElemwise.
"""
if (isinstance(node.op, GpuCAReduceCuda) and if (isinstance(node.op, GpuCAReduceCuda) and
node.op.pre_scalar_op is None and node.op.pre_scalar_op is None and
node.inputs[0].owner and node.inputs[0].owner and
...@@ -767,10 +778,11 @@ def tensor_to_gpu(x): ...@@ -767,10 +778,11 @@ def tensor_to_gpu(x):
def gpu_safe_new(x, tag=''): def gpu_safe_new(x, tag=''):
""" """
Internal function that constructs a new variable from x with the same Internal function that constructs a new variable from x with the same
type, but with a different name ( old name + tag). This function is used type, but with a different name (old name + tag). This function is used
by gradient, or the R-op to construct new variables for the inputs of by gradient, or the R-op to construct new variables for the inputs of
the inner graph such that there is no interference between the original the inner graph such that there is no interference between the original
graph and the newly constructed graph. graph and the newly constructed graph.
""" """
if hasattr(x, 'name') and x.name is not None: if hasattr(x, 'name') and x.name is not None:
nw_name = x.name + tag nw_name = x.name + tag
...@@ -788,8 +800,9 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -788,8 +800,9 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
""" """
Different interface to clone, that allows you to pass inputs. Different interface to clone, that allows you to pass inputs.
Compared to clone, this method always replaces the inputs with Compared to clone, this method always replaces the inputs with
new variables of the same type, and returns those ( in the same new variables of the same type, and returns those (in the same
order as the original inputs). order as the original inputs).
""" """
if tag is None: if tag is None:
tag = '' tag = ''
......
...@@ -163,12 +163,16 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -163,12 +163,16 @@ class GpuIncSubtensor(IncSubtensor):
""" """
Implement IncSubtensor on the gpu. Implement IncSubtensor on the gpu.
Note: The optimization to make this inplace is in tensor/opt. Notes
The same optimization handles IncSubtensor and GpuIncSubtensor. -----
This Op has c_code too; it inherits tensor.IncSubtensor's c_code. The optimization to make this inplace is in tensor/opt.
The helper methods like do_type_checking, copy_of_x, etc. specialize The same optimization handles IncSubtensor and GpuIncSubtensor.
the c_code for this Op. This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
The helper methods like do_type_checking, copy_of_x, etc. specialize
the c_code for this Op.
""" """
@property @property
def _f16_ok(self): def _f16_ok(self):
return self.iadd_node.op._f16_ok return self.iadd_node.op._f16_ok
...@@ -256,8 +260,10 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -256,8 +260,10 @@ class GpuIncSubtensor(IncSubtensor):
return d return d
def do_type_checking(self, node): def do_type_checking(self, node):
""" Should raise NotImplementedError if c_code does not support """
Should raise NotImplementedError if c_code does not support
the types involved in this node. the types involved in this node.
""" """
if not isinstance(node.inputs[0].type, GpuArrayType): if not isinstance(node.inputs[0].type, GpuArrayType):
...@@ -265,13 +271,22 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -265,13 +271,22 @@ class GpuIncSubtensor(IncSubtensor):
def copy_of_x(self, x): def copy_of_x(self, x):
""" """
:param x: a string giving the name of a C variable
pointing to an array
:return: C code expression to make a copy of x Parameters
----------
x
A string giving the name of a C variable pointing to an array.
Returns
-------
str
C code expression to make a copy of x.
Notes
-----
Base class uses `PyArrayObject *`, subclasses may override for
different types of arrays.
Base class uses `PyArrayObject *`, subclasses may override for
different types of arrays.
""" """
return """pygpu_copy(%(x)s, GA_ANY_ORDER)""" % locals() return """pygpu_copy(%(x)s, GA_ANY_ORDER)""" % locals()
...@@ -279,13 +294,18 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -279,13 +294,18 @@ class GpuIncSubtensor(IncSubtensor):
return "PyGpuArrayObject* zview = NULL;" return "PyGpuArrayObject* zview = NULL;"
def make_view_array(self, x, view_ndim): def make_view_array(self, x, view_ndim):
"""//TODO """
:param x: a string identifying an array to be viewed //TODO
:param view_ndim: a string specifying the number of dimensions
to have in the view Parameters
----------
x
A string identifying an array to be viewed.
view_ndim
A string specifying the number of dimensions to have in the view.
This doesn't need to actually set up the view with the This doesn't need to actually set up the view with the
right indexing; we'll do that manually later. right indexing; we'll do that manually later.
""" """
ret = """ ret = """
size_t dims[%(view_ndim)s]; size_t dims[%(view_ndim)s];
...@@ -305,18 +325,29 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -305,18 +325,29 @@ class GpuIncSubtensor(IncSubtensor):
return ret return ret
def get_helper_c_code_args(self): def get_helper_c_code_args(self):
""" Return a dictionary of arguments to use with helper_c_code""" """
Return a dictionary of arguments to use with helper_c_code.
"""
return {'c_prefix': 'PyGpuArray', return {'c_prefix': 'PyGpuArray',
'strides_mul': 1 'strides_mul': 1
} }
def copy_into(self, view, source): def copy_into(self, view, source):
""" """
view: string, C code expression for an array
source: string, C code expression for an array
returns a C code expression to copy source into view, and Parameters
return 0 on success ----------
view : string
C code expression for an array.
source : string
C code expression for an array.
Returns
-------
str
C code expression to copy source into view, and 0 on success.
""" """
return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals() return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()
...@@ -365,7 +396,9 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -365,7 +396,9 @@ class GpuIncSubtensor(IncSubtensor):
class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
""" """
def make_node(self, x, y, ilist): def make_node(self, x, y, ilist):
x_ = as_gpuarray_variable(x) x_ = as_gpuarray_variable(x)
y_ = as_gpuarray_variable(y) y_ = as_gpuarray_variable(y)
...@@ -454,9 +487,12 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): ...@@ -454,9 +487,12 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
"""Implement AdvancedIncSubtensor1 on the gpu, but use function """
Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent. only avail on compute capability 2.0 and more recent.
""" """
_f16_ok = True _f16_ok = True
def make_node(self, x, y, ilist): def make_node(self, x, y, ilist):
......
...@@ -217,10 +217,12 @@ class GpuArrayType(Type): ...@@ -217,10 +217,12 @@ class GpuArrayType(Type):
return (hash(self.typecode) ^ hash(self.broadcastable)) return (hash(self.typecode) ^ hash(self.broadcastable))
def dtype_specs(self): def dtype_specs(self):
"""Return a tuple (python type, c type, numpy typenum) that corresponds """
Return a tuple (python type, c type, numpy typenum) that corresponds
to self.dtype. to self.dtype.
This function is used internally as part of C code generation. This function is used internally as part of C code generation.
""" """
# TODO: add more type correspondances for e.g. int32, int64, float32, # TODO: add more type correspondances for e.g. int32, int64, float32,
# complex64, etc. # complex64, etc.
...@@ -386,7 +388,10 @@ GpuArrayType.SharedVariable = GpuArraySharedVariable ...@@ -386,7 +388,10 @@ GpuArrayType.SharedVariable = GpuArraySharedVariable
def gpuarray_shared_constructor(value, name=None, strict=False, def gpuarray_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False, allow_downcast=None, borrow=False,
broadcastable=None): broadcastable=None):
"""SharedVariable constructor for GpuArrayType""" """
SharedVariable constructor for GpuArrayType.
"""
if not isinstance(value, (numpy.ndarray, pygpu.gpuarray.GpuArray)): if not isinstance(value, (numpy.ndarray, pygpu.gpuarray.GpuArray)):
raise TypeError('ndarray or GpuArray required') raise TypeError('ndarray or GpuArray required')
......
...@@ -64,7 +64,7 @@ except ImportError: ...@@ -64,7 +64,7 @@ except ImportError:
class Hint(Op): class Hint(Op):
""" """
Provide arbitrary information to the optimizer Provide arbitrary information to the optimizer.
These ops are removed from the graph during canonicalization These ops are removed from the graph during canonicalization
in order to not interfere with other optimizations. in order to not interfere with other optimizations.
...@@ -122,7 +122,7 @@ def remove_hint_nodes(node): ...@@ -122,7 +122,7 @@ def remove_hint_nodes(node):
class HintsFeature(object): class HintsFeature(object):
""" """
FunctionGraph Feature to track matrix properties FunctionGraph Feature to track matrix properties.
This is a similar feature to variable 'tags'. In fact, tags are one way This is a similar feature to variable 'tags'. In fact, tags are one way
to provide hints. to provide hints.
...@@ -209,8 +209,12 @@ class HintsFeature(object): ...@@ -209,8 +209,12 @@ class HintsFeature(object):
class HintsOptimizer(Optimizer): class HintsOptimizer(Optimizer):
"""Optimizer that serves to add HintsFeature as an fgraph feature.
""" """
Optimizer that serves to add HintsFeature as an fgraph feature.
"""
def __init__(self): def __init__(self):
Optimizer.__init__(self) Optimizer.__init__(self)
...@@ -231,6 +235,7 @@ def psd(v): ...@@ -231,6 +235,7 @@ def psd(v):
""" """
Apply a hint that the variable `v` is positive semi-definite, i.e. Apply a hint that the variable `v` is positive semi-definite, i.e.
it is a symmetric matrix and :math:`x^T A x \ge 0` for any vector x. it is a symmetric matrix and :math:`x^T A x \ge 0` for any vector x.
""" """
return Hint(psd=True, symmetric=True)(v) return Hint(psd=True, symmetric=True)(v)
...@@ -294,6 +299,7 @@ def tag_solve_triangular(node): ...@@ -294,6 +299,7 @@ def tag_solve_triangular(node):
""" """
If a general solve() is applied to the output of a cholesky op, then If a general solve() is applied to the output of a cholesky op, then
replace it with a triangular solve. replace it with a triangular solve.
""" """
if node.op == solve: if node.op == solve:
if node.op.A_structure == 'general': if node.op.A_structure == 'general':
...@@ -396,12 +402,13 @@ def spectral_radius_bound(X, log2_exponent): ...@@ -396,12 +402,13 @@ def spectral_radius_bound(X, log2_exponent):
Returns upper bound on the largest eigenvalue of square symmetric matrix X. Returns upper bound on the largest eigenvalue of square symmetric matrix X.
log2_exponent must be a positive-valued integer. The larger it is, the log2_exponent must be a positive-valued integer. The larger it is, the
slower and tighter the bound. Values up to 5 should usually suffice. The slower and tighter the bound. Values up to 5 should usually suffice. The
algorithm works by multiplying X by itself this many times. algorithm works by multiplying X by itself this many times.
From V.Pan, 1990. "Estimating the Extremal Eigenvalues of a Symmetric From V.Pan, 1990. "Estimating the Extremal Eigenvalues of a Symmetric
Matrix", Computers Math Applic. Vol 20 n. 2 pp 17-22. Matrix", Computers Math Applic. Vol 20 n. 2 pp 17-22.
Rq: an efficient algorithm, not used here, is defined in this paper. Rq: an efficient algorithm, not used here, is defined in this paper.
""" """
if X.type.ndim != 2: if X.type.ndim != 2:
raise TypeError('spectral_radius_bound requires a matrix argument', X) raise TypeError('spectral_radius_bound requires a matrix argument', X)
......
""" """
This module provides the Scan Op This module provides the Scan Op.
Scanning is a general form of recurrence, which can be used for looping. Scanning is a general form of recurrence, which can be used for looping.
The idea is that you *scan* a function along some input sequence, producing The idea is that you *scan* a function along some input sequence, producing
an output at each time-step that can be seen (but not modified) by the an output at each time-step that can be seen (but not modified) by the
function at the next time-step. (Technically, the function can see the function at the next time-step. Technically, the function can see the
previous K time-steps of your outputs and L time steps (from the past and previous K time-steps of your outputs and L time steps (from the past and
future) of your inputs. future) of your inputs.
...@@ -26,6 +26,7 @@ the symbolic graph. ...@@ -26,6 +26,7 @@ the symbolic graph.
The Scan Op should typically be used by calling any of the following The Scan Op should typically be used by calling any of the following
functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``, functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``,
``foldr()``. ``foldr()``.
""" """
......
""" """
This module provides the Scan Op This module provides the Scan Op.
Scanning is a general form of recurrence, which can be used for looping. Scanning is a general form of recurrence, which can be used for looping.
The idea is that you *scan* a function along some input sequence, producing The idea is that you *scan* a function along some input sequence, producing
an output at each time-step that can be seen (but not modified) by the an output at each time-step that can be seen (but not modified) by the
function at the next time-step. (Technically, the function can see the function at the next time-step. Technically, the function can see the
previous K time-steps of your outputs and L time steps (from past and previous K time-steps of your outputs and L time steps (from past and
future) of your inputs. future) of your inputs.
...@@ -32,6 +32,7 @@ host at each step ...@@ -32,6 +32,7 @@ host at each step
The Scan Op should typically be used by calling any of the following The Scan Op should typically be used by calling any of the following
functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``, functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``,
``foldr()``. ``foldr()``.
""" """
__docformat__ = 'restructedtext en' __docformat__ = 'restructedtext en'
__authors__ = ("Razvan Pascanu " __authors__ = ("Razvan Pascanu "
...@@ -76,7 +77,9 @@ def scan(fn, ...@@ -76,7 +77,9 @@ def scan(fn,
This function constructs and applies a Scan op to the provided This function constructs and applies a Scan op to the provided
arguments. arguments.
:param fn: Parameters
----------
fn
``fn`` is a function that describes the operations involved in one ``fn`` is a function that describes the operations involved in one
step of ``scan``. ``fn`` should construct variables describing the step of ``scan``. ``fn`` should construct variables describing the
output of one iteration step. It should expect as input theano output of one iteration step. It should expect as input theano
...@@ -167,7 +170,7 @@ def scan(fn, ...@@ -167,7 +170,7 @@ def scan(fn,
number of steps ) is still required even though a condition is number of steps ) is still required even though a condition is
passed (and it is used to allocate memory if needed). passed (and it is used to allocate memory if needed).
:param sequences: sequences
``sequences`` is the list of Theano variables or dictionaries ``sequences`` is the list of Theano variables or dictionaries
describing the sequences ``scan`` has to iterate over. If a describing the sequences ``scan`` has to iterate over. If a
sequence is given as wrapped in a dictionary, then a set of optional sequence is given as wrapped in a dictionary, then a set of optional
...@@ -185,8 +188,7 @@ def scan(fn, ...@@ -185,8 +188,7 @@ def scan(fn,
Any Theano variable in the list ``sequences`` is automatically Any Theano variable in the list ``sequences`` is automatically
wrapped into a dictionary where ``taps`` is set to ``[0]`` wrapped into a dictionary where ``taps`` is set to ``[0]``
outputs_info
:param outputs_info:
``outputs_info`` is the list of Theano variables or dictionaries ``outputs_info`` is the list of Theano variables or dictionaries
describing the initial state of the outputs computed describing the initial state of the outputs computed
recurrently. When this initial states are given as dictionary recurrently. When this initial states are given as dictionary
...@@ -243,15 +245,13 @@ def scan(fn, ...@@ -243,15 +245,13 @@ def scan(fn,
raised (because there is no convention on how scan should map raised (because there is no convention on how scan should map
the provided information to the outputs of ``fn``) the provided information to the outputs of ``fn``)
non_sequences
:param non_sequences:
``non_sequences`` is the list of arguments that are passed to ``non_sequences`` is the list of arguments that are passed to
``fn`` at each steps. One can opt to exclude variable ``fn`` at each steps. One can opt to exclude variable
used in ``fn`` from this list as long as they are part of the used in ``fn`` from this list as long as they are part of the
computational graph, though for clarity we encourage not to do so. computational graph, though for clarity we encourage not to do so.
n_steps
:param n_steps:
``n_steps`` is the number of steps to iterate given as an int ``n_steps`` is the number of steps to iterate given as an int
or Theano scalar. If any of the input sequences do not have or Theano scalar. If any of the input sequences do not have
enough elements, scan will raise an error. If the *value is 0* the enough elements, scan will raise an error. If the *value is 0* the
...@@ -261,8 +261,7 @@ def scan(fn, ...@@ -261,8 +261,7 @@ def scan(fn,
in time. If ``n_steps`` is not provided, ``scan`` will figure in time. If ``n_steps`` is not provided, ``scan`` will figure
out the amount of steps it should run given its input sequences. out the amount of steps it should run given its input sequences.
truncate_gradient
:param truncate_gradient:
``truncate_gradient`` is the number of steps to use in truncated ``truncate_gradient`` is the number of steps to use in truncated
BPTT. If you compute gradients through a scan op, they are BPTT. If you compute gradients through a scan op, they are
computed using backpropagation through time. By providing a computed using backpropagation through time. By providing a
...@@ -270,16 +269,14 @@ def scan(fn, ...@@ -270,16 +269,14 @@ def scan(fn,
of classical BPTT, where you go for only ``truncate_gradient`` of classical BPTT, where you go for only ``truncate_gradient``
number of steps back in time. number of steps back in time.
go_backwards
:param go_backwards:
``go_backwards`` is a flag indicating if ``scan`` should go ``go_backwards`` is a flag indicating if ``scan`` should go
backwards through the sequences. If you think of each sequence backwards through the sequences. If you think of each sequence
as indexed by time, making this flag True would mean that as indexed by time, making this flag True would mean that
``scan`` goes back in time, namely that for any sequence it ``scan`` goes back in time, namely that for any sequence it
starts from the end and goes towards 0. starts from the end and goes towards 0.
name
:param name:
When profiling ``scan``, it is crucial to provide a name for any When profiling ``scan``, it is crucial to provide a name for any
instance of ``scan``. The profiler will produce an overall instance of ``scan``. The profiler will produce an overall
profile of your code as well as profiles for the computation of profile of your code as well as profiles for the computation of
...@@ -287,7 +284,7 @@ def scan(fn, ...@@ -287,7 +284,7 @@ def scan(fn,
appears in those profiles and can greatly help to disambiguate appears in those profiles and can greatly help to disambiguate
information. information.
:param mode: mode
It is recommended to leave this argument to None, especially It is recommended to leave this argument to None, especially
when profiling ``scan`` (otherwise the results are not going to when profiling ``scan`` (otherwise the results are not going to
be accurate). If you prefer the computations of one step of be accurate). If you prefer the computations of one step of
...@@ -296,7 +293,7 @@ def scan(fn, ...@@ -296,7 +293,7 @@ def scan(fn,
loop are done (see ``theano.function`` for details about loop are done (see ``theano.function`` for details about
possible values and their meaning). possible values and their meaning).
:param profile: profile
Flag or string. If true, or different from the empty string, a Flag or string. If true, or different from the empty string, a
profile object will be created and attached to the inner graph of profile object will be created and attached to the inner graph of
scan. In case ``profile`` is True, the profile object will have the scan. In case ``profile`` is True, the profile object will have the
...@@ -305,18 +302,21 @@ def scan(fn, ...@@ -305,18 +302,21 @@ def scan(fn,
inner graph with the new cvm linker ( with default modes, inner graph with the new cvm linker ( with default modes,
other linkers this argument is useless) other linkers this argument is useless)
:rtype: tuple Returns
:return: tuple of the form (outputs, updates); ``outputs`` is either a -------
Theano variable or a list of Theano variables representing the tuple
outputs of ``scan`` (in the same order as in Tuple of the form (outputs, updates); ``outputs`` is either a
``outputs_info``). ``updates`` is a subclass of dictionary Theano variable or a list of Theano variables representing the
specifying the outputs of ``scan`` (in the same order as in
update rules for all shared variables used in scan ``outputs_info``). ``updates`` is a subclass of dictionary
This dictionary should be passed to ``theano.function`` when specifying the
you compile your function. The change compared to a normal update rules for all shared variables used in scan
dictionary is that we validate that keys are SharedVariable This dictionary should be passed to ``theano.function`` when
and addition of those dictionary are validated to be consistent. you compile your function. The change compared to a normal
""" dictionary is that we validate that keys are SharedVariable
and addition of those dictionary are validated to be consistent.
"""
# Note : see the internal documentation of the scan op for naming # Note : see the internal documentation of the scan op for naming
# conventions and all other details # conventions and all other details
if options is None: if options is None:
...@@ -544,6 +544,7 @@ def one_step_scan(fn, ...@@ -544,6 +544,7 @@ def one_step_scan(fn,
truncate_gradient): truncate_gradient):
""" """
This function is evaluated if `n_steps` evaluates to either 1 or -1. This function is evaluated if `n_steps` evaluates to either 1 or -1.
""" """
# 1. Grab slices of sequences # 1. Grab slices of sequences
inputs_slices = [input[0] for input in inputs] inputs_slices = [input[0] for input in inputs]
......
""" """
This module provides the Scan Op This module provides the Scan Op.
See scan.py for details on scan.
See scan.py for details on scan
""" """
from __future__ import print_function from __future__ import print_function
...@@ -157,25 +158,32 @@ class ScanOp(PureOp): ...@@ -157,25 +158,32 @@ class ScanOp(PureOp):
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling):
""" """
:param node: the Apply node returned by the ``make_node`` function
of the scan op class
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to Parameters
reuse memory allocated by a previous call. ----------
node
The Apply node returned by the ``make_node`` function of the scan
op class.
storage_map
dict variable -> one-element-list where a computed value for this
variable may be found.
compute_map
dict variable -> one-element-list where a boolean value will be
found. The boolean indicates whether the variable's storage_map
container contains a valid value (True) or if it has not been
computed yet (False).
no_recycling
List of variables for which it is forbidden to reuse memory
allocated by a previous call.
Notes
-----
If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
""" """
# 1. Collect all memory buffers # 1. Collect all memory buffers
node_input_storage = [storage_map[r] for r in node.inputs] node_input_storage = [storage_map[r] for r in node.inputs]
......
""" """
This module provides utility functions for the Scan Op This module provides utility functions for the Scan Op.
See scan.py for details on scan.
See scan.py for details on scan
""" """
from __future__ import print_function from __future__ import print_function
__docformat__ = 'restructedtext en' __docformat__ = 'restructedtext en'
...@@ -41,8 +42,11 @@ def expand(tensor_var, size): ...@@ -41,8 +42,11 @@ def expand(tensor_var, size):
``tensor_var``, namely: ``tensor_var``, namely:
rval[:d1] = tensor_var rval[:d1] = tensor_var
:param tensor_var: Theano tensor variable Parameters
:param size: int ----------
tensor_var : Theano tensor variable.
size : int
""" """
# Corner case that I might use in an optimization # Corner case that I might use in an optimization
if size == 0: if size == 0:
...@@ -57,7 +61,8 @@ def expand(tensor_var, size): ...@@ -57,7 +61,8 @@ def expand(tensor_var, size):
def to_list(ls): def to_list(ls):
""" """
Converts ``ls`` to list if it is a tuple, or wraps ``ls`` into a list if Converts ``ls`` to list if it is a tuple, or wraps ``ls`` into a list if
it is not a list already it is not a list already.
""" """
if isinstance(ls, (list, tuple)): if isinstance(ls, (list, tuple)):
return list(ls) return list(ls)
...@@ -70,7 +75,9 @@ class until(object): ...@@ -70,7 +75,9 @@ class until(object):
Theano can end on a condition. In order to differentiate this condition Theano can end on a condition. In order to differentiate this condition
from the other outputs of scan, this class is used to wrap the condition from the other outputs of scan, this class is used to wrap the condition
around it. around it.
""" """
def __init__(self, condition): def __init__(self, condition):
self.condition = tensor.as_tensor_variable(condition) self.condition = tensor.as_tensor_variable(condition)
assert self.condition.ndim == 0 assert self.condition.ndim == 0
...@@ -78,10 +85,12 @@ class until(object): ...@@ -78,10 +85,12 @@ class until(object):
def get_updates_and_outputs(ls): def get_updates_and_outputs(ls):
""" """
Parses the list ``ls`` into outputs and updates. The semantics Parses the list ``ls`` into outputs and updates.
of ``ls`` is defined by the constructive function of scan.
The semantics of ``ls`` is defined by the constructive function of scan.
The elements of ``ls`` are either a list of expressions representing the The elements of ``ls`` are either a list of expressions representing the
outputs/states, a dictionary of updates or a condition. outputs/states, a dictionary of updates or a condition.
""" """
def is_list_outputs(elem): def is_list_outputs(elem):
if (isinstance(elem, (list, tuple)) and if (isinstance(elem, (list, tuple)) and
...@@ -150,23 +159,23 @@ def get_updates_and_outputs(ls): ...@@ -150,23 +159,23 @@ def get_updates_and_outputs(ls):
def clone(output, replace=None, strict=True, share_inputs=True): def clone(output, replace=None, strict=True, share_inputs=True):
""" """
Function that allows replacing subgraphs of a computational Function that allows replacing subgraphs of a computational graph.
graph. It returns a copy of the initial subgraph with the corresponding
It returns a copy of the initial subgraph with the corresponding
substitutions. substitutions.
:type output: Theano Variables (or Theano expressions) Parameters
:param outputs: Theano expression that represents the computational ----------
graph output : Theano Variables (or Theano expressions)
Theano expression that represents the computational graph.
:type replace: dict replace : dict
:param replace: dictionary describing which subgraphs should be Dictionary describing which subgraphs should be replaced by what.
replaced by what share_inputs : bool
If True, use the same inputs (and shared variables) as the original
graph. If False, clone them. Note that cloned shared variables still
use the same underlying storage, so they will always have the same
value.
:type share_inputs: bool
:param share_inputs: If True, use the same inputs (and shared variables)
as the original graph. If False, clone them. Note that cloned
shared variables still use the same underlying storage, so they
will always have the same value.
""" """
inps, outs, other_stuff = rebuild_collect_shared(output, inps, outs, other_stuff = rebuild_collect_shared(output,
[], [],
...@@ -189,6 +198,7 @@ def canonical_arguments(sequences, ...@@ -189,6 +198,7 @@ def canonical_arguments(sequences,
Mainly it makes sure that arguments are given as lists of dictionaries, Mainly it makes sure that arguments are given as lists of dictionaries,
and that the different fields of a dictionary are set to default and that the different fields of a dictionary are set to default
value if the user has not provided any. value if the user has not provided any.
""" """
states_info = to_list(outputs_info) states_info = to_list(outputs_info)
parameters = [tensor.as_tensor_variable(x) for x in to_list(non_sequences)] parameters = [tensor.as_tensor_variable(x) for x in to_list(non_sequences)]
...@@ -303,13 +313,14 @@ def canonical_arguments(sequences, ...@@ -303,13 +313,14 @@ def canonical_arguments(sequences,
def infer_shape(outs, inputs, input_shapes): def infer_shape(outs, inputs, input_shapes):
''' """
Compute the shape of the outputs given the shape of the inputs Compute the shape of the outputs given the shape of the inputs
of a theano graph. of a theano graph.
We do it this way to avoid compiling the inner function just to get We do it this way to avoid compiling the inner function just to get the
the shape. Changes to ShapeFeature could require changes in this function. shape. Changes to ShapeFeature could require changes in this function.
'''
"""
# We use a ShapeFeature because it has all the necessary logic # We use a ShapeFeature because it has all the necessary logic
# inside. We don't use the full ShapeFeature interface, but we # inside. We don't use the full ShapeFeature interface, but we
# let it initialize itself with an empty fgraph, otherwise we will # let it initialize itself with an empty fgraph, otherwise we will
...@@ -326,9 +337,10 @@ def infer_shape(outs, inputs, input_shapes): ...@@ -326,9 +337,10 @@ def infer_shape(outs, inputs, input_shapes):
shape_feature.set_shape(inp, inp_shp) shape_feature.set_shape(inp, inp_shp)
def local_traverse(out): def local_traverse(out):
''' """
Go back in the graph, from out, adding computable shapes to shape_of. Go back in the graph, from out, adding computable shapes to shape_of.
'''
"""
if out in shape_feature.shape_of: if out in shape_feature.shape_of:
# Its shape is already known # Its shape is already known
...@@ -358,14 +370,17 @@ def allocate_memory(T, y_info, y): ...@@ -358,14 +370,17 @@ def allocate_memory(T, y_info, y):
""" """
Allocates memory for an output of scan. Allocates memory for an output of scan.
:param T: scalar Parameters
Variable representing the number of steps scan will run ----------
:param y_info: dict T : scalar
Variable representing the number of steps scan will run.
y_info : dict
Dictionary describing the output (more specifically describing shape Dictionary describing the output (more specifically describing shape
information for the output information for the output).
:param y: Tensor variable y : Tensor variable
Expression describing the computation resulting in out entry of y. Expression describing the computation resulting in out entry of y.
It can be used to infer the shape of y It can be used to infer the shape of y.
""" """
if 'shape' in y_info: if 'shape' in y_info:
return tensor.zeros([T, ] + list(y_info['shape']), return tensor.zeros([T, ] + list(y_info['shape']),
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论