Merge pull request #3297 from harlouci/numpydoc_sandbox_2

Numpydoc sandbox 2

Merge pull request #3297 from harlouci/numpydoc_sandbox_2
931f4e93 · abergeron · d1eba87d · 533f5dab · 931f4e93 · 931f4e93
--- a/theano/sandbox/cuda/GpuConv3D.py
+++ b/theano/sandbox/cuda/GpuConv3D.py
@@ -12,7 +12,10 @@ from theano.sandbox.cuda import CudaNdarrayType, GpuOp
 class GpuConv3D(GpuOp):
-    """ GPU implementation of Conv3D """
+    """
+    GPU implementation of Conv3D.
+    """
    def __eq__(self, other):
        return type(self) == type(other)
@@ -25,10 +28,18 @@ class GpuConv3D(GpuOp):
    def make_node(self, V, W, b, d):
        """
-            :param V: Visible unit, input
-            :param W: Weights, filter
+        Parameters
-            :param b: bias
+        ----------
-            :param d: strides when moving the filter over the input
+        V
+            Visible unit, input.
+        W
+            Weights, filter.
+        b
+            Bias.
+        d
+            Strides when moving the filter over the input.
        """
        V_ = as_cuda_ndarray_variable(V)
        W_ = as_cuda_ndarray_variable(W)

--- a/theano/sandbox/cuda/GpuConvGrad3D.py
+++ b/theano/sandbox/cuda/GpuConvGrad3D.py
@@ -15,14 +15,25 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
 class GpuConvGrad3D(GpuOp):
-    """ GPU version of gradient of ConvGrad3D with respect to W """
+    """
+    GPU version of gradient of ConvGrad3D with respect to W.
+    """
    def make_node(self, V, d, WShape, dCdH):
        """
-        :param V: visible
-        :param d: strides
+        Parameters
-        :param WShape: shapes of the weights -> shape of this op output
+        ----------
-        :param dCdH: other input with what V will be convolved.
+        V
+            Visible.
+        d
+            Strides.
+        WShape
+            Shapes of the weights -> shape of this op output.
+        dCdH
+            Other input with which V will be convolved.
        """
        V_ = as_cuda_ndarray_variable(V)
        d_ = T.as_tensor_variable(d)

--- a/theano/sandbox/cuda/GpuConvTransp3D.py
+++ b/theano/sandbox/cuda/GpuConvTransp3D.py
@@ -16,7 +16,11 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
 class GpuConvTransp3D(GpuOp):
-    """ The gpu version of ConvTransp3D """
+    """
+    The gpu version of ConvTransp3D.
+    """
    def __eq__(self, other):
        return type(self) == type(other)

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -94,12 +94,14 @@ cuda_enabled = False
 # Code factorized within a function so that it may be called from multiple
 # places (which is not currently the case, but may be useful in the future).
 def set_cuda_disabled():
-    """Function used to disable cuda.
+    """
+    Function used to disable cuda.
    A warning is displayed, so that the user is aware that cuda-based code is
    not going to work.
    Note that there is no point calling this function from outside of
    `cuda.__init__`, since it has no effect once the module is loaded.
    """
    global cuda_available, cuda_warning_is_displayed
    cuda_available = False
@@ -116,8 +118,9 @@ libcuda_ndarray_so = os.path.join(cuda_ndarray_loc,
 def try_import():
    """
-    load the cuda_ndarray module if present and up to date
+    Load the cuda_ndarray module if present and up to date.
-    return True if loaded correctly, otherwise return False
+    Return True if loaded correctly, otherwise return False.
    """
    cuda_files = (
        'cuda_ndarray.cu',
@@ -219,6 +222,7 @@ if cuda_available:
    def ok():
        """
        Check if an existing library exists and can be read.
        """
        try:
            open(libcuda_ndarray_so).close()
@@ -266,6 +270,7 @@ class GpuOp(theano.gof.Op):
    It is defined in __init__.py so that it exists even when `cuda_available`
    is False (this is necessary to avoid breaking the test suite).
    """
    def make_thunk(self, node, storage_map, compute_map, no_recycling):
@@ -331,18 +336,23 @@ def use(device,
        test_driver=True):
    """
    Error and warning about CUDA should be displayed only when this
-    function is called.  We need to be able to load this module only
+    function is called. We need to be able to load this module only
    to check if it is available!
-    :param device: string "cpu", "gpu", "gpuN" (N is the device number to use)
+    Parameters
-    :param force: Will always raise an exception if we can't use the gpu.
+    ----------
-    :param default_to_move_computation_to_gpu: If gpu init succeeded, enable by
+    device : string
-                                               default optimizations to move
+        "cpu", "gpu", "gpuN" (N is the device number to use).
-                                               computations to the gpu
+    force
-    :param move_shared_float32_to_gpu: If gpu init succeeded, put new shared
+        Will always raise an exception if we can't use the gpu.
-                                       variables in float32 on the gpu.
+    default_to_move_computation_to_gpu
-    :param enable_cuda: If the gpu is correctly enabled,
+        If gpu init succeeded, enable by default optimizations to move
-                        set the variable cuda_enabled to True.
+        computations to the gpu.
+    move_shared_float32_to_gpu
+        If gpu init succeeded, put new shared variables in float32 on the gpu.
+    enable_cuda
+        If the gpu is correctly enabled, set the variable cuda_enabled to True.
    """
    global cuda_enabled, cuda_initialization_error_message
    if force and not cuda_available and device.startswith('gpu'):
@@ -480,7 +490,7 @@ use.device_number = None
 def unuse():
    """
-    This undo what was done by the call to
+    This undoes what was done by the call to
    use('gpu[0-9]', default_to_move_computation_to_gpu=True,
        move_shared_float32_to_gpu=True,
@@ -488,7 +498,9 @@ def unuse():
    This is used in Pylearn2 tests to enable/disable the GPU when needed.
-    After this call, the rest of Theano think the GPU shouldn't be used by default.
+    After this call, the rest of Theano thinks the GPU shouldn't be used by
+    default.
    """
    global cuda_enabled
    cuda_enabled = False
@@ -502,9 +514,11 @@ def unuse():
 def handle_shared_float32(tf):
-    """Set the default shared type for float32 tensor to CudaNdarrayType
+    """
+    Set the default shared type for float32 tensor to CudaNdarrayType.
    This function is intended to be called from use(gpu_index), not directly.
    """
    if tf:
        theano.compile.shared_constructor(float32_shared_constructor)

--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
-"""This file implement 3 different version of the elemwise op on the
+"""
+This file implements 3 different versions of the elemwise op on the
 gpu. Only NaiveAlgo is used and it is not very naive now.
 The elemwise fct are also used with scalar operation! So it can happen
@@ -40,12 +41,25 @@ def get_str_list_logical_scalar(node, value_str='ii_i%i_value',
 class SupportCodeError(Exception):
-    """It is currently not possible to auto-generate a GPU implementation for
+    """
+    It is currently not possible to auto-generate a GPU implementation for
    an elementwise Op with c_support_code_apply().
-    But we support Op.c_support_code."""
+    But we support Op.c_support_code.
+    """
 class NaiveAlgo(object):
+    """
+    Parameters
+    ----------
+    scalar_op
+        The scalar operation to execute on each element.
+    sync
+        If True, will wait after the kernel launch and check for error call.
+    """
    verbose = 0  # 1, 2 or 3 for more verbose output.
    @property
@@ -57,10 +71,6 @@ class NaiveAlgo(object):
            return ver
    def __init__(self, scalar_op, sync=True, inplace_pattern=None):
-        """
-        :param scalar_op: the scalar operation to execute on each element.
-        :param sync: if True, will wait after the kernel launch and check for error call.
-        """
        if inplace_pattern is None:
            inplace_pattern = {}
        try:
@@ -154,8 +164,10 @@ class NaiveAlgo(object):
        return sio.getvalue()
    def c_src_kernel_tiling(self, node, nodename):
-        """ The kernel applies to problems with <= 5 dimensions """
+        """
+        The kernel applies to problems with <= 5 dimensions.
+        """
        # The kernel is intended to be structured roughly like this:
        """
        static __global__ void kernel()
@@ -278,8 +290,10 @@ class NaiveAlgo(object):
        return sio.getvalue()
    def c_src_kernel_tiling_less_registers(self, node, nodename):
-        """ The kernel applies to problems with <= 5 dimensions """
+        """
+        The kernel applies to problems with <= 5 dimensions.
+        """
        nd = node.outputs[0].type.ndim
        n_in = len(node.inputs)
        n_out = len(node.outputs)
@@ -1049,12 +1063,16 @@ class ErfinvGPU(Erfinv):
    """
    Provides a c-code implementation of the inverse error function for GPU.
-    Note: We do not add this c_code to theano.scalar.basic_scipy.Erfinv, as we
+    Notes
+    -----
+    We do not add this c_code to theano.scalar.basic_scipy.Erfinv, as we
    currently rely on Nvidia's cublas library to provide the erfinv
    c-implementation (which requires different c_headers). As it stands,
    theano.scalar.basic_scipy.Erfinv does not have c_code as scipy does not
-    export the required C function
+    export the required C function.
    """
    def c_headers(self):
        return ['math_functions.h', 'cublas_v2.h']
@@ -1070,14 +1088,19 @@ erfinv_gpu = ErfinvGPU(upgrade_to_float_no_complex, name='erfinv_gpu')
 class ErfcxGPU(Erfinv):
    """
-    Provides a c-code implementation of the scaled complementary error function for GPU.
+    Provides a c-code implementation of the scaled complementary error function
+    for GPU.
-    Note: We do not add this c_code to theano.scalar.basic_scipy.Erfcx, as we
+    Notes
+    -----
+    We do not add this c_code to theano.scalar.basic_scipy.Erfcx, as we
    currently rely on Nvidia's cublas library to provide the erfcx
    c-implementation (which requires different c_headers). As it stands,
    theano.scalar.basic_scipy.Erfcx does not have c_code as scipy does not
-    export the required C function
+    export the required C function.
    """
    def c_headers(self):
        return ['math_functions.h', 'cublas_v2.h']
@@ -1088,4 +1111,4 @@ class ErfcxGPU(Erfinv):
            raise NotImplementedError('type not supported', type)
        return "%(z)s = erfcx(%(x)s);" % locals()
 erfcx_gpu = ErfcxGPU(upgrade_to_float_no_complex, name='erfcx_gpu')
\ No newline at end of file
--- a/theano/sandbox/cuda/extra_ops.py
+++ b/theano/sandbox/cuda/extra_ops.py
@@ -13,13 +13,19 @@ if cuda_available:
 class GpuCumsum(CumsumOp, GpuOp):
+    """
+    Parameters
+    ----------
+    axis
+        Cannot be None. If you want the array flattened, do it beforehand.
+    """
    SUPPORTED_NDIMS = 3
    __props__ = ('axis', 'max_threads_dim0', 'max_grid_size1', 'max_grid_size2')
    def __init__(self, axis):
-        """
-        ``axis`` can not be None. If you want the array flatten, do it before.
-        """
        self.axis = axis
        self.max_threads_dim0 = None
        self.max_grid_size1 = None
@@ -415,11 +421,13 @@ class GpuCumsum(CumsumOp, GpuOp):
 def values_eq_approx_high_tol(a, b):
-    """This fct is needed to don't have DebugMode raise useless
+    """
+    This function is needed so that DebugMode does not raise useless
    error due to rounding error.
    This happen with big input size due to change in the order of
    operation.
    """
    rtol = None
    if a.size > 100000:

--- a/theano/sandbox/cuda/fftconv.py
+++ b/theano/sandbox/cuda/fftconv.py
@@ -171,10 +171,11 @@ class CuIFFTOp(ScikitsCudaOp):
 def to_complex_gpuarray(x, copyif=False):
    """
-    adapted version of theano.misc.pycuda_utils.to_gpuarray that takes
+    Adapted version of theano.misc.pycuda_utils.to_gpuarray that takes
    an array with an extra trailing dimension of length 2 for
    real/imaginary parts, and turns it into a complex64 PyCUDA
    GPUArray.
    """
    if not isinstance(x, CudaNdarray):
        raise ValueError("We can transfer only CudaNdarray "
@@ -213,7 +214,8 @@ def bptrs(a):
    """
    Pointer array when input represents a batch of matrices.
-    taken from scikits.cuda tests/test_cublas.py
+    Taken from scikits.cuda tests/test_cublas.py.
    """
    return pycuda.gpuarray.arange(a.ptr, a.ptr + a.shape[0] * a.strides[0],
                                  a.strides[0], dtype=cublas.ctypes.c_void_p)
@@ -222,8 +224,9 @@ def bptrs(a):
 def sc_complex_dot_batched(bx_gpu, by_gpu, bc_gpu, transa='N', transb='N',
                           handle=None):
    """
-    uses cublasCgemmBatched to compute a bunch of complex dot products
+    Uses cublasCgemmBatched to compute a bunch of complex dot products
-    in parallel
+    in parallel.
    """
    if handle is None:
        handle = scikits.cuda.misc._global_cublas_handle
@@ -292,7 +295,9 @@ class BatchedComplexDotOp(ScikitsCudaOp):
    """
    This version uses cublasCgemmBatched under the hood, instead of
    doing multiple cublasCgemm calls.
    """
    def make_node(self, inp1, inp2):
        inp1 = basic_ops.gpu_contiguous(
            basic_ops.as_cuda_ndarray_variable(inp1))
@@ -355,10 +360,15 @@ batched_complex_dot = BatchedComplexDotOp()
 def mult_and_reduce(input_fft_v, filters_fft_v, input_shape=None,
                    filter_shape=None):
    """
-    input_fft_v is (b, ic, i0, i1//2 + 1, 2)
-    filters_fft_v is (oc, ic, i0, i1//2 + 1, 2)
-    """
+    Parameters
+    ----------
+    input_fft_v 
+        It's (b, ic, i0, i1//2 + 1, 2).
+    filters_fft_v
+        It's (oc, ic, i0, i1//2 + 1, 2).
+    """
    if input_shape is None:
        input_shape = input_fft_v.shape  # symbolic
@@ -405,16 +415,19 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
    On valid mode the filters must be smaller than the input.
-    input: (b, ic, i0, i1)
+    Parameters
-    filters: (oc, ic, f0, f1)
+    ----------
+    input
-    border_mode: 'valid' of 'full'
+        (b, ic, i0, i1).
+    filters
+        (oc, ic, f0, f1).
+    border_mode : {'valid', 'full'}
+    pad_last_dim
+        Unconditionally pad the last dimension of the input
+        to turn it from odd to even. Will strip the
+        padding before returning the result.
-    pad_last_dim: Unconditionally pad the last dimension of the input
-                  to to turn it from odd to even.  Will strip the
-                  padding before returning the result.
    """
    # use symbolic shapes to compute shape info at runtime if not specified
    if image_shape is None:
        image_shape = input.shape
@@ -546,16 +559,19 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
    On valid mode the filters must be smaller than the input.
-    input: (b, ic, i0, i1, i2)
+    Parameters
-    filters: (oc, ic, f0, f1, i2)
+    ----------
+    input
-    border_mode: 'valid' of 'full'
+        (b, ic, i0, i1, i2).
+    filters
+        (oc, ic, f0, f1, i2).
+    border_mode : {'valid', 'full'}
+    pad_last_dim
+        Unconditionally pad the last dimension of the input
+        to turn it from odd to even. Will strip the
+        padding before returning the result.
-    pad_last_dim: Unconditionally pad the last dimension of the input
-                  to to turn it from odd to even.  Will strip the
-                  padding before returning the result.
    """
    # use symbolic shapes to compute shape info at runtime if not specified
    if image_shape is None:
        image_shape = input.shape
@@ -670,5 +686,3 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
    # output should now be the result of a batched valid convolution
    # of the input with the filters.
    return basic_ops.as_cuda_ndarray_variable(output)
--- a/theano/sandbox/cuda/kernel_codegen.py
+++ b/theano/sandbox/cuda/kernel_codegen.py
-""" Helper routines for generating gpu kernels for nvcc.
 """
+Helper routines for generating gpu kernels for nvcc.
+"""
-def nvcc_kernel(name, params, body):
-    """Return the c code of a kernel function.
-    :param params: the parameters to the function as one or more strings
+def nvcc_kernel(name, params, body):
+    """
+    Return the c code of a kernel function.
-    :param body: the [nested] list of statements for the body of the
+    Parameters
-         function.  These will be separated by ';' characters.
+    ----------
+    params
+        The parameters to the function as one or more strings.
+    body
+        The [nested] list of statements for the body of the
+        function. These will be separated by ';' characters.
    """
    paramstr = ', '.join(params)
@@ -29,7 +35,10 @@ def nvcc_kernel(name, params, body):
 def code_version(version):
-    """decorator to support version-based cache mechanism"""
+    """
+    Decorator to support version-based cache mechanism.
+    """
    if not isinstance(version, tuple):
        raise TypeError('version must be tuple', version)
@@ -43,22 +52,31 @@ UNVERSIONED = ()
 @code_version((1,))
 def inline_reduce(N, buf, pos, count, manner_fn):
-    """Return C++ code for a function that reduces a contiguous buffer.
+    """
+    Return C++ code for a function that reduces a contiguous buffer.
-    :param N: length of the buffer
-    :param buf: buffer pointer
+    Parameters
-    :param pos: index of executing thread
+    ----------
-    :param count: number of executing threads
+    N
+        Length of the buffer.
-    :param manner_fn: a function that accepts strings of arguments a
+    buf
+        Buffer pointer.
+    pos
+        Index of executing thread.
+    count
+        Number of executing threads.
+    manner_fn
+        A function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).
    :postcondition:
-    This function leaves the answer in position 0 of the buffer.  The
+    This function leaves the answer in position 0 of the buffer. The
    rest of the buffer is trashed by this function.
-    :note: buf should be in gpu shared memory, we access it many times.
+    Notes
+    -----
+    buf should be in gpu shared memory, we access it many times.
    """
    loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
@@ -127,18 +145,26 @@ def inline_reduce_prod(N, buf, pos, count):
 def inline_softmax(N, buf, buf2, threadPos, threadCount):
    """
-    :param N: length of the buffer
+    Parameters
-    :param threadPos: index of executing thread
+    ----------
-    :param threadCount: number of executing threads
+    N
+        Length of the buffer.
+    threadPos
+        Index of executing thread.
+    threadCount
+        Number of executing threads.
    :Precondition: buf and buf2 contain two identical copies of the input
        to softmax
    :Postcondition: buf contains the softmax, buf2 contains un-normalized
        softmax
-    :note: buf and buf2 should be in gpu shared memory, we access it many times
+    Notes
+    -----
+    buf and buf2 should be in gpu shared memory, we access it many times.
+    We use __i as an int variable in a loop.
-    :note2: We use __i as an int variable in a loop
    """
    return [
            # get max of buf (trashing all but buf[0])
@@ -169,26 +195,38 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
 def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                               manner_fn, manner_init,
                               b='', stride_b=''):
-    """Return C++ code for a function that reduces a contiguous buffer.
+    """
+    Return C++ code for a function that reduces a contiguous buffer.
-    :param N: length of the buffer
-    :param buf: buffer pointer of size warpSize * sizeof(float)
+    Parameters
-    :param pos: index of executing thread
+    ----------
-    :param count: number of executing threads
+    N
-    :param b: Optional, pointer to the bias
+        Length of the buffer.
-    :param stride_b: Optional, the stride of b if b is provided
+    buf
+        Buffer pointer of size warpSize * sizeof(float).
-    :param manner_fn: a function that accepts strings of arguments a
+    pos
+        Index of executing thread.
+    count
+        Number of executing threads.
+    b
+        Optional, pointer to the bias.
+    stride_b
+        Optional, the stride of b if b is provided.
+    manner_fn
+        A function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).
-    :param manner_init: a function that accepts strings of arguments a
+    manner_init
-        and return c code for its initialization
+        A function that accepts strings of arguments a
+        and return c code for its initialization.
    :postcondition:
-    This function leaves the answer in position 0 of the buffer.  The
+    This function leaves the answer in position 0 of the buffer. The
    rest of the buffer is trashed by this function.
-    :note: buf should be in gpu shared memory, we access it many times.
+    Notes
+    -----
+    buf should be in gpu shared memory, we access it many times.
    """
    if b:
@@ -263,24 +301,39 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
                                b='', stride_b=''):
    """
-    :param N: length of the buffer, atleast waprSize(32).
+    Parameters
-    :param buf: a shared memory buffer of size warpSize * sizeof(float)
+    ----------
-    :param x: a ptr to the gpu memory where the row is stored
+    N
-    :param stride_x: the stride between each element in x
+        Length of the buffer, at least warpSize (32).
-    :param sm: a ptr to the gpu memory to store the result
+    buf
-    :param sm_stride: the stride between eash sm element
+        A shared memory buffer of size warpSize * sizeof(float).
-    :param threadPos: index of executing thread
+    x
-    :param threadCount: number of executing threads
+        A ptr to the gpu memory where the row is stored.
-    :param b: Optional, pointer to the bias
+    stride_x
-    :param stride_b: Optional, the stride of b if b is provided
+        The stride between each element in x.
+    sm
+        A ptr to the gpu memory to store the result.
+    sm_stride
+        The stride between each sm element.
+    threadPos
+        Index of executing thread.
+    threadCount
+        Number of executing threads.
+    b
+        Optional, pointer to the bias.
+    stride_b
+        Optional, the stride of b if b is provided.
    :Precondition: buf is empty
    :Postcondition: buf[0] contains the softmax,
        buf2 contains un-normalized softmax
-    :note: buf should be in gpu shared memory, we access it many times.
+    Notes
+    -----
+    buf should be in gpu shared memory, we access it many times.
+    We use tx as an int variable in a loop.
-    :note2: We use tx as an int variable in a loop
    """
    ret = [
        # get max of buf (trashing all but buf[0])

--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
@@ -13,7 +13,9 @@ from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
 class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
    """
    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
    """
    nin = 3
    nout = 3
@@ -224,7 +226,9 @@ gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1Ho
 class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
    """
    Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
    """
    nin = 3
    nout = 1
    """Gradient wrt x of the CrossentropySoftmax1Hot Op"""
@@ -393,7 +397,9 @@ gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasD
 class GpuSoftmax(GpuOp):
    """
    Implement Softmax on the gpu.
    """
    def __eq__(self, other):
        return type(self) == type(other)
@@ -555,7 +561,9 @@ gpu_softmax = GpuSoftmax()
 class GpuSoftmaxWithBias(GpuOp):
    """
    Implement SoftmaxWithBias on the gpu.
    """
    nin = 2
    nout = 1

--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -85,7 +85,10 @@ nvcc_version = None
 def is_nvcc_available():
-    """Return True iff the nvcc compiler is found."""
+    """
+    Return True iff the nvcc compiler is found.
+    """
    def set_version():
        p_out = output_subprocess_Popen([nvcc_path, '--version'])
        ver_line = decode(p_out[0]).strip().split('\n')[-1]
@@ -150,6 +153,7 @@ class NVCC_compiler(Compiler):
        """
        This args will be received by compile_str() in the preargs paramter.
        They will also be included in the "hard" part of the key module.
        """
        flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
        if config.nvcc.fastmath:
@@ -209,33 +213,47 @@ class NVCC_compiler(Compiler):
            module_name, src_code,
            location=None, include_dirs=[], lib_dirs=[], libs=[], preargs=[],
            rpaths=rpath_defaults, py_module=True, hide_symbols=True):
-        """:param module_name: string (this has been embedded in the src_code
+        """
-        :param src_code: a complete c or c++ source listing for the module
-        :param location: a pre-existing filesystem directory where the
-                         cpp file and .so will be written
-        :param include_dirs: a list of include directory names
-                             (each gets prefixed with -I)
-        :param lib_dirs: a list of library search path directory names
-                         (each gets prefixed with -L)
-        :param libs: a list of libraries to link with
-                     (each gets prefixed with -l)
-        :param preargs: a list of extra compiler arguments
-        :param rpaths: list of rpaths to use with Xlinker.
-                       Defaults to `rpath_defaults`.
-        :param py_module: if False, compile to a shared library, but
-            do not import as a Python module.
-        :param hide_symbols: if True (the default), hide all symbols
-        from the library symbol table unless explicitely exported.
-        :returns: dynamically-imported python module of the compiled code.
+        Parameters
+        ----------
+        module_name : str
+            This has been embedded in the src_code.
+        src_code
+            A complete c or c++ source listing for the module.
+        location
+            A pre-existing filesystem directory where the
+            cpp file and .so will be written.
+        include_dirs
+            A list of include directory names (each gets prefixed with -I).
+        lib_dirs
+            A list of library search path directory names (each gets 
+            prefixed with -L).
+        libs
+            A list of libraries to link with (each gets prefixed with -l).
+        preargs
+            A list of extra compiler arguments.
+        rpaths
+            List of rpaths to use with Xlinker. Defaults to `rpath_defaults`.
+        py_module
+            If False, compile to a shared library, but
+            do not import as a Python module.
+        hide_symbols
+            If True (the default), hide all symbols from the library symbol
+            table unless explicitly exported.
+        Returns
+        -------
+        module
+            Dynamically-imported python module of the compiled code.
            (unless py_module is False, in that case returns None.)
-        :note 1: On Windows 7 with nvcc 3.1 we need to compile in the
+        Notes
-                 real directory Otherwise nvcc never finish.
+        -----
+        On Windows 7 with nvcc 3.1 we need to compile in the real directory,
+        otherwise nvcc never finishes.
        """
        rpaths = list(rpaths)
        if sys.platform == "win32":

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -141,7 +141,9 @@ class InputToGpuOptimizer(Optimizer):
    Transfer the input of a graph to the gpu if it is necessary.
    It should make this part of the optimizer faster we will will need only 1
    pass on the fgraph.
    """
    def __init__(self):
        Optimizer.__init__(self)
@@ -208,7 +210,10 @@ def dtype_in_elemwise_supported(op):
    Return True of the Elemwise op is supported on the gpu.
    Return False otherwise.
-    :note: We need to check inside the Composite op.
+    Notes
+    -----
+    We need to check inside the Composite op.
    """
    def get_all_basic_scalar(composite_op):
        l = []
@@ -231,8 +236,10 @@ def dtype_in_elemwise_supported(op):
 @register_opt()
 @local_optimizer([tensor.Elemwise])
 def local_gpu_elemwise_0(node):
-    """elemwise(..., host_from_gpu, ...)
+    """
-       -> host_from_gpu(elemwise(gpu_from_host, ..., gpu_from_host)
+    Elemwise(..., host_from_gpu, ...)
+    -> host_from_gpu(elemwise(gpu_from_host, ..., gpu_from_host))
    """
    if (isinstance(node.op, tensor.Elemwise) and
        dtype_in_elemwise_supported(node.op)):
@@ -294,6 +301,7 @@ def local_gpu_elemwise_0(node):
 def local_gpu_elemwise_1(node):
    """
    gpu_from_host(Elemwise)) -> GpuElemwise(gpu_from_host(...))
    """
    if isinstance(node.op, GpuFromHost):
        host_i, = node.inputs
@@ -350,6 +358,7 @@ def local_gpu_dimshuffle_0(node):
    """
    dimshuffle(host_from_gpu()) -> host_from_gpu(gpu_dimshuffle)
    gpu_from_host(dimshuffle) -> gpu_dimshuffle(gpu_from_host)
    """
    if isinstance(node.op, tensor.DimShuffle):
        input, = node.inputs
@@ -375,6 +384,7 @@ def local_gpu_specifyShape_0(node):
    """
    specify_shape(host_from_gpu()) -> host_from_gpu(specify_shape)
    gpu_from_host(specify_shape) -> specify_shape(gpu_from_host)
    """
    if isinstance(node.op, tensor.SpecifyShape):
        input = node.inputs[0]
@@ -403,11 +413,11 @@ def local_gpu_dot_to_dot22(node):
    transforming the vector into a matrix, apply gpudot22 and reshaping
    the output.
-    A more suitable solution would be to use the right cublas call
+    A more suitable solution would be to use the right cublas call.
-    This is needed in fast_compile
+    This is needed in fast_compile.
-    """
+    """
    # In case the got do input upcast, we much check that we can
    # make it run on the gpu.
    if isinstance(node.op, GpuFromHost):
@@ -482,10 +492,11 @@ theano.compile.optdb.register('assert_no_cpu_op', assert_no_cpu_op, 49.2)
 @register_opt()
 @local_optimizer([theano.ifelse.IfElse, gpu_from_host])
 def local_gpu_lazy_ifelse(node):
-    """
+    """
    gpu_from_host(ifelse) -> gpu_ifelse(gpu_from_host)
    ifelse(host_from_gpu) -> host_from_gpu(ifelse)
    """
    if isinstance(node.op, theano.ifelse.IfElse) and not node.op.gpu:
        gpu_ifelse = theano.ifelse.IfElse(node.op.n_outs, gpu=True)
@@ -554,6 +565,7 @@ def local_gpu_dot22(node):
    gpu_from_host(dot22) -> gpudot(gpu_from_host)
    dot(host_from_gpu) -> host_from_gpu(gpudot22)
    """
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
@@ -577,6 +589,7 @@ def local_gpu_dot22scalar(node):
    gpu_from_host(dot22scalar) -> gpudot(gpu_from_host)
    dot(host_from_gpu) -> host_from_gpu(gpudot22scalar)
    """
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
@@ -602,7 +615,9 @@ def local_gpu_dot22scalar(node):
 def local_gpu_solve(node):
    """
    gpu_from_host(CpuSolve) -> GpuSolve(gpu_from_host)
    CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)
    """
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
@@ -627,6 +642,7 @@ def local_gpu_solve(node):
 def local_gpu_gemv(node):
    """
    gpu_from_host(gemv) -> gpu_gemv(gpu_from_host)
    gemv(host_from_gpu) -> host_from_gpu(gpu_gemv)
    """
@@ -665,6 +681,7 @@ def local_gpu_gemv(node):
 def local_gpu_ger(node):
    """
    gpu_from_host(ger) -> gpu_ger(gpu_from_host)
    ger(host_from_gpu) -> host_from_gpu(gpu_ger)
    """
@@ -706,6 +723,7 @@ def local_gpu_gemm(node):
    gpu_from_host(gemm) -> gpu_gemm(gpu_from_host)
    gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
    """
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
@@ -1120,7 +1138,10 @@ def local_gpu_shape(node):
 @register_opt()
 @local_optimizer([tensor.Rebroadcast])
 def local_gpu_rebroadcast(node):
-    '''rebroadcast(host_from_gpu(x)) -> host_from_gpu(rebroadcast(x))'''
+    """
+    rebroadcast(host_from_gpu(x)) -> host_from_gpu(rebroadcast(x))
+    """
    if isinstance(node.op, tensor.Rebroadcast):
        x, = node.inputs
        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
@@ -1342,7 +1363,8 @@ def local_conv_fft_full(node):
 def values_eq_approx_high_tol(a, b):
-    """This fct is needed to don't have DebugMode raise useless
+    """
+    This function is needed so that DebugMode does not raise a useless
    error due to ronding error.
    This happen as We reduce on the two last dimensions, so this
@@ -1364,6 +1386,7 @@ def local_gpu_conv(node):
    gpu_from_host(conv) -> gpu_conv(gpu_from_host)
    conv(host_from_gpu) -> host_from_gpu(gpu_conv)
    """
    def GpuConvOp_from_ConvOp(op):
        logical_img_hw = None
@@ -1534,7 +1557,10 @@ conv_groupopt.register('local_conv_gemm', local_conv_gemm, 30,
 class LocalCudaMetaOptimizer(LocalMetaOptimizer):
-    """Base class for CUDA-based LocalMetaOptimizers"""
+    """
+    Base class for CUDA-based LocalMetaOptimizers.
+    """
    def time_call(self, fn):
        # Override time_call() to do device synchronization
@@ -1827,7 +1853,6 @@ def local_gpu_join(node):
    by other opts, leaving us with
    host_from_gpu(gpu_join)
    For intermediate places in the graph not covered by the first opt, the
    following could be useful:
@@ -1911,8 +1936,12 @@ optdb.register('InplaceGpuBlasOpt',
 def get_device_type_sizes():
    """
-    :return:(gpu ptr size, cpu ptr size, int sizes(gpu and cpu))
-    :return type: tuple
+    Returns
+    -------
+    tuple
+        (gpu ptr size, cpu ptr size, int sizes(gpu and cpu)).
    """
    if hasattr(get_device_type_sizes, 'rval'):
        return get_device_type_sizes.rval
@@ -1941,7 +1970,7 @@ def get_device_type_sizes():
 def max_inputs_to_GpuElemwise(node):
    """
-    return the maximum number of inputs this GpuElemwise Apply node can
+    Return the maximum number of inputs this GpuElemwise Apply node can
    accept.
    This is needed as currently there is a limit of 256 bytes of
@@ -1950,8 +1979,8 @@ def max_inputs_to_GpuElemwise(node):
    2.x (not used).
    This measures the number of parameters we put in our GPU function and
-    computes the maximum number of inputs that respect the 256 byte
+    computes the maximum number of inputs that respect the 256 byte limit.
-    limit.
    """
    type_sizes = get_device_type_sizes()
    int_size = type_sizes['int_size']
@@ -1986,6 +2015,7 @@ def split_huge_add_or_mul(node):
    This should not happen for other GpuElemwise as their is only the fusion
    that can generate op with too much input and it check for that.
    """
    if node.op.scalar_op in (scal.add, scal.mul):
        max_nb_inputs = max_inputs_to_GpuElemwise(node)
@@ -2135,6 +2165,7 @@ def local_gpu_eye(node):
    gpu_from_host(eye) -> gpueye(gpu_from_host)
    eye(host_from_gpu) -> host_from_gpu(gpueye)
    """
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
@@ -2167,10 +2198,11 @@ def safe_to_cpu(x):
 def gpu_safe_new(x, tag=''):
    """
    Internal function that constructs a new variable from x with the same
-    type, but with a different name ( old name + tag). This function is used
+    type, but with a different name (old name + tag). This function is used
    by gradient, or the R-op to construct new variables for the inputs of
    the inner graph such that there is no interference between the original
    graph and the newly constructed graph.
    """
    if hasattr(x, 'name') and x.name is not None:
        nw_name = x.name + tag
@@ -2188,8 +2220,9 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
    """
    Different interface to clone, that allows you to pass inputs.
    Compared to clone, this method always replaces the inputs with
-    new variables of the same type, and returns those ( in the same
+    new variables of the same type, and returns those (in the same
    order as the original inputs).
    """
    if tag is None:
        tag = ''
@@ -2217,7 +2250,9 @@ def tensor_to_cuda(x):
 def local_gpu_extract_diagonal(node):
    """
    extract_diagonal(host_from_gpu()) -> host_from_gpu(extract_diagonal)
    gpu_from_host(extract_diagonal) -> extract_diagonal(gpu_from_host)
    """
    if (isinstance(node.op, nlinalg.ExtractDiag) and
        isinstance(node.inputs[0].type,
@@ -2249,9 +2284,10 @@ def typeConstructor(broadcastable, dtype):
 def gpuScanOptimization(node):
    """
    scan(host_from_gpu) -> host_from_gpu(GPUscan)
    gpu_from_host(scan) -> GPUscan(gpu_from_host)
-    """
+    """
    # gpu_from_host(scan) -> GPUscan(gpu_from_host)
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]

--- a/theano/sandbox/cuda/rng_curand.py
+++ b/theano/sandbox/cuda/rng_curand.py
 """
-Define CURAND_RandomStreams - backed by CURAND
+Define CURAND_RandomStreams - backed by CURAND.
 """
 __authors__ = "James Bergstra"
@@ -20,7 +21,8 @@ config = theano.config
 class CURAND_Base(GpuOp):
-    """ Base class for a random number generator implemented in CURAND.
+    """
+    Base class for a random number generator implemented in CURAND.
    The random number generator itself is an opaque reference managed by
    CURAND.  This Op uses a generic-typed shared variable to point to a CObject
@@ -30,18 +32,23 @@ class CURAND_Base(GpuOp):
    The actual random number generator is allocated from the seed, on the first
    call to allocate random numbers (see c_code).
-    :note:
+    Parameters
-        One caveat is that the random number state is simply not serializable.
+    ----------
-        Consequently, attempts to serialize functions compiled with these
+    output_type
-        random numbers will fail.
+        A theano type (e.g. tensor.fvector).
+    seed : int
+    destructive
+        True or False (on the generator).
+    Notes
+    -----
+    One caveat is that the random number state is simply not serializable.
+    Consequently, attempts to serialize functions compiled with these
+    random numbers will fail.
    """
    def __init__(self, output_type, seed, destructive):
-        """
-        output_type: a theano type (e.g. tensor.fvector)
-        seed: integer
-        destructive: True or False (on the generator)
-        """
        theano.gof.Op.__init__(self)
        self.destructive = destructive
        self.seed = seed
@@ -51,11 +58,17 @@ class CURAND_Base(GpuOp):
        assert output_type.dtype == "float32"
    def as_destructive(self):
-        """Return an destructive version of self"""
+        """
+        Return a destructive version of self.
+        """
        return self.__class__(self.output_type, self.seed, destructive=True)
    def _config(self):
-        """Return a tuple of attributes that define the Op"""
+        """
+        Return a tuple of attributes that define the Op.
+        """
        return (
                self.destructive,
                self.output_type,
@@ -81,7 +94,7 @@ class CURAND_Base(GpuOp):
        """
        Return a symbolic sample from generator.
-        cls dictates the random variable (e.g. uniform, normal)
+        cls dictates the random variable (e.g. uniform, normal).
        """
        v_size = theano.tensor.as_tensor_variable(size)
@@ -237,8 +250,11 @@ class CURAND_Base(GpuOp):
 class CURAND_Normal(CURAND_Base):
-    """Op to draw normal numbers using CURAND
    """
+    Op to draw normal numbers using CURAND.
+    """
    def _curand_call_str(self, **kwargs):
        return """curandGenerateNormal(*gen,
                CudaNdarray_DEV_DATA(%(o_sample)s),
@@ -248,8 +264,11 @@ class CURAND_Normal(CURAND_Base):
 class CURAND_Uniform(CURAND_Base):
-    """Op to draw uniform numbers using CURAND
    """
+    Op to draw uniform numbers using CURAND.
+    """
    def _curand_call_str(self, **kwargs):
        return """ curandGenerateUniform(*gen,
                CudaNdarray_DEV_DATA(%(o_sample)s),
@@ -262,24 +281,31 @@ class CURAND_RandomStreams(object):
    RandomStreams instance that creates CURAND-based random variables.
    One caveat is that generators are not serializable.
+    Parameters
+    ----------
+    seed : int
    """
    def __init__(self, seed):
-        """ seed: int
-        """
        self._start_seed = seed
        self._cur_seed = seed
        self._has_lost_states = False  # True if self.state_updates incomplete
        self.state_updates = []
    def updates(self):
-        """List of all (old, new) generator update pairs created by this
+        """
+        List of all (old, new) generator update pairs created by this
        instance.
        """
        return list(self.state_updates)
    def next_seed(self):
-        """Return a unique seed for initializing a random variable.
+        """
+        Return a unique seed for initializing a random variable.
        """
        self._cur_seed += 1
        return self._cur_seed - 1
@@ -295,6 +321,7 @@ class CURAND_RandomStreams(object):
            dtype=config.floatX):
        """
        Return symbolic tensor of uniform numbers.
        """
        if isinstance(size, tuple):
            msg = "size must be a tuple of int or a Theano variable"
@@ -321,8 +348,12 @@ class CURAND_RandomStreams(object):
        """
        Return symbolic tensor of normally-distributed numbers.
-        :param: size: Can be a list of integer or Theano variable(ex: the shape
+        Parameters
+        ----------
+        size
+            Can be a list of integers or a Theano variable (e.g. the shape
            of other Theano Variable)
        """
        if isinstance(size, tuple):
            msg = "size must be a tuple of int or a Theano variable"

--- a/theano/sandbox/cuda/type.py
+++ b/theano/sandbox/cuda/type.py
-"""Provide CudaNdarrayType
+"""
+Provide CudaNdarrayType.
 """
 from __future__ import print_function
 import os
@@ -31,36 +33,47 @@ class CudaNdarrayType(Type):
    dtype = 'float32'
    Variable = None
-    """ This will be set to the Variable type corresponding to this class.
+    """
+    This will be set to the Variable type corresponding to this class.
    That variable type is `CudaNdarrayVariable` defined in the
    ``var.py`` file beside this one.
-    :note: The var file depends on the file basic_ops.py, which
+    Notes
-    depends on this file.  A cyclic dependency is avoided by not
+    -----
-    hardcoding ``Variable = CudaNdarrayVariable``.
+    The var file depends on the file basic_ops.py, which depends on this file.
+    A cyclic dependency is avoided by not hardcoding
+    ``Variable = CudaNdarrayVariable``.
    """
    Constant = None
-    """ This will be set to `CudaNdarrayConstant` defined in ``var.py``
+    """
+    This will be set to `CudaNdarrayConstant` defined in ``var.py``.
-    :note:
+    Notes
+    -----
    The var file depends on the file basic_ops.py, which depends on this file.
    A cyclic dependency is avoided by not hardcoding this class.
    """
    SharedVariable = None
-    """ This will be set to `CudaNdarraySharedVariable` defined in ``var.py``
+    """
+    This will be set to `CudaNdarraySharedVariable` defined in ``var.py``.
-    :note:
+    Notes
+    -----
    The var file depends on the file basic_ops.py, which depends on this file.
    A cyclic dependency is avoided by not hardcoding this class.
    """
    if cuda is not None:
        value_zeros = staticmethod(cuda.CudaNdarray.zeros)
    """
-    Create an CudaNdarray full of 0 values
+    Create a CudaNdarray full of 0 values.
    """
    def __init__(self, broadcastable, name=None, dtype=None):
@@ -120,11 +133,13 @@ class CudaNdarrayType(Type):
                        data)
    def filter_variable(self, other, allow_convert=True):
-        """Convert a Variable into a CudaNdarrayType, if compatible.
+        """
+        Convert a Variable into a CudaNdarrayType, if compatible.
        This Variable should either already be a CudaNdarrayType, or be
        a TensorType. It has to have the right number of dimensions,
        broadcastable pattern, and dtype.
        """
        if hasattr(other, '_as_CudaNdarrayVariable'):
            other = other._as_CudaNdarrayVariable()
@@ -209,10 +224,12 @@ class CudaNdarrayType(Type):
        )
    def dtype_specs(self):
-        """Return a tuple (python type, c type, numpy typenum) that
+        """
-        corresponds to self.dtype.
+        Return a tuple (python type, c type, numpy typenum) that corresponds
+        to self.dtype.
        This function is used internally as part of C code generation.
        """
        # TODO: add more type correspondances for e.g. int32, int64, float32,
        # complex64, etc.
@@ -236,7 +253,10 @@ class CudaNdarrayType(Type):
                    self.__class__.__name__, self.dtype))
    def __eq__(self, other):
-        """Compare True iff other is the same kind of CudaNdarrayType"""
+        """
+        Compare True iff other is the same kind of CudaNdarrayType.
+        """
        return (type(self) == type(other) and
                other.broadcastable == self.broadcastable)
@@ -248,12 +268,16 @@ class CudaNdarrayType(Type):
            return theano.tensor.patternbroadcast(var, self.broadcastable)
    def __hash__(self):
-        """Hash equal for same kinds of CudaNdarrayType"""
+        """
+        Hash equal for same kinds of CudaNdarrayType.
+        """
        return hash(type(self)) ^ hash(self.broadcastable)
    ndim = property(lambda self: len(self.broadcastable),
                    doc="number of dimensions")
-    """Number of dimensions
+    """
+    Number of dimensions.
    This read-only property is the preferred way to get the number of
    dimensions of a `CudaNdarrayType`.
@@ -261,12 +285,14 @@ class CudaNdarrayType(Type):
    """
    def make_variable(self, name=None):
-        """Return a `TensorVariable` of this type
+        """
+        Return a `CudaNdarrayVariable` of this type.
-        :Parameters:
+        Parameters
-         - `name`: str
+        ----------
-           A pretty name to identify this `Variable` when printing and
+        name : str
-           debugging
+            A pretty name to identify this `Variable` when printing and
+            debugging.
        """
        return self.Variable(self, name=name)
@@ -381,7 +407,9 @@ class CudaNdarrayType(Type):
        return sio.getvalue()
    def c_extract_out(self, name, sub, check_input=True, check_broadcast=True):
-        """ To allow the hack to skip check_broadcast.
+        """
+        To allow the hack to skip check_broadcast.
        """
        return """
        if (py_%(name)s == Py_None)
@@ -411,7 +439,10 @@ class CudaNdarrayType(Type):
        """ % locals()
    def c_sync(self, name, sub):
-        """Override `CLinkerOp.c_sync` """
+        """
+        Override `CLinkerOp.c_sync`.
+        """
        return """
        //std::cerr << "sync\\n";
        if (NULL == %(name)s) {
@@ -433,11 +464,17 @@ class CudaNdarrayType(Type):
        """ % locals()
    def c_headers(self):
-        """Override `CLinkerOp.c_headers` """
+        """
+        Override `CLinkerOp.c_headers`.
+        """
        return ['cuda_ndarray.cuh']
    def c_header_dirs(self):
-        """Override `CLinkerOp.c_headers` """
+        """
+        Override `CLinkerOp.c_header_dirs`.
+        """
        ret = [os.path.dirname(cuda_ndarray.__file__)]
        cuda_root = config.cuda.root
        if cuda_root:

--- a/theano/sandbox/cuda/var.py
+++ b/theano/sandbox/cuda/var.py
@@ -19,15 +19,18 @@ except ImportError:
 class _operators(tensor.basic._tensor_py_operators):
-    """Define a few properties and conversion methods for CudaNdarray Variables.
+    """
+    Define a few properties and conversion methods for CudaNdarray Variables.
-    The default implementation of arithemetic operators is to build graphs of TensorType
+    The default implementation of arithmetic operators is to build graphs of
-    variables.
+    TensorType variables.
    The optimization pass (specialization) will insert pure GPU implementations.
-    This approach relieves the Cuda-Ops of having to deal with input argument checking and
+    This approach relieves the Cuda-Ops of having to deal with input argument
-    gradients.
+    checking and gradients.
    """
    def _as_TensorVariable(self):
        return HostFromGpu()(self)
    def _as_CudaNdarrayVariable(self):
@@ -63,7 +66,8 @@ CudaNdarrayType.Constant = CudaNdarrayConstant
 class CudaNdarraySharedVariable(_operators, SharedVariable):
    """
-    Shared Variable interface to CUDA-allocated arrays
+    Shared Variable interface to CUDA-allocated arrays.
    """
    get_value_return_ndarray = True
@@ -72,20 +76,23 @@ class CudaNdarraySharedVariable(_operators, SharedVariable):
        """
        Return the value of this SharedVariable's internal array.
-        :param borrow:
+        Parameters
-                permit the return of internal storage, when used in conjunction with
+        ----------
-                ``return_internal_type=True``
+        borrow
-        :param return_internal_type:
+            Permit the return of internal storage, when used in conjunction with
-                True to return the internal ``cuda_ndarray`` instance rather than a ``numpy.ndarray``
+            ``return_internal_type=True``.
-                (Default False)
+        return_internal_type
+            True to return the internal ``cuda_ndarray`` instance rather than a
+            ``numpy.ndarray`` (Default False).
-        By default ``get_value()`` copies from the GPU to a ``numpy.ndarray`` and returns that
+        By default ``get_value()`` copies from the GPU to a ``numpy.ndarray``
-        host-allocated array.
+        and returns that host-allocated array.
-        ``get_value(False,True)`` will return a GPU-allocated copy of the original GPU array.
+        ``get_value(False,True)`` will return a GPU-allocated copy of the
+        original GPU array.
-        ``get_value(True,True)`` will return the original GPU-allocated array without any
+        ``get_value(True,True)`` will return the original GPU-allocated array
-        copying.
+        without any copying.
        """
        if return_internal_type or not self.get_value_return_ndarray:
@@ -101,33 +108,39 @@ class CudaNdarraySharedVariable(_operators, SharedVariable):
        """
        Assign `value` to the GPU-allocated array.
-        :param borrow: ``True`` permits reusing `value` itself, ``False`` requires that this function
+        Parameters
-                       copies `value` into internal storage.
+        ----------
+        borrow : bool
-        :note:
+            ``True`` permits reusing `value` itself, ``False`` requires that
+            this function copies `value` into internal storage.
-            Prior to Theano 0.3.1, set_value did not work in-place on the GPU. This meant that sometimes,
+        Notes
-            GPU memory for the new value would be allocated before the old memory was released. If you're
+        -----
-            running near the limits of GPU memory, this could cause you to run out of GPU memory.
+        Prior to Theano 0.3.1, set_value did not work in-place on the GPU. This
+        meant that sometimes, GPU memory for the new value would be allocated
+        before the old memory was released. If you're running near the limits of
+        GPU memory, this could cause you to run out of GPU memory.
-            Beginning with Theano 0.3.1, set_value will work in-place on the GPU, if the following conditions
+        Beginning with Theano 0.3.1, set_value will work in-place on the GPU, if
-            are met:
+        the following conditions are met:
            * The destination on the GPU must be c_contiguous.
            * The source is on the CPU.
-            * The old value must have the same dtype as the new value (which is a given for now,
+            * The old value must have the same dtype as the new value (which is
-              since only float32 is supported).
+              a given for now, since only float32 is supported).
            * The old and new value must have the same shape.
-            * The old value is being completely replaced by the new value (not partially modified,
+            * The old value is being completely replaced by the new value (not
-              e.g. by replacing some subtensor of it).
+              partially modified, e.g. by replacing some subtensor of it).
-            * You change the value of the shared variable via set_value, not via the .value
+            * You change the value of the shared variable via set_value, not via
-              accessors. You should not use the .value accessors anyway, since they will soon be
+              the .value accessors. You should not use the .value accessors
-              deprecated and removed.
+              anyway, since they will soon be deprecated and removed.
+        It is also worth mentioning that, for efficient transfer to the GPU,
+        Theano will make the new data ``c_contiguous``. This can require an
+        extra copy of the data on the host.
-            It is also worth mentioning that, for efficient transfer to the GPU, Theano will make the new data
+        The inplace on gpu memory work when borrow is either True or False.
-            ``c_contiguous``. This can require an extra copy of the data on the host.
-            The inplace on gpu memory work when borrow is either True or False.
        """
        if not borrow:
            # TODO: check for cuda_ndarray type
@@ -147,8 +160,10 @@ CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable
 def cuda_shared_constructor(value, name=None, strict=False,
        allow_downcast=None, borrow=False, broadcastable=None):
-    """SharedVariable Constructor for CudaNdarrayType"""
+    """
+    SharedVariable Constructor for CudaNdarrayType.
+    """
    # THIS CONSTRUCTOR TRIES TO CAST VALUE TO A FLOAT32, WHICH THEN GOES ONTO THE CARD
    # SO INT shared vars, float64 shared vars, etc. all end up on the card.
    # THIS IS NOT THE DEFAULT BEHAVIOUR THAT WE WANT.
@@ -179,7 +194,11 @@ def cuda_shared_constructor(value, name=None, strict=False,
 def float32_shared_constructor(value, name=None, strict=False,
        allow_downcast=None, borrow=False, broadcastable=None):
-    """SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or CudaNdarray"""
+    """
+    SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or
+    CudaNdarray.
+    """
    if theano.sandbox.cuda.use.device_number is None:
        theano.sandbox.cuda.use("gpu",
                                force=True,

--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -47,7 +47,9 @@ def as_gpuarray(x):
 class Kernel(object):
    """
    This class groups together all the attributes of a gpu kernel.
    """
    def __init__(self, code, params, name, flags,
                 codevar=None, binvar=None, objvar=None):
        self.code = code
@@ -113,9 +115,9 @@ class Kernel(object):
 class GpuKernelBase(object):
    def gpu_kernels(self, node, name):
        """
-        This is the method to override.  This should return an
+        This is the method to override. This should return an iterable of Kernel
-        iterable of Kernel objects that describe the kernels this op
+        objects that describe the kernels this op will need.
-        will need.
        """
        raise MethodNotDefined('gpu_kernels')
@@ -552,13 +554,20 @@ cuda_from_gpu = CudaFromGpu()
 class GpuAlloc(HideC, Alloc):
+    """
+    Parameters
+    ----------
+    memset_0
+        Only an optimization. If True, the value is always 0, so the C code
+        calls memset, as it is faster.
+    """
    __props__ = ('memset_0',)
    _f16_ok = True
    def __init__(self, memset_0=False):
-        """memset_0 is only an optimized version. True, it mean the
-        value is always 0, so the c code call memset as it is faster.
-        """
        self.memset_0 = memset_0
    def __str__(self):
@@ -777,7 +786,9 @@ class GpuContiguous(Op):
    """
    Always return a c contiguous output. Copy the input only if it is
    not already c contiguous.
    """
    __props__ = ()
    view_map = {0: [0]}
    _f16_ok = True
@@ -835,7 +846,9 @@ gpu_contiguous = GpuContiguous()
 class GpuReshape(HideC, tensor.Reshape):
    """
    Implement Reshape on the gpu.
    """
    _f16_ok = True
    # __hash__, __eq__, __str__ come from tensor.Reshape
@@ -951,6 +964,7 @@ class GpuReshape(HideC, tensor.Reshape):
 class GpuJoin(HideC, Join):
    _f16_ok = True
    def make_node(self, axis, *tensors):

--- a/theano/sandbox/gpuarray/comp.py
+++ b/theano/sandbox/gpuarray/comp.py
@@ -16,6 +16,7 @@ class NVCC_compiler(NVCC_base):
        """
        Re-implementation of compile_args that does not create an
        additionnal context on the GPU.
        """
        flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
        if config.nvcc.fastmath:

--- a/theano/sandbox/gpuarray/conv.py
+++ b/theano/sandbox/gpuarray/conv.py
@@ -12,7 +12,48 @@ from .basic_ops import as_gpuarray_variable
 class GpuConv(gof.Op):
    """
    Implement the batched and stacked 2d convolution on the gpu.
+    Parameters
+    ----------
+    version
+        Each version of c_code implements many kernels for the convolution.
+        By default we try to guess the best one. You can force one version with
+        this parameter. This parameter is used by the tests.
+    direction_hint
+        'forward', 'bprop weights' or 'bprop inputs'. Serves as a hint for graph
+        optimizers replacing GpuConv by other implementations. If the GpuConv is
+        inserted automatically, we take its value from ConvOp.
+    verbose
+        For values of 1, 2 and 3, print more information during execution of
+        the convolution. Mostly used for optimization or debugging.
+    kshp
+        The size of the kernel. If provided, can generate faster code. If the
+        GpuConv op is automatically inserted, we take its value automatically
+        from the Conv op.
+    imshp
+        The size of the image. Not used for code generation but allows to select
+        an experimental new version in another repo.
+    max_threads_dim0
+        The maximum number of threads for the block size dimensions 0
+        (blockDim.x) used by the GPU function.
+    nkern
+        The number of kernels. Not used for this op, but can be used by graph
+        optimizers to select a more optimal convolution implementation. If the
+        GpuConv op is inserted automatically, we take its value from the Conv
+        op.
+    bsize
+        The batch size. Not used for this op, but can be used by graph
+        optimizers to select a more optimal convolution implementation. If the
+        GpuConv op is inserted automatically, we take its value from the Conv
+        op.
+    fft_opt
+        Deactivate fft_opt optimization at the op level when set to False. Note
+        that by default fft optimizations aren't enabled.
+        See :ref:`convolution documentation <libdoc_tensor_nnet_conv>` to enable
+        them.
    """
    @staticmethod
    def logical_output_shape_2d(imshp, kshp, mode):
        if mode == 'valid':
@@ -35,43 +76,6 @@ class GpuConv(gof.Op):
            nkern=None,
            bsize=None,
            fft_opt=True):
-        """
-        :param version: each version of c_code implements many kernels for the
-                        convolution. By default we try to guess the best one.
-                        You can force one version with this parameter. This
-                        parameter is used by the tests.
-        :param direction_hint: 'forward', 'bprop weights' or 'bprop inputs'.
-                        Serves as a hint for graph optimizers replacing
-                        GpuConv by other implementations. If the GpuConv is
-                        inserted automatically, we take its value from ConvOp.
-        :param verbose: for value of 1,2 and 3. Print more information during
-                        the execution of the convolution. Mostly used for
-                        optimization or debugging.
-        :param kshp:    The size of the kernel. If provided, can generate
-                        faster code. If the GpuConv op is automatically
-                        inserted,
-                        we take its value automatically from the Conv op.
-        :param imshp:   The size of the image. Not used for code generation but
-                        allows to select an experimental new version in another
-                        repo.
-        :param max_threads_dim0: The maximum number of threads for the
-                        block size dimensions 0 (blockDim.x) used by the
-                        GPU function.
-        :param nkern:   The number of kernels. Not used for this op, but can be
-                        used by graph optimizers to select a more optimal
-                        convolution implementation. If the GpuConv op is inserted
-                        automatically, we take its value from the Conv op.
-        :param bsize:   The batch size. Not used for this op, but can be
-                        used by graph optimizers to select a more optimal
-                        convolution implementation. If the GpuConv op is inserted
-                        automatically, we take its value from the Conv op.
-        :param fft_opt: deactivate fft_opt optimization at the op level when
-                        set to False. Note that by default fft optimization
-                        aren't enabled. See
-                        :ref:`convolution documentation <libdoc_tensor_nnet_conv>`
-                        to enable them.
-        """
        self.border_mode = border_mode
        self.subsample = subsample
        if logical_img_hw is not None:
@@ -169,7 +173,10 @@ class GpuConv(gof.Op):
        return gof.Apply(self, [img, kern], [out])
    def flops(self, inputs, outputs):
-        """ Useful with the hack in profilemode to print the MFlops"""
+        """
+        Useful with the hack in profilemode to print the MFlops.
+        """
        images, kerns = inputs
        out, = outputs
        assert images[1] == kerns[1]

--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
@@ -470,7 +470,8 @@ class GpuElemwise(HideC, Elemwise):
 class SupportCodeError(Exception):
    """
-    We do not support certain things (such as the C++ complex struct)
+    We do not support certain things (such as the C++ complex struct).
    """
@@ -571,14 +572,22 @@ class GpuDimShuffle(HideC, DimShuffle):
 class GpuCAReduceCuda(HideC, CAReduceDtype):
-    """GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
+    """
+    GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
-    The dimensions along which to reduce is specified by the
-    `reduce_mask` that you pass to the constructor.  The `reduce_mask`
+    Parameters
-    is a tuple of booleans (actually integers 0 or 1) that specify for
+    ----------
-    each input dimension, whether to reduce it (1) or not (0).
+    reduce_mask
+        The dimensions along which to reduce. The `reduce_mask` is a tuple of
-    For example, when scalar_op is a theano.scalar.basic.Add instance:
+        booleans (actually integers 0 or 1) that specify for each input
+        dimension, whether to reduce it (1) or not (0).
+    pre_scalar_op
+        If present, must be a scalar op with only 1 input. We will execute it
+        on the input value before reduction.
+    Examples
+    --------
+    When scalar_op is a theano.scalar.basic.Add instance:
      - reduce_mask == (1,) sums a vector to a scalar
@@ -588,8 +597,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
      - reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor.
-    :note: any reduce_mask of all zeros is a sort of 'copy', and may
+    Notes
-           be removed during graph optimization
+    -----
+    Any reduce_mask of all zeros is a sort of 'copy', and may be removed during
+    graph optimization.
    This Op is a work in progress.
@@ -602,9 +613,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
    GPUs are not especially well-suited to reduction operations so it is
    quite possible that the GPU might be slower for some cases.
-    pre_scalar_op: if present, must be a scalar op with only 1
-    input. We will execute it on the input value before reduction.
    """
    _f16_ok = True
    def __init__(self, scalar_op, axis=None,
@@ -690,9 +700,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
        raise MethodNotDefined("")
    def supports_c_code(self, inputs):
-        """ Returns True if the current op and reduce pattern
+        """
-            has functioning C code """
+        Returns True if the current op and reduce pattern has functioning C code.
+        """
        # If we don't even have the right method, we certainly
        # don't support the C code
        # (This is the test that used to be implemented by
@@ -871,9 +882,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
        return sio.getvalue()
    def _makecall(self, node, name, x, z, fail, pattern=None):
-        """Return a string for making a kernel call.
+        """
+        Return a string for making a kernel call.
-            The return value looks something like:
+        The return value looks something like:
            .. code-block:: c
@@ -972,7 +984,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
    def _k_decl(self, node, nodename, pattern=None,
                ndim=None, reduce_mask=None):
-        """Return a string to declare a kernel function
+        """
+        Return a string to declare a kernel function.
        The result will look something like this:
@@ -989,8 +1002,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                    %(out_dtype)s * Z,
                    const int sZ0)
-            Since the nodename is unique, we don't need to put the name
+        Since the nodename is unique, we don't need to put the name
-            of the scalar_op in here.
+        of the scalar_op in here.
        """
        in_dtype = "npy_" + node.inputs[0].dtype
@@ -1057,6 +1070,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
        Otherwise, check that the scalar op is maximum or minimum
        and return first_item. It should be the first element of the reduction.
        As the maximum and minimum of the same value don't change, this work.
        """
        if hasattr(self.scalar_op, 'identity'):
            return str(self.scalar_op.identity)
@@ -1084,15 +1098,28 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
    def _assign_reduce(self, node, name, left, right, sub, pre):
        """
-            node: the node argument to this op's c_code
-            name: the name argument to this op's c_code
-            left: a C code string identifying an lvalue
-            right: a C code string identifying an expression
-            sub: the sub argument to this op's c_code
-            pre: If True, we will add the pre_scalar_op.c_code
-            returns C code to reduce left and right, assigning the
+        Parameters
-            result to left."""
+        ----------
+        node
+            The node argument to this op's c_code.
+        name
+            The name argument to this op's c_code.
+        left
+            A C code string identifying an lvalue.
+        right
+            A C code string identifying an expression.
+        sub
+            The sub argument to this op's c_code.
+        pre
+            If True, we will add the pre_scalar_op.c_code.
+        Returns
+        -------
+        str
+            C code to reduce left and right, assigning the result to left.
+        """
        x, = node.inputs
        in_dtype = x.dtype
@@ -1125,8 +1152,11 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
        """
        WRITEME
-        node, name, sub: these should be passed through from the original
+        Parameters
-        call to c_code
+        ----------
+        node, name, sub
+            These should be passed through from the original call to c_code.
        """
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
@@ -1274,9 +1304,11 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
    def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
        """
        WRITEME
        IG: I believe, based on how this is called in c_code, that it
        is for the case where we are reducing on all axes and x is
        C contiguous.
        """
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
@@ -1366,8 +1398,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
    def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
        """
-        :param N: the number of 1 in the pattern N=1 -> 01, N=2 -> 011 N=3 ->0111
-                  Work for N=1,2,3
+        Parameters
+        ----------
+        N
+            The number of 1s in the pattern: N=1 -> 01, N=2 -> 011,
+            N=3 -> 0111. Works for N=1, 2, 3.
        """
        assert N in [1, 2, 3]
@@ -2552,11 +2589,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
 class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
-    """CAReduce that reuse the python code from gpuarray.
+    """
+    CAReduce that reuses the python code from gpuarray.
    Too slow for now as it only have a python interface.
    """
    def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
        if not hasattr(scalar_op, 'identity'):
            raise ValueError("No identity on scalar op")

--- a/theano/sandbox/gpuarray/kernel_codegen.py
+++ b/theano/sandbox/gpuarray/kernel_codegen.py
-""" Helper routines for generating gpu kernels for nvcc.
 """
+Helper routines for generating gpu kernels for nvcc.
-def nvcc_kernel(name, params, body):
+"""
-    """Return the c code of a kernel function.
-    :param params: the parameters to the function as one or more strings
+def nvcc_kernel(name, params, body):
+    """
+    Return the c code of a kernel function.
-    :param body: the [nested] list of statements for the body of the
+    Parameters
-         function.  These will be separated by ';' characters.
+    ----------
+    params
+        The parameters to the function as one or more strings.
+    body
+        The [nested] list of statements for the body of the function.
+        These will be separated by ';' characters.
    """
    paramstr = ', '.join(params)
@@ -28,7 +34,10 @@ def nvcc_kernel(name, params, body):
 def code_version(version):
-    """decorator to support version-based cache mechanism"""
+    """
+    Decorator to support version-based cache mechanism.
+    """
    if not isinstance(version, tuple):
        raise TypeError('version must be tuple', version)
@@ -42,22 +51,31 @@ UNVERSIONED = ()
 @code_version((1,))
 def inline_reduce(N, buf, pos, count, manner_fn):
-    """Return C++ code for a function that reduces a contiguous buffer.
+    """
+    Return C++ code for a function that reduces a contiguous buffer.
-    :param N: length of the buffer
-    :param buf: buffer pointer
+    Parameters
-    :param pos: index of executing thread
+    ----------
-    :param count: number of executing threads
+    N
+        Length of the buffer.
-    :param manner_fn: a function that accepts strings of arguments a
+    buf
-        and b, and returns c code for their reduction. (Example:
+        Buffer pointer.
-        return "%(a)s + %(b)s" for a sum reduction).
+    pos
+        Index of executing thread.
+    count
+        Number of executing threads.
+    manner_fn
+        A function that accepts strings of arguments a and b, and returns c code
+        for their reduction.
+        Example: return "%(a)s + %(b)s" for a sum reduction.
    :postcondition:
-    This function leaves the answer in position 0 of the buffer.  The
+    This function leaves the answer in position 0 of the buffer. The
    rest of the buffer is trashed by this function.
-    :note: buf should be in gpu shared memory, we access it many times.
+    Notes
+    -----
+    buf should be in gpu shared memory, we access it many times.
    """
    loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
@@ -126,19 +144,28 @@ def inline_reduce_prod(N, buf, pos, count):
 def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
    """
-    :param N: length of the buffer
+    Parameters
-    :param threadPos: index of executing thread
+    ----------
-    :param threadCount: number of executing threads
+    N
-    :param dtype: dtype of the softmax's output
+        Length of the buffer.
+    threadPos
+        Index of executing thread.
+    threadCount
+        Number of executing threads.
+    dtype
+        Dtype of the softmax's output.
    :Precondition: buf and buf2 contain two identical copies of the input
        to softmax
    :Postcondition: buf contains the softmax, buf2 contains un-normalized
        softmax
-    :note: buf and buf2 should be in gpu shared memory, we access it many times
+    Notes
+    -----
+    buf and buf2 should be in gpu shared memory, we access it many times.
+    We use __i as an int variable in a loop.
-    :note2: We use __i as an int variable in a loop
    """
    return [
            # get max of buf (trashing all but buf[0])
@@ -169,31 +196,48 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
 def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
                               manner_fn, manner_init,
                               b='', stride_b='', load_b='', dtype='float32'):
-    """Return C++ code for a function that reduces a contiguous buffer.
+    """
+    Return C++ code for a function that reduces a contiguous buffer.
-    :param N: length of the buffer
-    :param buf: buffer pointer of size warpSize * sizeof(dtype)
+    Parameters
-    :param x: input data
+    ----------
-    :param stride_x: input data stride
+    N
-    :param load_x: wrapper to read from x
+        Length of the buffer.
-    :param pos: index of executing thread
+    buf
-    :param count: number of executing threads
+        Buffer pointer of size warpSize * sizeof(dtype).
-    :param b: Optional, pointer to the bias
+    x
-    :param stride_b: Optional, the stride of b if b is provided
+        Input data.
-    :param load_b: Optional, wrapper to read from b if b is provided
+    stride_x
-    :param dtype: Optional, the dtype of the output
+        Input data stride.
+    load_x
-    :param manner_fn: a function that accepts strings of arguments a
+        Wrapper to read from x.
-        and b, and returns c code for their reduction. (Example:
+    pos
-        return "%(a)s + %(b)s" for a sum reduction).
+        Index of executing thread.
-    :param manner_init: a function that accepts strings of arguments a
+    count
-        and return c code for its initialization
+        Number of executing threads.
+    b
+        Optional, pointer to the bias.
+    stride_b
+        Optional, the stride of b if b is provided.
+    load_b
+        Optional, wrapper to read from b if b is provided.
+    dtype
+        Optional, the dtype of the output.
+    manner_fn
+        A function that accepts strings of arguments a and b, and returns c code
+        for their reduction.
+        Example: return "%(a)s + %(b)s" for a sum reduction.
+    manner_init
+        A function that accepts strings of arguments a and returns c code for
+        its initialization.
    :postcondition:
-    This function leaves the answer in position 0 of the buffer.  The
+    This function leaves the answer in position 0 of the buffer. The rest of the
-    rest of the buffer is trashed by this function.
+    buffer is trashed by this function.
-    :note: buf should be in gpu shared memory, we access it many times.
+    Notes
+    -----
+    buf should be in gpu shared memory, we access it many times.
    """
    if b:
@@ -270,28 +314,47 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
                                dtype="float32"):
    """
-    :param N: length of the buffer, atleast waprSize(32).
+    Parameters
-    :param buf: a shared memory buffer of size warpSize * sizeof(dtype)
+    ----------
-    :param x: a ptr to the gpu memory where the row is stored
+    N
-    :param stride_x: the stride between each element in x
+        Length of the buffer, at least warpSize (32).
-    :param load_x: wrapper to read from x
+    buf
-    :param sm: a ptr to the gpu memory to store the result
+        A shared memory buffer of size warpSize * sizeof(dtype).
-    :param sm_stride: the stride between eash sm element
+    x
-    :param write_sm: wrapper before writing to sm
+        A ptr to the gpu memory where the row is stored.
-    :param threadPos: index of executing thread
+    stride_x
-    :param threadCount: number of executing threads
+        The stride between each element in x.
-    :param b: Optional, pointer to the bias
+    load_x
-    :param stride_b: Optional, the stride of b if b is provided
+        Wrapper to read from x.
-    :param load_b: Optional, wrapper to read from b if b is provided
+    sm
-    :param dtype: Optional, the dtype of the softmax's output if not float32
+        A ptr to the gpu memory to store the result.
+    sm_stride
+        The stride between each sm element.
+    write_sm
+        Wrapper before writing to sm.
+    threadPos
+        Index of executing thread.
+    threadCount
+        Number of executing threads.
+    b
+        Optional, pointer to the bias.
+    stride_b
+        Optional, the stride of b if b is provided.
+    load_b
+        Optional, wrapper to read from b if b is provided.
+    dtype
+        Optional, the dtype of the softmax's output if not float32.
    :Precondition: buf is empty
-    :Postcondition: buf[0] contains the softmax,
+    :Postcondition: buf[0] contains the softmax, buf2 contains un-normalized
-        buf2 contains un-normalized softmax
+        softmax
+    Notes
+    -----
+    buf should be in gpu shared memory, we access it many times.
-    :note: buf should be in gpu shared memory, we access it many times.
+    We use tx as an int variable in a loop.
-    :note2: We use tx as an int variable in a loop
    """
    ret = [
        # get max of buf (trashing all but buf[0])

--- a/theano/sandbox/gpuarray/nnet.py
+++ b/theano/sandbox/gpuarray/nnet.py
@@ -22,7 +22,9 @@ from .fp16_help import work_dtype, load_w, write_w
 class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
    """
    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
    """
    nin = 3
    nout = 3
    __props__ = ()
@@ -276,8 +278,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
    """
    Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
-    Gradient wrt x of the CrossentropySoftmax1Hot Op
+    Gradient wrt x of the CrossentropySoftmax1Hot Op.
    """
    nin = 3
    nout = 1
    __props__ = ()
@@ -462,7 +466,9 @@ gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasD
 class GpuSoftmax (Op):
    """
    Implement Softmax on the gpu.
    """
    __props__ = ()
    _f16_ok = True
@@ -651,7 +657,9 @@ gpu_softmax = GpuSoftmax()
 class GpuSoftmaxWithBias (Op):
    """
    Implement SoftmaxWithBias on the gpu.
    """
    nin = 2
    nout = 1
    __props__ = ()

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -89,7 +89,9 @@ def safe_to_cpu(x):
 def op_lifter(OP, cuda_only=False):
    """
    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)
    """
    def f(maker):
        def local_opt(node):
@@ -122,7 +124,10 @@ def op_lifter(OP, cuda_only=False):
 class InputToGpuOptimizer(Optimizer):
-    "Transfer the input to the gpu to start the rolling wave."
+    """
+    Transfer the input to the gpu to start the rolling wave.
+    """
    def add_requirements(self, fgraph):
        fgraph.attach_feature(toolbox.ReplaceValidate())
@@ -173,6 +178,7 @@ def local_gpuaalloc2(node):
    Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...)
    Moves an alloc that is an input to join to the gpu.
    """
    if (isinstance(node.op, tensor.Alloc) and
        all(c != 'output' and
@@ -654,6 +660,7 @@ def local_gpu_conv(node):
    gpu_from_host(conv) -> gpu_conv(gpu_from_host)
    conv(host_from_gpu) -> host_from_gpu(gpu_conv)
    """
    def GpuConvOp_from_ConvOp(op):
        logical_img_hw = None
@@ -698,7 +705,8 @@ def local_gpu_conv(node):
        return ret
    def values_eq_approx(a, b):
-        """This fct is needed to don't have DebugMode raise useless
+        """
+        This function is needed so that DebugMode does not raise useless
        error due to ronding error.
        This happen as We reduce on the two last dimensions, so this
@@ -736,7 +744,10 @@ register_opt()(conv_groupopt)
 @register_opt("low_memory")
 @local_optimizer([GpuCAReduceCuda])
 def local_gpu_elemwise_careduce(node):
-    """ Merge some GpuCAReduceCuda and GPUElemwise"""
+    """
+    Merge some GpuCAReduceCuda and GPUElemwise.
+    """
    if (isinstance(node.op, GpuCAReduceCuda) and
            node.op.pre_scalar_op is None and
            node.inputs[0].owner and
@@ -767,10 +778,11 @@ def tensor_to_gpu(x):
 def gpu_safe_new(x, tag=''):
    """
    Internal function that constructs a new variable from x with the same
-    type, but with a different name ( old name + tag). This function is used
+    type, but with a different name (old name + tag). This function is used
    by gradient, or the R-op to construct new variables for the inputs of
    the inner graph such that there is no interference between the original
    graph and the newly constructed graph.
    """
    if hasattr(x, 'name') and x.name is not None:
        nw_name = x.name + tag
@@ -788,8 +800,9 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
    """
    Different interface to clone, that allows you to pass inputs.
    Compared to clone, this method always replaces the inputs with
-    new variables of the same type, and returns those ( in the same
+    new variables of the same type, and returns those (in the same
    order as the original inputs).
    """
    if tag is None:
        tag = ''

--- a/theano/sandbox/gpuarray/subtensor.py
+++ b/theano/sandbox/gpuarray/subtensor.py
@@ -163,12 +163,16 @@ class GpuIncSubtensor(IncSubtensor):
    """
    Implement IncSubtensor on the gpu.
-    Note: The optimization to make this inplace is in tensor/opt.
+    Notes
-          The same optimization handles IncSubtensor and GpuIncSubtensor.
+    -----
-          This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
+    The optimization to make this inplace is in tensor/opt.
-          The helper methods like do_type_checking, copy_of_x, etc. specialize
+    The same optimization handles IncSubtensor and GpuIncSubtensor.
-          the c_code for this Op.
+    This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
+    The helper methods like do_type_checking, copy_of_x, etc. specialize
+    the c_code for this Op.
    """
    @property
    def _f16_ok(self):
        return self.iadd_node.op._f16_ok
@@ -256,8 +260,10 @@ class GpuIncSubtensor(IncSubtensor):
        return d
    def do_type_checking(self, node):
-        """ Should raise NotImplementedError if c_code does not support
+        """
+        Should raise NotImplementedError if c_code does not support
        the types involved in this node.
        """
        if not isinstance(node.inputs[0].type, GpuArrayType):
@@ -265,13 +271,22 @@ class GpuIncSubtensor(IncSubtensor):
    def copy_of_x(self, x):
        """
-            :param x: a string giving the name of a C variable
-                pointing to an array
-            :return: C code expression to make a copy of x
+        Parameters
+        ----------
+        x
+            A string giving the name of a C variable pointing to an array.
+        Returns
+        -------
+        str
+            C code expression to make a copy of x.
+        Notes
+        -----
+        Base class uses `PyArrayObject *`, subclasses may override for
+        different types of arrays.
-            Base class uses `PyArrayObject *`, subclasses may override for
-            different types of arrays.
        """
        return """pygpu_copy(%(x)s, GA_ANY_ORDER)""" % locals()
@@ -279,13 +294,18 @@ class GpuIncSubtensor(IncSubtensor):
        return "PyGpuArrayObject* zview = NULL;"
    def make_view_array(self, x, view_ndim):
-        """//TODO
+        """
-            :param x: a string identifying an array to be viewed
+        //TODO
-            :param view_ndim: a string specifying the number of dimensions
-                to have in the view
+        Parameters
+        ----------
+        x
+            A string identifying an array to be viewed.
+        view_ndim
+            A string specifying the number of dimensions to have in the view.
            This doesn't need to actually set up the view with the
            right indexing; we'll do that manually later.
        """
        ret = """
        size_t dims[%(view_ndim)s];
@@ -305,18 +325,29 @@ class GpuIncSubtensor(IncSubtensor):
        return ret
    def get_helper_c_code_args(self):
-        """ Return a dictionary of arguments to use with helper_c_code"""
+        """
+        Return a dictionary of arguments to use with helper_c_code.
+        """
        return {'c_prefix': 'PyGpuArray',
                'strides_mul': 1
                }
    def copy_into(self, view, source):
        """
-            view: string, C code expression for an array
-            source: string, C code expression for an array
-            returns a C code expression to copy source into view, and
+        Parameters
-            return 0 on success
+        ----------
+        view : string
+            C code expression for an array.
+        source : string
+            C code expression for an array.
+        Returns
+        -------
+        str
+            C code expression copying source into view; returns 0 on success.
        """
        return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()
@@ -365,7 +396,9 @@ class GpuIncSubtensor(IncSubtensor):
 class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
    """
    Implement AdvancedIncSubtensor1 on the gpu.
    """
    def make_node(self, x, y, ilist):
        x_ = as_gpuarray_variable(x)
        y_ = as_gpuarray_variable(y)
@@ -454,9 +487,12 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
 class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
-    """Implement AdvancedIncSubtensor1 on the gpu, but use function
+    """
+    Implement AdvancedIncSubtensor1 on the gpu, but use function
    only avail on compute capability 2.0 and more recent.
    """
    _f16_ok = True
    def make_node(self, x, y, ilist):

--- a/theano/sandbox/gpuarray/type.py
+++ b/theano/sandbox/gpuarray/type.py
@@ -217,10 +217,12 @@ class GpuArrayType(Type):
        return (hash(self.typecode) ^ hash(self.broadcastable))
    def dtype_specs(self):
-        """Return a tuple (python type, c type, numpy typenum) that corresponds
+        """
+        Return a tuple (python type, c type, numpy typenum) that corresponds
        to self.dtype.
        This function is used internally as part of C code generation.
        """
        # TODO: add more type correspondances for e.g. int32, int64, float32,
        # complex64, etc.
@@ -386,7 +388,10 @@ GpuArrayType.SharedVariable = GpuArraySharedVariable
 def gpuarray_shared_constructor(value, name=None, strict=False,
                                allow_downcast=None, borrow=False,
                                broadcastable=None):
-    """SharedVariable constructor for GpuArrayType"""
+    """
+    SharedVariable constructor for GpuArrayType.
+    """
    if not isinstance(value, (numpy.ndarray, pygpu.gpuarray.GpuArray)):
        raise TypeError('ndarray or GpuArray required')

--- a/theano/sandbox/linalg/ops.py
+++ b/theano/sandbox/linalg/ops.py
@@ -64,7 +64,7 @@ except ImportError:
 class Hint(Op):
    """
-    Provide arbitrary information to the optimizer
+    Provide arbitrary information to the optimizer.
    These ops are removed from the graph during canonicalization
    in order to not interfere with other optimizations.
@@ -122,7 +122,7 @@ def remove_hint_nodes(node):
 class HintsFeature(object):
    """
-    FunctionGraph Feature to track matrix properties
+    FunctionGraph Feature to track matrix properties.
    This is a similar feature to variable 'tags'. In fact, tags are one way
    to provide hints.
@@ -209,8 +209,12 @@ class HintsFeature(object):
 class HintsOptimizer(Optimizer):
-    """Optimizer that serves to add HintsFeature as an fgraph feature.
    """
+    Optimizer that serves to add HintsFeature as an fgraph feature.
+    """
    def __init__(self):
        Optimizer.__init__(self)
@@ -231,6 +235,7 @@ def psd(v):
    """
    Apply a hint that the variable `v` is positive semi-definite, i.e.
    it is a symmetric matrix and :math:`x^T A x \ge 0` for any vector x.
    """
    return Hint(psd=True, symmetric=True)(v)
@@ -294,6 +299,7 @@ def tag_solve_triangular(node):
    """
    If a general solve() is applied to the output of a cholesky op, then
    replace it with a triangular solve.
    """
    if node.op == solve:
        if node.op.A_structure == 'general':
@@ -396,12 +402,13 @@ def spectral_radius_bound(X, log2_exponent):
    Returns upper bound on the largest eigenvalue of square symmetrix matrix X.
    log2_exponent must be a positive-valued integer. The larger it is, the
-    slower and tighter the bound.  Values up to 5 should usually suffice.  The
+    slower and tighter the bound. Values up to 5 should usually suffice. The
    algorithm works by multiplying X by itself this many times.
    From V.Pan, 1990. "Estimating the Extremal Eigenvalues of a Symmetric
    Matrix", Computers Math Applic. Vol 20 n. 2 pp 17-22.
    Rq: an efficient algorithm, not used here, is defined in this paper.
    """
    if X.type.ndim != 2:
        raise TypeError('spectral_radius_bound requires a matrix argument', X)

--- a/theano/sandbox/scan_module/__init__.py
+++ b/theano/sandbox/scan_module/__init__.py
 """
-This module provides the Scan Op
+This module provides the Scan Op.
 Scanning is a general form of recurrence, which can be used for looping.
 The idea is that you *scan* a function along some input sequence, producing
 an output at each time-step that can be seen (but not modified) by the
-function at the next time-step. (Technically, the function can see the
+function at the next time-step. Technically, the function can see the
 previous K  time-steps of your outputs and L time steps (from the past and
 future) of your inputs.
@@ -26,6 +26,7 @@ the symbolic graph.
 The Scan Op should typically be used by calling any of the following
 functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``,
 ``foldr()``.
 """

--- a/theano/sandbox/scan_module/scan.py
+++ b/theano/sandbox/scan_module/scan.py
 """
-This module provides the Scan Op
+This module provides the Scan Op.
 Scanning is a general form of recurrence, which can be used for looping.
 The idea is that you *scan* a function along some input sequence, producing
 an output at each time-step that can be seen (but not modified) by the
-function at the next time-step. (Technically, the function can see the
+function at the next time-step. Technically, the function can see the
 previous K  time-steps of your outputs and L time steps (from past and
 future) of your inputs.
@@ -32,6 +32,7 @@ host at each step
 The Scan Op should typically be used by calling any of the following
 functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``,
 ``foldr()``.
 """
 __docformat__ = 'restructedtext en'
 __authors__ = ("Razvan Pascanu "
@@ -76,7 +77,9 @@ def scan(fn,
    This function constructs and applies a Scan op to the provided
    arguments.
-    :param fn:
+    Parameters
+    ----------
+    fn
        ``fn`` is a function that describes the operations involved in one
        step of ``scan``. ``fn`` should construct variables describing the
        output of one iteration step. It should expect as input theano
@@ -167,7 +170,7 @@ def scan(fn,
        number of steps ) is still required even though a condition is
        passed (and it is used to allocate memory if needed). = {}):
-    :param sequences:
+    sequences
        ``sequences`` is the list of Theano variables or dictionaries
        describing the sequences ``scan`` has to iterate over. If a
        sequence is given as wrapped in a dictionary, then a set of optional
@@ -185,8 +188,7 @@ def scan(fn,
        Any Theano variable in the list ``sequences`` is automatically
        wrapped into a dictionary where ``taps`` is set to ``[0]``
+    outputs_info
-    :param outputs_info:
        ``outputs_info`` is the list of Theano variables or dictionaries
        describing the initial state of the outputs computed
        recurrently. When this initial states are given as dictionary
@@ -243,15 +245,13 @@ def scan(fn,
        raised (because there is no convention on how scan should map
        the provided information to the outputs of ``fn``)
+    non_sequences
-    :param non_sequences:
        ``non_sequences`` is the list of arguments that are passed to
        ``fn`` at each steps. One can opt to exclude variable
        used in ``fn`` from this list as long as they are part of the
        computational graph, though for clarity we encourage not to do so.
+    n_steps
-    :param n_steps:
        ``n_steps`` is the number of steps to iterate given as an int
        or Theano scalar. If any of the input sequences do not have
        enough elements, scan will raise an error. If the *value is 0* the
@@ -261,8 +261,7 @@ def scan(fn,
        in time. If n stpes is not provided, ``scan`` will figure
        out the amount of steps it should run given its input sequences.
+    truncate_gradient
-    :param truncate_gradient:
        ``truncate_gradient`` is the number of steps to use in truncated
        BPTT.  If you compute gradients through a scan op, they are
        computed using backpropagation through time. By providing a
@@ -270,16 +269,14 @@ def scan(fn,
        of classical BPTT, where you go for only ``truncate_gradient``
        number of steps back in time.
+    go_backwards
-    :param go_backwards:
        ``go_backwards`` is a flag indicating if ``scan`` should go
        backwards through the sequences. If you think of each sequence
        as indexed by time, making this flag True would mean that
        ``scan`` goes back in time, namely that for any sequence it
        starts from the end and goes towards 0.
+    name
-    :param name:
        When profiling ``scan``, it is crucial to provide a name for any
        instance of ``scan``. The profiler will produce an overall
        profile of your code as well as profiles for the computation of
@@ -287,7 +284,7 @@ def scan(fn,
        appears in those profiles and can greatly help to disambiguate
        information.
-    :param mode:
+    mode
        It is recommended to leave this argument to None, especially
        when profiling ``scan`` (otherwise the results are not going to
        be accurate). If you prefer the computations of one step of
@@ -296,7 +293,7 @@ def scan(fn,
        loop are done (see ``theano.function`` for details about
        possible values and their meaning).
-    :param profile:
+    profile
        Flag or string. If true, or different from the empty string, a
        profile object will be created and attached to the inner graph of
        scan. In case ``profile`` is True, the profile object will have the
@@ -305,18 +302,21 @@ def scan(fn,
        inner graph with the new cvm linker (with default modes; with
        other linkers this argument is useless).
-    :rtype: tuple
+    Returns
-    :return: tuple of the form (outputs, updates); ``outputs`` is either a
+    -------
-             Theano variable or a list of Theano variables representing the
+    tuple
-             outputs of ``scan`` (in the same order as in
+        Tuple of the form (outputs, updates); ``outputs`` is either a
-             ``outputs_info``). ``updates`` is a subclass of dictionary
+        Theano variable or a list of Theano variables representing the
-             specifying the
+        outputs of ``scan`` (in the same order as in
-             update rules for all shared variables used in scan
+        ``outputs_info``). ``updates`` is a subclass of dictionary
-             This dictionary should be passed to ``theano.function`` when
+        specifying the
-             you compile your function. The change compared to a normal
+        update rules for all shared variables used in scan.
-             dictionary is that we validate that keys are SharedVariable
+        This dictionary should be passed to ``theano.function`` when
-             and addition of those dictionary are validated to be consistent.
+        you compile your function. The change compared to a normal
-    """
+        dictionary is that we validate that keys are SharedVariable
+        and addition of those dictionary are validated to be consistent.
+        """
    # Note : see the internal documentation of the scan op for naming
    # conventions and all other details
    if options is None:
@@ -544,6 +544,7 @@ def one_step_scan(fn,
                  truncate_gradient):
    """
    This function is evaluated if `n_steps` evaluates to either 1 or -1.
    """
    # 1. Grab slices of sequences
    inputs_slices = [input[0] for input in inputs]

--- a/theano/sandbox/scan_module/scan_op.py
+++ b/theano/sandbox/scan_module/scan_op.py
 """
-This module provides the Scan Op
+This module provides the Scan Op.
+See scan.py for details on scan.
-See scan.py for details on scan
 """
 from __future__ import print_function
@@ -157,25 +158,32 @@ class ScanOp(PureOp):
    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        """
-        :param node: the Apply node returned by the ``make_node`` function
-                     of the scan op class
-        :param storage_map: dict variable -> one-element-list where a computed
-               value for this variable may be found.
-        :param compute_map: dict variable -> one-element-list where a boolean
-                value will be found.  The boolean indicates whether the
-                variable's storage_map container contains a valid value (True)
-                or if it has not been computed yet (False).
-        :param no_recycling: list of variables for which it is forbidden to
+        Parameters
-                reuse memory allocated by a previous call.
+        ----------
+        node
+            The Apply node returned by the ``make_node`` function of the scan
+            op class.
+        storage_map
+            dict variable -> one-element-list where a computed value for this
+            variable may be found.
+        compute_map
+            dict variable -> one-element-list where a boolean value will be
+            found. The boolean indicates whether the variable's storage_map
+            container contains a valid value (True) or if it has not been
+            computed yet (False).
+        no_recycling
+            List of variables for which it is forbidden to reuse memory
+            allocated by a previous call.
+        Notes
+        -----
+        If the thunk consults the storage_map on every call, it is safe
+        for it to ignore the no_recycling argument, because elements of the
+        no_recycling list will have a value of None in the storage map. If
+        the thunk can potentially cache return values (like CLinker does),
+        then it must not do so for variables in the no_recycling list.
-        :note: If the thunk consults the storage_map on every call, it is safe
-            for it to ignore the no_recycling argument, because elements of the
-            no_recycling list will have a value of None in the storage map.  If
-            the thunk can potentially cache return values (like CLinker does),
-            then it must not do so for variables in the no_recycling list.
        """
        # 1. Collect all memory buffers
        node_input_storage = [storage_map[r] for r in node.inputs]

--- a/theano/sandbox/scan_module/scan_utils.py
+++ b/theano/sandbox/scan_module/scan_utils.py
 """
-This module provides utility functions for the Scan Op
+This module provides utility functions for the Scan Op.
+See scan.py for details on scan.
-See scan.py for details on scan
 """
 from __future__ import print_function
 __docformat__ = 'restructedtext en'
@@ -41,8 +42,11 @@ def expand(tensor_var, size):
    ``tensor_var``, namely:
        rval[:d1] = tensor_var
-    :param tensor_var: Theano tensor variable
+    Parameters
-    :param size: int
+    ----------
+    tensor_var : Theano tensor variable
+    size : int
    """
    # Corner case that I might use in an optimization
    if size == 0:
@@ -57,7 +61,8 @@ def expand(tensor_var, size):
 def to_list(ls):
    """
    Converts ``ls`` to list if it is a tuple, or wraps ``ls`` into a list if
-    it is not a list already
+    it is not a list already.
    """
    if isinstance(ls, (list, tuple)):
        return list(ls)
@@ -70,7 +75,9 @@ class until(object):
    Theano can end on a condition. In order to differentiate this condition
    from the other outputs of scan, this class is used to wrap the condition
    around it.
    """
    def __init__(self, condition):
        self.condition = tensor.as_tensor_variable(condition)
        assert self.condition.ndim == 0
@@ -78,10 +85,12 @@ class until(object):
 def get_updates_and_outputs(ls):
    """
-    Parses the list ``ls`` into outputs and updates. The semantics
+    Parses the list ``ls`` into outputs and updates.
-    of ``ls`` is defined by the constructive function of scan.
+    The semantics of ``ls`` is defined by the constructive function of scan.
    The elements of ``ls`` are either a list of expressions representing the
    outputs/states, a dictionary of updates or a condition.
    """
    def is_list_outputs(elem):
        if (isinstance(elem, (list, tuple)) and
@@ -150,23 +159,23 @@ def get_updates_and_outputs(ls):
 def clone(output, replace=None, strict=True, share_inputs=True):
    """
-    Function that allows replacing subgraphs of a computational
+    Function that allows replacing subgraphs of a computational graph.
-    graph. It returns a copy of the initial subgraph with the corresponding
+    It returns a copy of the initial subgraph with the corresponding
    substitutions.
-    :type output: Theano Variables (or Theano expressions)
+    Parameters
-    :param outputs: Theano expression that represents the computational
+    ----------
-                    graph
+    output : Theano Variables (or Theano expressions)
+        Theano expression that represents the computational graph.
-    :type replace: dict
+    replace : dict
-    :param replace: dictionary describing which subgraphs should be
+        Dictionary describing which subgraphs should be replaced by what.
-                    replaced by what
+    share_inputs : bool
+        If True, use the same inputs (and shared variables) as the original
+        graph. If False, clone them. Note that cloned shared variables still
+        use the same underlying storage, so they will always have the same
+        value.
-    :type share_inputs: bool
-    :param share_inputs: If True, use the same inputs (and shared variables)
-        as the original graph. If False, clone them. Note that cloned
-        shared variables still use the same underlying storage, so they
-        will always have the same value.
    """
    inps, outs, other_stuff = rebuild_collect_shared(output,
                                                     [],
@@ -189,6 +198,7 @@ def canonical_arguments(sequences,
    Mainly it makes sure that arguments are given as lists of dictionaries,
    and that the different fields of a dictionary are set to default
    value if the user has not provided any.
    """
    states_info = to_list(outputs_info)
    parameters = [tensor.as_tensor_variable(x) for x in to_list(non_sequences)]
@@ -303,13 +313,14 @@ def canonical_arguments(sequences,
 def infer_shape(outs, inputs, input_shapes):
-    '''
+    """
    Compute the shape of the outputs given the shape of the inputs
    of a theano graph.
-    We do it this way to avoid compiling the inner function just to get
+    We do it this way to avoid compiling the inner function just to get the
-    the shape. Changes to ShapeFeature could require changes in this function.
+    shape. Changes to ShapeFeature could require changes in this function.
-    '''
+    """
    # We use a ShapeFeature because it has all the necessary logic
    # inside.  We don't use the full ShapeFeature interface, but we
    # let it initialize itself with an empty fgraph, otherwise we will
@@ -326,9 +337,10 @@ def infer_shape(outs, inputs, input_shapes):
        shape_feature.set_shape(inp, inp_shp)
    def local_traverse(out):
-        '''
+        """
        Go back in the graph, from out, adding computable shapes to shape_of.
-        '''
+        """
        if out in shape_feature.shape_of:
            # Its shape is already known
@@ -358,14 +370,17 @@ def allocate_memory(T, y_info, y):
    """
    Allocates memory for an output of scan.
-    :param T: scalar
+    Parameters
-        Variable representing the number of steps scan will run
+    ----------
-    :param y_info: dict
+    T : scalar
+        Variable representing the number of steps scan will run.
+    y_info : dict
        Dictionary describing the output (more specifically describing shape
-        information for the output
+        information for the output).
-    :param y: Tensor variable
+    y : Tensor variable
        Expression describing the computation resulting in out entry of y.
-        It can be used to infer the shape of y
+        It can be used to infer the shape of y.
    """
    if 'shape' in y_info:
        return tensor.zeros([T, ] + list(y_info['shape']),