提交 709c9440 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #3988 from nouiz/small

A few small stuff
...@@ -51,7 +51,7 @@ Environment Variables ...@@ -51,7 +51,7 @@ Environment Variables
.. code-block:: bash .. code-block:: bash
THEANO_FLAGS='floatX=float32,device=gpu0,nvcc.fastmath=True' python <myscript>.py THEANO_FLAGS='floatX=float32,device=gpu0,lib.cnmem=1' python <myscript>.py
If a value is defined several times in ``THEANO_FLAGS``, If a value is defined several times in ``THEANO_FLAGS``,
the right-most definition is used. So, for instance, if the right-most definition is used. So, for instance, if
...@@ -72,15 +72,15 @@ Environment Variables ...@@ -72,15 +72,15 @@ Environment Variables
floatX = float32 floatX = float32
device = gpu0 device = gpu0
[nvcc] [lib]
fastmath = True cnmem = True
Configuration attributes that are available directly in ``config`` Configuration attributes that are available directly in ``config``
(e.g. ``config.device``, ``config.mode``) should be defined in the (e.g. ``config.device``, ``config.mode``) should be defined in the
``[global]`` section. ``[global]`` section.
Attributes from a subsection of ``config`` (e.g. ``config.nvcc.fastmath``, Attributes from a subsection of ``config`` (e.g. ``config.lib.cnmem``,
``config.blas.ldflags``) should be defined in their corresponding section ``config.dnn.conv.algo_fwd``) should be defined in their corresponding
(e.g. ``[nvcc]``, ``[blas]``). section (e.g. ``[nvcc]``, ``[dnn.conv]``).
Multiple configuration files can be specified by separating them with ':' Multiple configuration files can be specified by separating them with ':'
characters (as in $PATH). Multiple configuration files will be merged, characters (as in $PATH). Multiple configuration files will be merged,
...@@ -644,7 +644,8 @@ import theano and print the config variable, as in: ...@@ -644,7 +644,8 @@ import theano and print the config variable, as in:
<http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#options-for-steering-cuda-compilation>`_) <http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#options-for-steering-cuda-compilation>`_)
mode for compiled cuda code which makes div and sqrt faster at the mode for compiled cuda code which makes div and sqrt faster at the
cost of precision. This also disables support for denormal cost of precision. This also disables support for denormal
numbers. This can cause NaN. So if you have NaN and use this flag,
try to disable it.
.. attribute:: config.optimizer_excluding .. attribute:: config.optimizer_excluding
......
...@@ -69,3 +69,10 @@ In the most difficult situations, you may go through the above steps and find ...@@ -69,3 +69,10 @@ In the most difficult situations, you may go through the above steps and find
nothing wrong. If the above methods fail to uncover the cause, there is a good nothing wrong. If the above methods fail to uncover the cause, there is a good
chance that something is wrong with your algorithm. Go back to the mathematics chance that something is wrong with your algorithm. Go back to the mathematics
and find out if everything is derived correctly. and find out if everything is derived correctly.
Cuda Specific Option
--------------------
The Theano flag ``nvcc.fastmath=True`` can generate NaN. Don't set
this flag while debugging NaN.
...@@ -967,9 +967,14 @@ class Function(object): ...@@ -967,9 +967,14 @@ class Function(object):
for node in self.nodes_with_inner_function: for node in self.nodes_with_inner_function:
ops_with_inner_function[node.op].free() ops_with_inner_function[node.op].free()
def get_shared(self):
    """
    Return the shared variables read or updated by this function.

    Returns
    -------
    list
        The underlying variables of the implicit (shared) inputs of
        this compiled function, in the order they appear in
        ``self.maker.inputs``.
    """
    # Implicit inputs are the shared variables captured at compile time;
    # explicit (user-supplied) inputs are filtered out.
    return [i.variable for i in self.maker.inputs if i.implicit]
# pickling/deepcopy support for Function
# pickling/deepcopy support for Function
def _pickle_Function(f): def _pickle_Function(f):
# copy of the input storage list # copy of the input storage list
ins = list(f.input_storage) ins = list(f.input_storage)
......
...@@ -362,6 +362,8 @@ def print_compiledir_content(): ...@@ -362,6 +362,8 @@ def print_compiledir_content():
nb_keys = {} nb_keys = {}
for dir in os.listdir(compiledir): for dir in os.listdir(compiledir):
filename = os.path.join(compiledir, dir, "key.pkl") filename = os.path.join(compiledir, dir, "key.pkl")
if not os.path.exists(filename):
continue
with open(filename, 'rb') as file: with open(filename, 'rb') as file:
try: try:
keydata = pickle.load(file) keydata = pickle.load(file)
......
...@@ -493,6 +493,8 @@ def use(device, ...@@ -493,6 +493,8 @@ def use(device,
'fast_run') 'fast_run')
optdb.add_tags('gpu_after_fusion', optdb.add_tags('gpu_after_fusion',
'fast_run') 'fast_run')
optdb.add_tags('gpu_scanOp_make_inplace',
'fast_run')
if force: if force:
try: try:
......
...@@ -1948,10 +1948,8 @@ class GpuConv(GpuOp): ...@@ -1948,10 +1948,8 @@ class GpuConv(GpuOp):
images[2] * images[3] * 2) images[2] * images[3] * 2)
return flops return flops
def make_thunk(self, node, storage_map, compute_map, no_recycling): def prepare_node(self, node):
node_ = copy.copy(node) if node.op.max_threads_dim0 is None:
assert node.op is node_.op
if node_.op.max_threads_dim0 is None:
cuda = theano.sandbox.cuda cuda = theano.sandbox.cuda
device_id = cuda.use.device_number device_id = cuda.use.device_number
if device_id is None: if device_id is None:
...@@ -1964,9 +1962,7 @@ class GpuConv(GpuOp): ...@@ -1964,9 +1962,7 @@ class GpuConv(GpuOp):
device_id = cuda.use.device_number device_id = cuda.use.device_number
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
prop = cuda_ndarray.device_properties(device_id) prop = cuda_ndarray.device_properties(device_id)
node_.op.max_threads_dim0 = prop['maxThreadsDim0'] node.op.max_threads_dim0 = prop['maxThreadsDim0']
return super(GpuConv, node_.op).make_thunk(node_, storage_map,
compute_map, no_recycling)
def c_compile_args(self): def c_compile_args(self):
nb = 0 nb = 0
......
...@@ -1145,6 +1145,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -1145,6 +1145,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
:attr:`config.dnn.conv.precision`. :attr:`config.dnn.conv.precision`.
""" """
# For consistence, when using direction_hint too.
if border_mode == (0, 0):
border_mode = 'valid'
# Establish dtype in which to perform the computation of the convolution # Establish dtype in which to perform the computation of the convolution
if precision is None: if precision is None:
...@@ -1255,6 +1258,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), ...@@ -1255,6 +1258,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
:warning: dnn_conv3d only works with cuDNN library 3.0 :warning: dnn_conv3d only works with cuDNN library 3.0
""" """
if border_mode == (0, 0):
border_mode = 'valid'
# Establish dtype in which to perform the computation of the convolution # Establish dtype in which to perform the computation of the convolution
if precision is None: if precision is None:
......
...@@ -2535,12 +2535,13 @@ def local_gpu_allocempty(node): ...@@ -2535,12 +2535,13 @@ def local_gpu_allocempty(node):
def typeInfer(node): def typeInfer(node):
return typeConstructor return typeConstructor
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpu_scanOp_make_inplace', optdb.register('gpu_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeInfer=typeInfer, scan_opt.ScanInplaceOptimizer(typeInfer=typeInfer,
gpu_flag=True), gpu_flag=True),
75, 75,
'gpu', 'gpu',
'fast_run',
'inplace', 'inplace',
'scan') 'scan')
......
...@@ -78,6 +78,7 @@ if pygpu: ...@@ -78,6 +78,7 @@ if pygpu:
import theano.compile import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor) theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile') optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
optdb.add_tags('gpua_scanOp_make_inplace', 'fast_run')
elif (config.init_gpu_device.startswith('cuda') or elif (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')): config.init_gpu_device.startswith('opencl')):
if config.device != 'cpu': if config.device != 'cpu':
...@@ -91,6 +92,7 @@ if pygpu: ...@@ -91,6 +92,7 @@ if pygpu:
import theano.compile import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor) theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile') optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
optdb.add_tags('gpua_scanOp_make_inplace', 'fast_run')
from .basic_ops import (GpuAlloc, GpuAllocEmpty, GpuContiguous, GpuEye, from .basic_ops import (GpuAlloc, GpuAllocEmpty, GpuContiguous, GpuEye,
GpuFromHost, GpuJoin, GpuReshape, GpuSplit, GpuFromHost, GpuJoin, GpuReshape, GpuSplit,
......
...@@ -977,11 +977,12 @@ def _scan_type_infer(node): ...@@ -977,11 +977,12 @@ def _scan_type_infer(node):
context_name=context_name) context_name=context_name)
return typebuild return typebuild
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace', optdb.register('gpua_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeInfer=_scan_type_infer, scan_opt.ScanInplaceOptimizer(typeInfer=_scan_type_infer,
gpua_flag=True), gpua_flag=True),
75, 75,
'gpuarray', 'gpuarray',
'fast_run',
'inplace', 'inplace',
'scan') 'scan')
...@@ -1449,7 +1449,7 @@ from theano.sandbox.gpuarray.opt import (register_opt as register_gpua, ...@@ -1449,7 +1449,7 @@ from theano.sandbox.gpuarray.opt import (register_opt as register_gpua,
host_from_gpu as host_from_gpua) host_from_gpu as host_from_gpua)
@register_gpua() @register_gpua('fast_compile')
@local_optimizer([mrg_uniform]) @local_optimizer([mrg_uniform])
def local_gpua_mrg(node): def local_gpua_mrg(node):
if (type(node.op) == mrg_uniform and if (type(node.op) == mrg_uniform and
......
...@@ -392,6 +392,8 @@ class BaseAbstractConv2d(Op): ...@@ -392,6 +392,8 @@ class BaseAbstractConv2d(Op):
if isinstance(border_mode, tuple): if isinstance(border_mode, tuple):
pad_h, pad_w = map(int, border_mode) pad_h, pad_w = map(int, border_mode)
border_mode = (pad_h, pad_w) border_mode = (pad_h, pad_w)
if border_mode == (0, 0):
border_mode = 'valid'
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full', 'half')): border_mode in ('valid', 'full', 'half')):
raise ValueError( raise ValueError(
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论