提交 709c9440 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #3988 from nouiz/small

A few small stuff
...@@ -51,7 +51,7 @@ Environment Variables ...@@ -51,7 +51,7 @@ Environment Variables
.. code-block:: bash .. code-block:: bash
THEANO_FLAGS='floatX=float32,device=gpu0,nvcc.fastmath=True' python <myscript>.py THEANO_FLAGS='floatX=float32,device=gpu0,lib.cnmem=1' python <myscript>.py
If a value is defined several times in ``THEANO_FLAGS``, If a value is defined several times in ``THEANO_FLAGS``,
the right-most definition is used. So, for instance, if the right-most definition is used. So, for instance, if
...@@ -72,15 +72,15 @@ Environment Variables ...@@ -72,15 +72,15 @@ Environment Variables
floatX = float32 floatX = float32
device = gpu0 device = gpu0
[nvcc] [lib]
fastmath = True cnmem = True
Configuration attributes that are available directly in ``config`` Configuration attributes that are available directly in ``config``
(e.g. ``config.device``, ``config.mode``) should be defined in the (e.g. ``config.device``, ``config.mode``) should be defined in the
``[global]`` section. ``[global]`` section.
Attributes from a subsection of ``config`` (e.g. ``config.nvcc.fastmath``, Attributes from a subsection of ``config`` (e.g. ``config.lib.cnmem``,
``config.blas.ldflags``) should be defined in their corresponding section ``config.dnn.conv.algo_fwd``) should be defined in their corresponding
(e.g. ``[nvcc]``, ``[blas]``). section (e.g. ``[nvcc]``, ``[dnn.conv]``).
Multiple configuration files can be specified by separating them with ':' Multiple configuration files can be specified by separating them with ':'
characters (as in $PATH). Multiple configuration files will be merged, characters (as in $PATH). Multiple configuration files will be merged,
...@@ -644,7 +644,8 @@ import theano and print the config variable, as in: ...@@ -644,7 +644,8 @@ import theano and print the config variable, as in:
<http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#options-for-steering-cuda-compilation>`_) <http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#options-for-steering-cuda-compilation>`_)
mode for compiled cuda code which makes div and sqrt faster at the mode for compiled cuda code which makes div and sqrt faster at the
cost of precision. This also disables support for denormal cost of precision. This also disables support for denormal
numbers. This can cause NaN. So if you have NaN and use this flag,
try to disable it.
.. attribute:: config.optimizer_excluding .. attribute:: config.optimizer_excluding
......
...@@ -69,3 +69,10 @@ In the most difficult situations, you may go through the above steps and find ...@@ -69,3 +69,10 @@ In the most difficult situations, you may go through the above steps and find
nothing wrong. If the above methods fail to uncover the cause, there is a good nothing wrong. If the above methods fail to uncover the cause, there is a good
chance that something is wrong with your algorithm. Go back to the mathematics chance that something is wrong with your algorithm. Go back to the mathematics
and find out if everything is derived correctly. and find out if everything is derived correctly.
Cuda Specific Option
--------------------
The Theano flag ``nvcc.fastmath=True`` can generate NaN. Don't set
this flag while debugging NaN.
...@@ -967,9 +967,14 @@ class Function(object): ...@@ -967,9 +967,14 @@ class Function(object):
for node in self.nodes_with_inner_function: for node in self.nodes_with_inner_function:
ops_with_inner_function[node.op].free() ops_with_inner_function[node.op].free()
def get_shared(self):
    """
    Return the shared variables read or updated by this function.

    Returns
    -------
    list
        The underlying variables of the implicit (shared) inputs of
        this compiled function, in the order they appear in
        ``self.maker.inputs``.
    """
    # Implicit inputs are the shared variables captured at compile time;
    # explicit (user-supplied) inputs are filtered out.
    return [i.variable for i in self.maker.inputs if i.implicit]
# pickling/deepcopy support for Function
# pickling/deepcopy support for Function
def _pickle_Function(f): def _pickle_Function(f):
# copy of the input storage list # copy of the input storage list
ins = list(f.input_storage) ins = list(f.input_storage)
......
...@@ -362,6 +362,8 @@ def print_compiledir_content(): ...@@ -362,6 +362,8 @@ def print_compiledir_content():
nb_keys = {} nb_keys = {}
for dir in os.listdir(compiledir): for dir in os.listdir(compiledir):
filename = os.path.join(compiledir, dir, "key.pkl") filename = os.path.join(compiledir, dir, "key.pkl")
if not os.path.exists(filename):
continue
with open(filename, 'rb') as file: with open(filename, 'rb') as file:
try: try:
keydata = pickle.load(file) keydata = pickle.load(file)
......
...@@ -493,6 +493,8 @@ def use(device, ...@@ -493,6 +493,8 @@ def use(device,
'fast_run') 'fast_run')
optdb.add_tags('gpu_after_fusion', optdb.add_tags('gpu_after_fusion',
'fast_run') 'fast_run')
optdb.add_tags('gpu_scanOp_make_inplace',
'fast_run')
if force: if force:
try: try:
......
...@@ -1948,10 +1948,8 @@ class GpuConv(GpuOp): ...@@ -1948,10 +1948,8 @@ class GpuConv(GpuOp):
images[2] * images[3] * 2) images[2] * images[3] * 2)
return flops return flops
def make_thunk(self, node, storage_map, compute_map, no_recycling): def prepare_node(self, node):
node_ = copy.copy(node) if node.op.max_threads_dim0 is None:
assert node.op is node_.op
if node_.op.max_threads_dim0 is None:
cuda = theano.sandbox.cuda cuda = theano.sandbox.cuda
device_id = cuda.use.device_number device_id = cuda.use.device_number
if device_id is None: if device_id is None:
...@@ -1964,9 +1962,7 @@ class GpuConv(GpuOp): ...@@ -1964,9 +1962,7 @@ class GpuConv(GpuOp):
device_id = cuda.use.device_number device_id = cuda.use.device_number
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
prop = cuda_ndarray.device_properties(device_id) prop = cuda_ndarray.device_properties(device_id)
node_.op.max_threads_dim0 = prop['maxThreadsDim0'] node.op.max_threads_dim0 = prop['maxThreadsDim0']
return super(GpuConv, node_.op).make_thunk(node_, storage_map,
compute_map, no_recycling)
def c_compile_args(self): def c_compile_args(self):
nb = 0 nb = 0
......
...@@ -1145,6 +1145,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -1145,6 +1145,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
:attr:`config.dnn.conv.precision`. :attr:`config.dnn.conv.precision`.
""" """
# For consistence, when using direction_hint too.
if border_mode == (0, 0):
border_mode = 'valid'
# Establish dtype in which to perform the computation of the convolution # Establish dtype in which to perform the computation of the convolution
if precision is None: if precision is None:
...@@ -1255,6 +1258,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), ...@@ -1255,6 +1258,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
:warning: dnn_conv3d only works with cuDNN library 3.0 :warning: dnn_conv3d only works with cuDNN library 3.0
""" """
if border_mode == (0, 0):
border_mode = 'valid'
# Establish dtype in which to perform the computation of the convolution # Establish dtype in which to perform the computation of the convolution
if precision is None: if precision is None:
......
...@@ -2535,12 +2535,13 @@ def local_gpu_allocempty(node): ...@@ -2535,12 +2535,13 @@ def local_gpu_allocempty(node):
def typeInfer(node): def typeInfer(node):
return typeConstructor return typeConstructor
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpu_scanOp_make_inplace', optdb.register('gpu_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeInfer=typeInfer, scan_opt.ScanInplaceOptimizer(typeInfer=typeInfer,
gpu_flag=True), gpu_flag=True),
75, 75,
'gpu', 'gpu',
'fast_run',
'inplace', 'inplace',
'scan') 'scan')
......
...@@ -78,6 +78,7 @@ if pygpu: ...@@ -78,6 +78,7 @@ if pygpu:
import theano.compile import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor) theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile') optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
optdb.add_tags('gpua_scanOp_make_inplace', 'fast_run')
elif (config.init_gpu_device.startswith('cuda') or elif (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')): config.init_gpu_device.startswith('opencl')):
if config.device != 'cpu': if config.device != 'cpu':
...@@ -91,6 +92,7 @@ if pygpu: ...@@ -91,6 +92,7 @@ if pygpu:
import theano.compile import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor) theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile') optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
optdb.add_tags('gpua_scanOp_make_inplace', 'fast_run')
from .basic_ops import (GpuAlloc, GpuAllocEmpty, GpuContiguous, GpuEye, from .basic_ops import (GpuAlloc, GpuAllocEmpty, GpuContiguous, GpuEye,
GpuFromHost, GpuJoin, GpuReshape, GpuSplit, GpuFromHost, GpuJoin, GpuReshape, GpuSplit,
......
...@@ -977,11 +977,12 @@ def _scan_type_infer(node): ...@@ -977,11 +977,12 @@ def _scan_type_infer(node):
context_name=context_name) context_name=context_name)
return typebuild return typebuild
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace', optdb.register('gpua_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeInfer=_scan_type_infer, scan_opt.ScanInplaceOptimizer(typeInfer=_scan_type_infer,
gpua_flag=True), gpua_flag=True),
75, 75,
'gpuarray', 'gpuarray',
'fast_run',
'inplace', 'inplace',
'scan') 'scan')
...@@ -1449,7 +1449,7 @@ from theano.sandbox.gpuarray.opt import (register_opt as register_gpua, ...@@ -1449,7 +1449,7 @@ from theano.sandbox.gpuarray.opt import (register_opt as register_gpua,
host_from_gpu as host_from_gpua) host_from_gpu as host_from_gpua)
@register_gpua() @register_gpua('fast_compile')
@local_optimizer([mrg_uniform]) @local_optimizer([mrg_uniform])
def local_gpua_mrg(node): def local_gpua_mrg(node):
if (type(node.op) == mrg_uniform and if (type(node.op) == mrg_uniform and
......
...@@ -392,6 +392,8 @@ class BaseAbstractConv2d(Op): ...@@ -392,6 +392,8 @@ class BaseAbstractConv2d(Op):
if isinstance(border_mode, tuple): if isinstance(border_mode, tuple):
pad_h, pad_w = map(int, border_mode) pad_h, pad_w = map(int, border_mode)
border_mode = (pad_h, pad_w) border_mode = (pad_h, pad_w)
if border_mode == (0, 0):
border_mode = 'valid'
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full', 'half')): border_mode in ('valid', 'full', 'half')):
raise ValueError( raise ValueError(
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论