提交 72d2cbdc authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5814 from abergeron/rm_-r

Remove the old backend and all of its tentacles
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
BUILDBOT_DIR=$WORKSPACE/nightly_build BUILDBOT_DIR=$WORKSPACE/nightly_build
THEANO_PARAM="theano --with-timer --timer-top-n 10" THEANO_PARAM="theano --with-timer --timer-top-n 10"
export THEANO_FLAGS=init_gpu_device=gpu export THEANO_FLAGS=init_gpu_device=cuda
# CUDA # CUDA
export PATH=/usr/local/cuda/bin:$PATH export PATH=/usr/local/cuda/bin:$PATH
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
BUILDBOT_DIR=$WORKSPACE/nightly_build BUILDBOT_DIR=$WORKSPACE/nightly_build
THEANO_PARAM="theano --with-timer --timer-top-n 10 -v" THEANO_PARAM="theano --with-timer --timer-top-n 10 -v"
export THEANO_FLAGS=init_gpu_device=gpu export THEANO_FLAGS=init_gpu_device=cuda
# CUDA # CUDA
export PATH=/usr/local/cuda/bin:$PATH export PATH=/usr/local/cuda/bin:$PATH
......
...@@ -4,7 +4,7 @@ BUILDBOT_DIR=$WORKSPACE/nightly_build ...@@ -4,7 +4,7 @@ BUILDBOT_DIR=$WORKSPACE/nightly_build
THEANO_PARAM="theano --with-timer --timer-top-n 10" THEANO_PARAM="theano --with-timer --timer-top-n 10"
# Set test reports using nosetests xunit # Set test reports using nosetests xunit
XUNIT="--with-xunit --xunit-file=" XUNIT="--with-xunit --xunit-file="
export THEANO_FLAGS=init_gpu_device=gpu export THEANO_FLAGS=init_gpu_device=cuda
# CUDA # CUDA
export PATH=/usr/local/cuda/bin:$PATH export PATH=/usr/local/cuda/bin:$PATH
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
BUILDBOT_DIR=$WORKSPACE/nightly_build BUILDBOT_DIR=$WORKSPACE/nightly_build
THEANO_PARAM="theano --with-timer --timer-top-n 10" THEANO_PARAM="theano --with-timer --timer-top-n 10"
export THEANO_FLAGS=init_gpu_device=gpu export THEANO_FLAGS=init_gpu_device=cuda
# CUDA # CUDA
export PATH=/usr/local/cuda/bin:$PATH export PATH=/usr/local/cuda/bin:$PATH
......
...@@ -11,7 +11,7 @@ export PATH=/usr/local/miniconda2/bin:$PATH ...@@ -11,7 +11,7 @@ export PATH=/usr/local/miniconda2/bin:$PATH
echo "===== Testing theano core" echo "===== Testing theano core"
# Test theano core # Test theano core
PARTS="theano -e cuda -e gpuarray" PARTS="theano -e gpuarray"
THEANO_PARAM="${PARTS} --with-timer --timer-top-n 10 --with-xunit --xunit-file=theanocore_tests.xml" THEANO_PARAM="${PARTS} --with-timer --timer-top-n 10 --with-xunit --xunit-file=theanocore_tests.xml"
FLAGS="mode=FAST_RUN,floatX=float32,on_opt_error=raise,on_shape_error=raise,cmodule.age_thresh_use=604800" FLAGS="mode=FAST_RUN,floatX=float32,on_opt_error=raise,on_shape_error=raise,cmodule.age_thresh_use=604800"
THEANO_FLAGS=${FLAGS} bin/theano-nose ${THEANO_PARAM} THEANO_FLAGS=${FLAGS} bin/theano-nose ${THEANO_PARAM}
...@@ -13,27 +13,6 @@ export PATH=/usr/local/cuda/bin:$PATH ...@@ -13,27 +13,6 @@ export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
echo "===== Testing old theano.sandbox.cuda backend"
THEANO_CUDA_TESTS="theano/sandbox/cuda/tests \
theano/misc/tests/test_pycuda_example.py \
theano/misc/tests/test_pycuda_theano_simple.py \
theano/misc/tests/test_pycuda_utils.py \
theano/tensor/tests/test_opt.py:TestCompositeCodegen \
theano/tensor/tests/test_opt.py:test_shapeoptimizer \
theano/tensor/tests/test_opt.py:test_fusion \
theano/compile/tests/test_debugmode.py:Test_preallocated_output \
theano/sparse/tests/test_basic.py:DotTests \
theano/sandbox/tests/test_multinomial.py:test_gpu_opt \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPU_serial \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPU_parallel \
theano/sandbox/tests/test_rng_mrg.py:test_GPU_nstreams_limit \
theano/sandbox/tests/test_rng_mrg.py:test_overflow_gpu_old_backend \
theano/scan_module/tests/test_scan.py:T_Scan_Cuda"
THEANO_PARAM="${THEANO_CUDA_TESTS} --with-timer --timer-top-n 10 --with-xunit --xunit-file=theanocuda_tests.xml"
FLAGS="mode=FAST_RUN,init_gpu_device=gpu,floatX=float32"
THEANO_FLAGS=${FLAGS} bin/theano-nose ${THEANO_PARAM}
echo "===== Testing gpuarray backend" echo "===== Testing gpuarray backend"
GPUARRAY_CONFIG="Release" GPUARRAY_CONFIG="Release"
...@@ -72,9 +51,6 @@ python -c 'import pygpu; print(pygpu.__file__)' ...@@ -72,9 +51,6 @@ python -c 'import pygpu; print(pygpu.__file__)'
# Testing theano (the gpuarray parts) # Testing theano (the gpuarray parts)
THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \ THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_serial \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_parallel \
theano/sandbox/tests/test_rng_mrg.py:test_GPUA_full_fill \
theano/scan_module/tests/test_scan.py:T_Scan_Gpuarray \ theano/scan_module/tests/test_scan.py:T_Scan_Gpuarray \
theano/scan_module/tests/test_scan_checkpoints.py:TestScanCheckpoint.test_memory" theano/scan_module/tests/test_scan_checkpoints.py:TestScanCheckpoint.test_memory"
FLAGS="init_gpu_device=$DEVICE,gpuarray.preallocate=1000,mode=FAST_RUN,on_opt_error=raise,on_shape_error=raise,cmodule.age_thresh_use=604800" FLAGS="init_gpu_device=$DEVICE,gpuarray.preallocate=1000,mode=FAST_RUN,on_opt_error=raise,on_shape_error=raise,cmodule.age_thresh_use=604800"
......
...@@ -537,8 +537,8 @@ You can implement c_code for this op. You register it like this: ...@@ -537,8 +537,8 @@ You can implement c_code for this op. You register it like this:
In your C code, you should use %(iname)s and %(oname)s to represent In your C code, you should use %(iname)s and %(oname)s to represent
the C variable names of the DeepCopyOp input and output the C variable names of the DeepCopyOp input and output
respectively. See an example for the type ``CudaNdarrayType`` (GPU respectively. See an example for the type ``GpuArrayType`` (GPU
array) in the file `theano/sandbox/cuda/type.py`. The version array) in the file `theano/gpuarray/type.py`. The version
parameter is what is returned by DeepCopyOp.c_code_cache_version(). By parameter is what is returned by DeepCopyOp.c_code_cache_version(). By
default, it will recompile the c code for each process. default, it will recompile the c code for each process.
...@@ -559,8 +559,8 @@ calling: ...@@ -559,8 +559,8 @@ calling:
In your C code, you should use %(iname)s and %(oname)s to represent In your C code, you should use %(iname)s and %(oname)s to represent
the C variable names of the ViewOp input and output the C variable names of the ViewOp input and output
respectively. See an example for the type ``CudaNdarrayType`` (GPU respectively. See an example for the type ``GpuArrayType`` (GPU
array) in the file `theano/sandbox/cuda/type.py`. The version array) in the file `theano/gpuarray/type.py`. The version
parameter is what is returned by ViewOp.c_code_cache_version(). By parameter is what is returned by ViewOp.c_code_cache_version(). By
default, it will recompile the c code for each process. default, it will recompile the c code for each process.
......
...@@ -98,7 +98,7 @@ possibilities you may encounter or need. For that refer to ...@@ -98,7 +98,7 @@ possibilities you may encounter or need. For that refer to
def c_code(self, node, inputs, outputs, sub): def c_code(self, node, inputs, outputs, sub):
pass pass
# Other implementations (pycuda, ...): # Other implementations:
def make_thunk(self, node, storage_map, _, _2, impl=None): def make_thunk(self, node, storage_map, _, _2, impl=None):
pass pass
...@@ -194,8 +194,7 @@ or :func:`make_thunk`. ...@@ -194,8 +194,7 @@ or :func:`make_thunk`.
It should have a default value of None. It should have a default value of None.
:func:`make_thunk` is useful if you want to generate code and compile :func:`make_thunk` is useful if you want to generate code and compile
it yourself. For example, this allows you to use PyCUDA to compile GPU it yourself.
code and keep state in the thunk.
If :func:`make_thunk()` is defined by an op, it will be used by Theano If :func:`make_thunk()` is defined by an op, it will be used by Theano
to obtain the op's implementation. to obtain the op's implementation.
...@@ -674,14 +673,6 @@ For instance, to verify the Rop method of the DoubleOp, you can use this: ...@@ -674,14 +673,6 @@ For instance, to verify the Rop method of the DoubleOp, you can use this:
def test_double_rop(self): def test_double_rop(self):
self.check_rop_lop(DoubleRop()(self.x), self.in_shape) self.check_rop_lop(DoubleRop()(self.x), self.in_shape)
Testing GPU Ops
^^^^^^^^^^^^^^^
When using the old GPU backend, Ops to be executed on the GPU should inherit
from ``theano.sandbox.cuda.GpuOp`` and not ``theano.Op``. This allows
Theano to distinguish them. Currently, we use this to test if the
NVIDIA driver works correctly with our sum reduction code on the GPU.
Running Your Tests Running Your Tests
^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^
......
...@@ -309,5 +309,5 @@ As long as the computations happen on the NULL stream there are no ...@@ -309,5 +309,5 @@ As long as the computations happen on the NULL stream there are no
special considerations to watch for with regards to synchronization. special considerations to watch for with regards to synchronization.
Otherwise, you will have to make sure that you synchronize the pygpu Otherwise, you will have to make sure that you synchronize the pygpu
objects by calling the `.sync()` method before scheduling any work and objects by calling the `.sync()` method before scheduling any work and
synchronize with the work that happends in the library after all the synchronize with the work that happens in the library after all the
work is scheduled. work is scheduled.
...@@ -51,28 +51,29 @@ optimizations and disables the generation of any c/cuda code. This is useful ...@@ -51,28 +51,29 @@ optimizations and disables the generation of any c/cuda code. This is useful
for quickly testing a simple idea. for quickly testing a simple idea.
If c/cuda code is necessary, as when using a GPU, the flag If c/cuda code is necessary, as when using a GPU, the flag
``optimizer=fast_compile`` can be used instead. It instructs Theano to skip time ``optimizer=fast_compile`` can be used instead. It instructs Theano to
consuming optimizations but still generate c/cuda code. To get the most out of skip time consuming optimizations but still generate c/cuda code.
this flag requires using a development version of Theano instead of the latest
release (0.6).
Similarly using the flag ``optimizer_excluding=inplace`` will speed up Similarly using the flag ``optimizer_excluding=inplace`` will speed up
compilation by preventing optimizations that replace operations with a version compilation by preventing optimizations that replace operations with a
that reuses memory where it will not negatively impact the integrity of the version that reuses memory where it will not negatively impact the
operation. Such optimizations can be time consuming. However using this flag will integrity of the operation. Such optimizations can be time
result in greater memory usage because space must be allocated for the results consuming. However using this flag will result in greater memory usage
which would be unnecessary otherwise. In short, using this flag will speed up because space must be allocated for the results which would be
unnecessary otherwise. In short, using this flag will speed up
compilation but it will also use more memory because compilation but it will also use more memory because
``optimizer_excluding=inplace`` excludes inplace optimizations resulting ``optimizer_excluding=inplace`` excludes inplace optimizations
in a trade off between speed of compilation and memory usage. resulting in a trade off between speed of compilation and memory
usage.
Theano flag `reoptimize_unpickled_function` controls if an unpickled theano function
should reoptimize its graph or not. Theano users can use the standard python pickle Theano flag `reoptimize_unpickled_function` controls if an unpickled
tools to save a compiled theano function. When pickling, both graph before and theano function should reoptimize its graph or not. Theano users can
after the optimization are saved, including shared variables. When set to True, use the standard python pickle tools to save a compiled theano
the graph is reoptimized when being unpickled. Otherwise, skip the graph optimization function. When pickling, both graph before and after the optimization
and use directly the optimized graph from the pickled file. After Theano 0.7, are saved, including shared variables. When set to True, the graph is
the default changed to False. reoptimized when being unpickled. Otherwise, skip the graph
optimization and use directly the optimized graph from the pickled
file. The default is False.
Faster Theano function Faster Theano function
---------------------- ----------------------
......
...@@ -21,6 +21,12 @@ learning/machine learning <https://mila.umontreal.ca/en/cours/>`_ classes). ...@@ -21,6 +21,12 @@ learning/machine learning <https://mila.umontreal.ca/en/cours/>`_ classes).
News News
==== ====
* Removed support for the old (device=gpu) backend. Use the new
backend (device=cuda) for gpu computing. See `Converting to the new
gpu back end (gpuarray)
<https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29>`_
for help with conversion.
* 2017/03/20: Release of Theano 0.9.0. Everybody is encouraged to update. * 2017/03/20: Release of Theano 0.9.0. Everybody is encouraged to update.
* 2017/03/13: Release of Theano 0.9.0rc4, with crash fixes and bug fixes. * 2017/03/13: Release of Theano 0.9.0rc4, with crash fixes and bug fixes.
...@@ -37,7 +43,7 @@ News ...@@ -37,7 +43,7 @@ News
`Theano: A Python framework for fast computation of mathematical expressions <http://arxiv.org/abs/1605.02688>`_. `Theano: A Python framework for fast computation of mathematical expressions <http://arxiv.org/abs/1605.02688>`_.
This is the new preferred reference. This is the new preferred reference.
* 2016/04/21: Release of Theano 0.8.2, adding support for :ref:`CuDNN v5 <libdoc_cuda_dnn>`. * 2016/04/21: Release of Theano 0.8.2, adding support for CuDNN v5.
* 2016/03/29: Release of Theano 0.8.1, fixing a compilation issue on MacOS X with XCode 7.3. * 2016/03/29: Release of Theano 0.8.1, fixing a compilation issue on MacOS X with XCode 7.3.
...@@ -45,12 +51,11 @@ News ...@@ -45,12 +51,11 @@ News
* Multi-GPU. * Multi-GPU.
* We added support for :attr:`CNMeM <config.lib.cnmem>` to speed up * We added support for CNMeM to speed up the GPU memory allocation.
the GPU memory allocation.
* Theano 0.7 was released 26th March 2015. Everybody is encouraged to update. * Theano 0.7 was released 26th March 2015. Everybody is encouraged to update.
* We support `cuDNN <http://deeplearning.net/software/theano/library/sandbox/cuda/dnn.html>`_ if it is installed by the user. * We support cuDNN if it is installed by the user.
* Open Machine Learning Workshop 2014 `presentation <omlw2014/omlw_presentation.pdf>`_. * Open Machine Learning Workshop 2014 `presentation <omlw2014/omlw_presentation.pdf>`_.
...@@ -276,4 +281,3 @@ StackOverflow, follow their guidance for `answering questions <http://stackoverf ...@@ -276,4 +281,3 @@ StackOverflow, follow their guidance for `answering questions <http://stackoverf
.. _LISA: http://www.iro.umontreal.ca/~lisa .. _LISA: http://www.iro.umontreal.ca/~lisa
.. _University of Montreal: http://www.umontreal.ca .. _University of Montreal: http://www.umontreal.ca
...@@ -24,8 +24,8 @@ Ubuntu Installation Instructions ...@@ -24,8 +24,8 @@ Ubuntu Installation Instructions
Prerequisites through System Packages (not recommended) Prerequisites through System Packages (not recommended)
------------------------------------------------------- -------------------------------------------------------
If you want to acquire the requirements through your system packages and install If you want to acquire the requirements through your system packages
them system wide follow these instructions: and install them system wide follow these instructions:
For Ubuntu 16.04 with cuda 7.5 For Ubuntu 16.04 with cuda 7.5
...@@ -49,9 +49,6 @@ For Ubuntu 16.04 with cuda 7.5 ...@@ -49,9 +49,6 @@ For Ubuntu 16.04 with cuda 7.5
sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 30 sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 30
sudo update-alternatives --set c++ /usr/bin/g++ sudo update-alternatives --set c++ /usr/bin/g++
# Work around a glibc bug
echo -e "\n[nvcc]\nflags=-D_FORCE_INLINES\n" >> ~/.theanorc
For Ubuntu 11.10 through 14.04: For Ubuntu 11.10 through 14.04:
.. code-block:: bash .. code-block:: bash
......
...@@ -54,8 +54,8 @@ You must reboot the computer after the driver installation. ...@@ -54,8 +54,8 @@ You must reboot the computer after the driver installation.
Instructions for other Python distributions (not recommended) Instructions for other Python distributions (not recommended)
============================================================= =============================================================
If you plan to use Theano with other Python distributions, these are generic guidelines to get If you plan to use Theano with other Python distributions, these are
a working environment: generic guidelines to get a working environment:
* Look for the mandatory requirements in the package manager's repositories of your distribution. Many * Look for the mandatory requirements in the package manager's repositories of your distribution. Many
distributions come with ``pip`` package manager which use `PyPI repository <https://pypi.python.org/pypi>`__. distributions come with ``pip`` package manager which use `PyPI repository <https://pypi.python.org/pypi>`__.
......
...@@ -9,8 +9,7 @@ out Theano easy. You can install a stable version of Theano, without having to ...@@ -9,8 +9,7 @@ out Theano easy. You can install a stable version of Theano, without having to
worry about the current state of the repository. While we usually try NOT to worry about the current state of the repository. While we usually try NOT to
break the trunk, mistakes can happen. This also greatly simplifies the break the trunk, mistakes can happen. This also greatly simplifies the
installation process: mercurial is no longer required and certain python installation process: mercurial is no longer required and certain python
dependencies can be handled automatically (numpy for now, maybe pycuda, cython dependencies can be handled automatically (numpy for now, cython later).
later).
The Theano release plan is detailed below. Comments and/or suggestions are The Theano release plan is detailed below. Comments and/or suggestions are
welcome on the mailing list. welcome on the mailing list.
......
...@@ -51,7 +51,7 @@ Environment Variables ...@@ -51,7 +51,7 @@ Environment Variables
.. code-block:: bash .. code-block:: bash
THEANO_FLAGS='floatX=float32,device=cuda0,lib.cnmem=1' python <myscript>.py THEANO_FLAGS='floatX=float32,device=cuda0,gpuarray.preallocate=1' python <myscript>.py
If a value is defined several times in ``THEANO_FLAGS``, If a value is defined several times in ``THEANO_FLAGS``,
the right-most definition is used. So, for instance, if the right-most definition is used. So, for instance, if
...@@ -72,15 +72,15 @@ Environment Variables ...@@ -72,15 +72,15 @@ Environment Variables
floatX = float32 floatX = float32
device = cuda0 device = cuda0
[lib] [gpuarray]
cnmem = 1 preallocate = 1
Configuration attributes that are available directly in ``config`` Configuration attributes that are available directly in ``config``
(e.g. ``config.device``, ``config.mode``) should be defined in the (e.g. ``config.device``, ``config.mode``) should be defined in the
``[global]`` section. ``[global]`` section.
Attributes from a subsection of ``config`` (e.g. ``config.lib.cnmem``, Attributes from a subsection of ``config`` (e.g. ``config.gpuarray.preallocate``,
``config.dnn.conv.algo_fwd``) should be defined in their corresponding ``config.dnn.conv.algo_fwd``) should be defined in their corresponding
section (e.g. ``[nvcc]``, ``[dnn.conv]``). section (e.g. ``[gpuarray]``, ``[dnn.conv]``).
Multiple configuration files can be specified by separating them with ':' Multiple configuration files can be specified by separating them with ':'
characters (as in $PATH). Multiple configuration files will be merged, characters (as in $PATH). Multiple configuration files will be merged,
...@@ -105,14 +105,12 @@ import theano and print the config variable, as in: ...@@ -105,14 +105,12 @@ import theano and print the config variable, as in:
.. attribute:: device .. attribute:: device
String value: either ``'cpu'``, ``'cuda'``, ``'cuda0'``, ``'cuda1'``, String value: either ``'cpu'``, ``'cuda'``, ``'cuda0'``, ``'cuda1'``,
``'opencl0:0'``, ``'opencl0:1'``, ``'gpu'``, ``'gpu0'`` ... ``'opencl0:0'``, ``'opencl0:1'``, ...
Default device for computations. If ``'cuda*``, change the default to try Default device for computations. If ``'cuda*``, change the default to try
to move computation to the GPU using CUDA libraries. If ``'opencl*'``, to move computation to the GPU using CUDA libraries. If ``'opencl*'``,
the openCL libraries will be used. To let the driver select the device, the openCL libraries will be used. To let the driver select the device,
use ``'cuda'`` or ``'opencl'``. If ``'gpu*'``, the old gpu backend will use ``'cuda'`` or ``'opencl'``. If we are not able to use the GPU,
be used, although users are encouraged to migrate to the new GpuArray
backend. If we are not able to use the GPU,
either we fall back on the CPU, or an error is raised, depending either we fall back on the CPU, or an error is raised, depending
on the :attr:`force_device` flag. on the :attr:`force_device` flag.
...@@ -140,10 +138,10 @@ import theano and print the config variable, as in: ...@@ -140,10 +138,10 @@ import theano and print the config variable, as in:
.. attribute:: init_gpu_device .. attribute:: init_gpu_device
String value: either ``''``, ``'cuda'``, ``'cuda0'``, ``'cuda1'``, String value: either ``''``, ``'cuda'``, ``'cuda0'``, ``'cuda1'``,
``'opencl0:0'``, ``'opencl0:1'``, ``'gpu'``, ``'gpu0'`` ... ``'opencl0:0'``, ``'opencl0:1'``, ...
Initialize the gpu device to use. Initialize the gpu device to use.
When its value is ``'cuda*'``, ``'opencl*'`` or ``'gpu*'``, the theano When its value is ``'cuda*'`` or ``'opencl*'``, the theano
flag :attr:`device` must be ``'cpu'``. flag :attr:`device` must be ``'cpu'``.
Unlike :attr:`device`, setting this flag to a specific GPU will not Unlike :attr:`device`, setting this flag to a specific GPU will not
try to use this device by default, in particular it will **not** move try to use this device by default, in particular it will **not** move
...@@ -154,20 +152,6 @@ import theano and print the config variable, as in: ...@@ -154,20 +152,6 @@ import theano and print the config variable, as in:
This flag's value cannot be modified during the program execution. This flag's value cannot be modified during the program execution.
.. attribute:: config.pycuda.init
Bool value: either ``True`` or ``False``
Default: ``False``
If True, always initialize PyCUDA when Theano want to initialize
the GPU. With PyCUDA version 2011.2.2 or earlier, PyCUDA must
initialize the GPU before Theano does it. Setting
this flag to True, ensure that, but always import PyCUDA. It can
be done manually by importing ``theano.misc.pycuda_init`` before
Theano initialize the GPU device. Newer version of PyCUDA
(currently only in the trunk) don't have this restriction.
.. attribute:: print_active_device .. attribute:: print_active_device
Bool value: either ``True`` or ``False`` Bool value: either ``True`` or ``False``
...@@ -176,14 +160,6 @@ import theano and print the config variable, as in: ...@@ -176,14 +160,6 @@ import theano and print the config variable, as in:
Print active device at when the GPU device is initialized. Print active device at when the GPU device is initialized.
.. attribute:: enable_initial_driver_test
Bool value: either ``True`` or ``False``
Default: ``True``
Tests the nvidia driver when a GPU device is initialized.
.. attribute:: floatX .. attribute:: floatX
String value: ``'float64'``, ``'float32'``, or ``'float16'`` (with limited support) String value: ``'float64'``, ``'float32'``, or ``'float16'`` (with limited support)
...@@ -455,48 +431,6 @@ import theano and print the config variable, as in: ...@@ -455,48 +431,6 @@ import theano and print the config variable, as in:
automatically to get more memory. But this can cause automatically to get more memory. But this can cause
fragmentation, see note above. fragmentation, see note above.
.. attribute:: config.lib.cnmem
.. note::
This value allocates GPU memory ONLY when using (:ref:`cuda`)
and has no effect when the GPU backend is (:ref:`gpuarray`).
For the new backend, please see ``config.gpuarray.preallocate``
Float value: >= 0
Controls the use of `CNMeM <https://github.com/NVIDIA/cnmem>`_ (a
faster CUDA memory allocator). Applies to the old GPU backend
:ref:`cuda` up to Theano release 0.8.
The CNMeM library is included in Theano and does not need to be
separately installed.
The value represents the start size (either in MB or the fraction of total GPU
memory) of the memory pool. If more memory is needed, Theano will
try to obtain more, but this can cause memory fragmentation.
* 0: not enabled.
* 0 < N <= 1: use this fraction of the total GPU memory (clipped to .95 for driver memory).
* > 1: use this number in megabytes (MB) of memory.
Default: 0
.. note::
This could cause memory fragmentation. So if you have a
memory error while using CNMeM, try to allocate more memory at
the start or disable it. If you try this, report your result
on :ref:`theano-dev`.
.. note::
The clipping at 95% can be bypassed by specifying the exact
number of megabytes. If more than 95% are needed, it will try
automatically to get more memory. But this can cause
fragmentation, see note above.
.. attribute:: config.gpuarray.sched .. attribute:: config.gpuarray.sched
String value: ``'default'``, ``'multi'``, ``'single'`` String value: ``'default'``, ``'multi'``, ``'single'``
...@@ -664,20 +598,6 @@ import theano and print the config variable, as in: ...@@ -664,20 +598,6 @@ import theano and print the config variable, as in:
As such this optimization does not always introduce an assert in the graph. As such this optimization does not always introduce an assert in the graph.
Removing the assert could speed up execution. Removing the assert could speed up execution.
.. attribute:: config.cuda.root
Default: $CUDA_ROOT or failing that, ``"/usr/local/cuda"``
A directory with bin/, lib/, include/ folders containing cuda utilities.
.. attribute:: config.cuda.enabled
Bool value: either ``True`` or ``False``
Default: ``True``
If set to `False`, C code in old backend is not compiled.
.. attribute:: config.dnn.enabled .. attribute:: config.dnn.enabled
String value: ``'auto'``, ``'True'``, ``'False'`` String value: ``'auto'``, ``'True'``, ``'False'``
...@@ -774,19 +694,6 @@ import theano and print the config variable, as in: ...@@ -774,19 +694,6 @@ import theano and print the config variable, as in:
This can be any compiler binary (full path or not) but things may This can be any compiler binary (full path or not) but things may
break if the interface is not g++-compatible to some degree. break if the interface is not g++-compatible to some degree.
.. attribute:: config.nvcc.fastmath
Bool value, default: ``False``
If true, this will enable fastmath (|use_fast_math|_)
mode for compiled cuda code which makes div and sqrt faster at the
cost of precision. This also disables support for denormal
numbers. This can cause NaN. So if you have NaN and use this flag,
try to disable it.
.. |use_fast_math| replace:: ``--use-fast-math``
.. _use_fast_math: http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#options-for-steering-cuda-compilation
.. attribute:: config.optimizer_excluding .. attribute:: config.optimizer_excluding
Default: ``""`` Default: ``""``
......
.. _libdoc_cuda_dnn:
=======================================
:mod:`theano.sandbox.cuda.dnn` -- cuDNN
=======================================
.. moduleauthor:: LISA
`cuDNN <https://developer.nvidia.com/cuDNN>`_ is an NVIDIA library with
functionality used by deep neural network. It provides optimized versions
of some operations like the convolution. cuDNN is not currently
installed with CUDA. You must download and install it
yourself.
To install it, decompress the downloaded file and make the ``*.h`` and
``*.so*`` files available to the compilation environment.
There are at least three possible ways of doing so:
- The easiest is to include them in your CUDA installation. Copy the
``*.h`` files to ``CUDA_ROOT/include`` and the ``*.so*`` files to
``CUDA_ROOT/lib64`` (by default, ``CUDA_ROOT`` is ``/usr/local/cuda``
on Linux).
- Alternatively, on Linux, you can set the environment variables
``LD_LIBRARY_PATH``, ``LIBRARY_PATH`` and ``CPATH`` to the directory
extracted from the download. If needed, separate multiple directories
with ``:`` as in the ``PATH`` environment variable.
example::
export LD_LIBRARY_PATH=/home/user/path_to_CUDNN_folder/lib64:$LD_LIBRARY_PATH
export CPATH=/home/user/path_to_CUDNN_folder/include:$CPATH
export LIBRARY_PATH=/home/user/path_to_CUDNN_folder/lib64:$LIBRARY_PATH
- And as a third way, also on Linux, you can copy the ``*.h`` files
to ``/usr/include`` and the ``*.so*`` files to ``/lib64``.
By default, Theano will detect if it can use cuDNN. If so, it will use
it. If not, Theano optimizations will not introduce cuDNN ops. So
Theano will still work if the user did not introduce them manually.
The recently added Theano flag :attr:`dnn.enabled
<config.dnn.enabled>` allows changing the default behavior to force
it or disable it. Older Theano versions do not support this flag. To
get an error when cuDNN can not be used with them, use this flag:
``optimizer_including=cudnn``.
.. note::
cuDNN v5.1 is supported in the Theano development (master) version, which dropped cuDNN v3 support.
Theano 0.8.0 and 0.8.1 support only cuDNN v3 and v4.
Theano 0.8.2 will support only v4 and v5.
.. note::
Starting in cuDNN v3, multiple convolution implementations are offered and
it is possible to use heuristics to automatically choose a convolution
implementation well suited to the parameters of the convolution.
The Theano flag ``dnn.conv.algo_fwd`` allows specifying the cuDNN
convolution implementation that Theano should use for forward convolutions.
Possible values include:
* ``small`` (default) : use a convolution implementation with small memory
usage
* ``none`` : use a slower implementation with minimal memory usage
* ``large`` : use a sometimes faster implementation with large memory usage
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``fft_tiling`` : use the Fast Fourier Transform implementation of convolution
with tiling (high memory usage, but less than fft)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to cuDNN's heuristics and reused
for every subsequent execution of the convolution.
* ``guess_on_shape_change`` : like ``guess_once`` but a new convolution
implementation selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
* ``time_once`` : the first time a convolution is executed, every convolution
implementation offered by cuDNN is executed and timed. The fastest is
reused for every subsequent execution of the convolution.
* ``time_on_shape_change`` : like ``time_once`` but a new convolution
implementation selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
The Theano flags ``dnn.conv.algo_bwd_filter`` and
``dnn.conv.algo_bwd_data`` allow specifying the cuDNN
convolution implementations that Theano should use for gradient
convolutions. Possible values include:
* ``none`` (default) : use the default non-deterministic convolution
implementation
* ``deterministic`` : use a slower but deterministic implementation
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to cuDNN's heuristics and reused
for every subsequent execution of the convolution.
* ``guess_on_shape_change`` : like ``guess_once`` but a new convolution
implementation selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
* ``time_once`` : the first time a convolution is executed, every convolution
implementation offered by cuDNN is executed and timed. The fastest is
reused for every subsequent execution of the convolution.
* ``time_on_shape_change`` : like ``time_once`` but a new convolution
implementation selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
* (algo_bwd_data only) ``fft_tiling`` : use the Fast Fourier
Transform implementation of convolution with tiling (high memory
usage, but less than fft)
* (algo_bwd_data only) ``small`` : use a convolution implementation
with small memory usage
``guess_*`` and ``time_*`` flag values take into account the amount of
available memory when selecting an implementation. This means that slower
implementations might be selected if not enough memory is available for the
faster implementations.
.. note::
Normally you should not call GPU Ops directly, but the CPU interface
currently does not allow all options supported by cuDNN ops. So it is
possible that you will need to call them manually.
.. note::
The cuDNN documentation states that, for the two following operations, the
reproducibility is not guaranteed with the default implementation:
`cudnnConvolutionBackwardFilter` and `cudnnConvolutionBackwardData`.
Those correspond to the gradient wrt the weights and the gradient wrt the
input of the convolution. They are also used sometimes in the forward
pass, when they give a speed up.
The Theano flag ``dnn.conv.algo_bwd`` can be used to force the use of a
slower but deterministic convolution implementation.
.. note::
There is a problem we do not understand yet when cuDNN paths
contain symbolic links, so avoid using them.
.. note::
cudnn.so* must be readable and executable by everybody.
cudnn.h must be readable by everybody.
- Convolution:
- :func:`theano.sandbox.cuda.dnn.dnn_conv`, :func:`theano.sandbox.cuda.dnn.dnn_conv3d`.
- :func:`theano.sandbox.cuda.dnn.dnn_gradweight`.
- :func:`theano.sandbox.cuda.dnn.dnn_gradinput`.
- Pooling:
- :func:`theano.sandbox.cuda.dnn.dnn_pool`.
- Batch Normalization:
- :func:`theano.sandbox.cuda.dnn.dnn_batch_normalization_train`
- :func:`theano.sandbox.cuda.dnn.dnn_batch_normalization_test`.
- RNN:
- :class:`New back-end only! <theano.gpuarray.dnn.RNNBlock>`.
- Softmax:
- You can manually use the op :class:`GpuDnnSoftmax
<theano.sandbox.cuda.dnn.GpuDnnSoftmax>` to use its extra feature.
List of Implemented Operations
==============================
.. automodule:: theano.sandbox.cuda.dnn
:members:
.. _libdoc_sandbox_cuda:
===========================================
:mod:`sandbox.cuda` -- The CUDA GPU backend
===========================================
.. module:: sandbox.cuda
:platform: Unix, Windows
:synopsis: Code for GPU programming
.. moduleauthor:: LISA
.. toctree::
:maxdepth: 1
op
var
type
dnn
.. _libdoc_cuda_op:
======================================================
:mod:`sandbox.cuda` -- List of CUDA GPU Op implemented
======================================================
.. moduleauthor:: LISA
Normally you should not call these Ops directly! Theano should automatically transform CPU Ops to their GPU equivalents, so this list is just useful to let people know what is implemented on the GPU.
Basic Op
========
.. automodule:: theano.sandbox.cuda.basic_ops
:members:
Blas Op
=======
.. automodule:: theano.sandbox.cuda.blas
:members:
.. autoclass:: theano.sandbox.cuda.blas.GpuBatchedDot
Nnet Op
=======
.. automodule:: theano.sandbox.cuda.nnet
:members:
Curand Op
=========
Random generator based on the CURAND library. It is not inserted automatically.
.. automodule:: theano.sandbox.cuda.rng_curand
:members:
.. ../../../../theano/sandbox/cuda/type.py
.. ../../../../theano/sandbox/cuda/var.py
.. ../../../../theano/sandbox/cuda/
.. _libdoc_cuda_type:
======================================================================
:mod:`sandbox.cuda.type` -- The Type object for Cuda-allocated arrays
======================================================================
.. module:: sandbox.cuda.type
:platform: Unix, Windows
:synopsis: The Type object for CUDA-allocated arrays
.. moduleauthor:: LISA
API
===
.. ../../../../theano/sandbox/cuda/type.py
.. ../../../../theano/sandbox/cuda/var.py
.. ../../../../theano/sandbox/cuda/
.. _libdoc_cuda_var:
===================================================================
:mod:`sandbox.cuda.var` -- The Variables for Cuda-allocated arrays
===================================================================
.. module:: sandbox.cuda.var
:platform: Unix, Windows
:synopsis: The Variables object for CUDA-allocated arrays
.. moduleauthor:: LISA
API
===
.. autoclass:: theano.sandbox.cuda.var.CudaNdarraySharedVariable
:members: get_value, set_value
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
cuda/index
linalg linalg
neighbours neighbours
rng_mrg rng_mrg
...@@ -40,7 +40,7 @@ ...@@ -40,7 +40,7 @@
computations in the un-optimized graph, and cause problems with DebugMode, computations in the un-optimized graph, and cause problems with DebugMode,
test values, and when compiling with optimizer=None. test values, and when compiling with optimizer=None.
By default, if :ref:`cuDNN <libdoc_cuda_dnn>` By default, if :ref:`cuDNN <libdoc_gpuarray_dnn>`
is available, we will use it, otherwise we will fall back to using the is available, we will use it, otherwise we will fall back to using the
gemm version (slower than cuDNN in most cases and uses more memory). gemm version (slower than cuDNN in most cases and uses more memory).
...@@ -70,61 +70,27 @@ TODO: Give examples on how to use these things! They are pretty complicated. ...@@ -70,61 +70,27 @@ TODO: Give examples on how to use these things! They are pretty complicated.
- Implemented operators for neural network 2D / image convolution: - Implemented operators for neural network 2D / image convolution:
- :func:`nnet.conv.conv2d <theano.tensor.nnet.conv.conv2d>`. - :func:`nnet.conv.conv2d <theano.tensor.nnet.conv.conv2d>`.
CPU convolution implementation, previously used as the convolution interface. CPU convolution implementation, previously used as the
This is the standard operator for convolutional neural networks working convolution interface. This is the standard operator for
with batches of multi-channel 2D images, available. It convolutional neural networks working with batches of
computes a convolution, i.e., it flips the kernel. multi-channel 2D images, available. It computes a convolution,
i.e., it flips the kernel.
Most of the more efficient GPU implementations listed below can be Most of the more efficient GPU implementations listed below can be
inserted automatically as a replacement for nnet.conv.conv2d via graph inserted automatically as a replacement for nnet.conv.conv2d via graph
optimizations. Some of these graph optimizations are enabled by default, optimizations. Some of these graph optimizations are enabled by default,
others can be enabled via Theano flags. others can be enabled via Theano flags.
Since November 24th, 2014, you can also use a meta-optimizer to You can also use a meta-optimizer to automatically choose the
automatically choose the fastest implementation for each specific fastest implementation for each specific convolution in your
convolution in your graph using the old interface. For each instance, graph using the old interface. For each instance, it will
it will compile and benchmark each applicable implementation of the ones compile and benchmark each applicable implementation of the ones
listed below and choose the fastest one. listed below and choose the fastest one.
As performance is dependent on input and filter shapes, this As performance is dependent on input and filter shapes, this
only works for operations introduced via nnet.conv.conv2d with fully specified only works for operations introduced via nnet.conv.conv2d with
shape information. fully specified shape information. Enable it via the Theano
Enable it via the Theano flag ``optimizer_including=conv_meta``, and flag ``optimizer_including=conv_meta``, and optionally set it to
optionally set it to verbose mode via the flag `metaopt.verbose=1`. verbose mode via the flag `metaopt.verbose=1`.
- :func:`conv2d_fft <theano.sandbox.cuda.fftconv.conv2d_fft>` This
is a GPU-only version of nnet.conv2d that uses an FFT transform - :func:`GpuCorrMM <theano.gpuarray.blas.GpuCorrMM>`
to perform the work. It flips the kernel just like ``conv2d``.
conv2d_fft should not be used directly as
it does not provide a gradient. Instead, use nnet.conv2d and
allow Theano's graph optimizer to replace it by the FFT version
by setting 'THEANO_FLAGS=optimizer_including=conv_fft'
in your environment. If enabled, it will take precedence over cuDNN
and the gemm version. It is not enabled by default because it
has some restrictions on input and uses a lot more memory. Also
note that it requires CUDA >= 5.0, scikits.cuda >= 0.5.0 and
PyCUDA to run. To deactivate the FFT optimization on a specific
nnet.conv2d while the optimization flag is active, you can set
its ``version`` parameter to ``'no_fft'``. To enable it for just
one Theano function:
.. code-block:: python
mode = theano.compile.get_default_mode()
mode = mode.including('conv_fft')
f = theano.function(..., mode=mode)
- `cuda-convnet wrapper for 2d correlation <http://deeplearning.net/software/pylearn2/library/alex.html>`_
Wrapper for an open-source GPU-only implementation of conv2d by Alex
Krizhevsky, very fast, but with several restrictions on input and kernel
shapes, and with a different memory layout for the input. It does not
flip the kernel.
This is in Pylearn2, where it is normally called from the `linear transform
<http://deeplearning.net/software/pylearn2/library/linear.html>`_
implementation, but it can also be used `directly from within Theano
<http://benanne.github.io/2014/04/03/faster-convolutions-in-theano.html>`_
as a manual replacement for nnet.conv2d.
- :func:`GpuCorrMM <theano.sandbox.cuda.blas.GpuCorrMM>`
This is a GPU-only 2d correlation implementation taken from This is a GPU-only 2d correlation implementation taken from
`caffe's CUDA implementation <https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu>`_ `caffe's CUDA implementation <https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu>`_
and also used by Torch. It does not flip the kernel. and also used by Torch. It does not flip the kernel.
...@@ -149,7 +115,7 @@ TODO: Give examples on how to use these things! They are pretty complicated. ...@@ -149,7 +115,7 @@ TODO: Give examples on how to use these things! They are pretty complicated.
you can use it as a replacement for nnet.conv2d. For convolutions done on you can use it as a replacement for nnet.conv2d. For convolutions done on
CPU, nnet.conv2d will be replaced by CorrMM. To explicitly disable it, set CPU, nnet.conv2d will be replaced by CorrMM. To explicitly disable it, set
``THEANO_FLAGS=optimizer_excluding=conv_gemm`` in your environment. ``THEANO_FLAGS=optimizer_excluding=conv_gemm`` in your environment.
- :func:`dnn_conv <theano.sandbox.cuda.dnn.dnn_conv>` GPU-only - :func:`dnn_conv <theano.gpuarray.dnn.dnn_conv>` GPU-only
convolution using NVIDIA's cuDNN library. This requires that you have convolution using NVIDIA's cuDNN library. This requires that you have
cuDNN 4.0 or newer installed and available, which in turn requires CUDA 7.0 cuDNN 4.0 or newer installed and available, which in turn requires CUDA 7.0
and a GPU with compute capability 3.0 or more. and a GPU with compute capability 3.0 or more.
...@@ -162,25 +128,7 @@ TODO: Give examples on how to use these things! They are pretty complicated. ...@@ -162,25 +128,7 @@ TODO: Give examples on how to use these things! They are pretty complicated.
- :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>` - :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>`
3D Convolution applying multi-channel 3D filters to batches of 3D Convolution applying multi-channel 3D filters to batches of
multi-channel 3D images. It does not flip the kernel. multi-channel 3D images. It does not flip the kernel.
- :func:`conv3d_fft <theano.sandbox.cuda.fftconv.conv3d_fft>` - :func:`GpuCorr3dMM <theano.gpuarray.blas.GpuCorr3dMM>`
GPU-only version of conv3D using FFT transform. conv3d_fft should
not be called directly as it does not provide a gradient.
Instead, use conv3D and allow Theano's graph optimizer to replace it by
the FFT version by setting
``THEANO_FLAGS=optimizer_including=conv3d_fft:convgrad3d_fft:convtransp3d_fft``
in your environment. This is not enabled by default because it does not
support strides and uses more memory. Also note that it requires
CUDA >= 5.0, scikits.cuda >= 0.5.0 and PyCUDA to run.
To enable for just one Theano function:
.. code-block:: python
mode = theano.compile.get_default_mode()
mode = mode.including('conv3d_fft', 'convgrad3d_fft', 'convtransp3d_fft')
f = theano.function(..., mode=mode)
- :func:`GpuCorr3dMM <theano.sandbox.cuda.blas.GpuCorr3dMM>`
This is a GPU-only 3d correlation relying on a Toeplitz matrix This is a GPU-only 3d correlation relying on a Toeplitz matrix
and gemm implementation (see :func:`GpuCorrMM <theano.sandbox.cuda.blas.GpuCorrMM>`) and gemm implementation (see :func:`GpuCorrMM <theano.sandbox.cuda.blas.GpuCorrMM>`)
It needs extra memory for the Toeplitz matrix, which is a 2D matrix of shape It needs extra memory for the Toeplitz matrix, which is a 2D matrix of shape
...@@ -203,27 +151,24 @@ TODO: Give examples on how to use these things! They are pretty complicated. ...@@ -203,27 +151,24 @@ TODO: Give examples on how to use these things! They are pretty complicated.
nnet.conv3d will be replaced by Corr3dMM. To explicitly disable it, set nnet.conv3d will be replaced by Corr3dMM. To explicitly disable it, set
``THEANO_FLAGS=optimizer_excluding=conv_gemm`` in your environment. ``THEANO_FLAGS=optimizer_excluding=conv_gemm`` in your environment.
- :func:`dnn_conv3d <theano.sandbox.cuda.dnn.dnn_conv3d>` GPU-only - :func:`dnn_conv <theano.gpuarray.dnn.dnn_conv>` GPU-only
convolution using NVIDIA's cuDNN library. This requires that you have convolution using NVIDIA's cuDNN library. This requires that you have
cuDNN 4.0 or newer installed and available, which in turn requires CUDA 7.0 cuDNN 4.0 or newer installed and available, which in turn requires CUDA 7.0
and a GPU with compute capability 3.0 or more. and a GPU with compute capability 3.0 or more.
If cuDNN is available, by default, Theano will replace all nnet.conv3d If cuDNN is available, by default, Theano will replace all nnet.conv3d
operations with dnn_conv3d. To explicitly disable it, set operations with dnn_conv. To explicitly disable it, set
``THEANO_FLAGS=optimizer_excluding=conv_dnn`` in your environment. ``THEANO_FLAGS=optimizer_excluding=conv_dnn`` in your environment.
As dnn_conv3d has a gradient defined, you can also use it manually. As dnn_conv3d has a gradient defined, you can also use it manually.
- :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>` - :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>`
Another conv3d implementation that uses the conv2d with data reshaping. Another conv3d implementation that uses the conv2d with data reshaping.
It is faster in some cases than conv3d, and work on the GPU. It is faster in some cases than conv3d. It flips the kernel.
It flip the kernel.
.. autofunction:: theano.tensor.nnet.conv2d .. autofunction:: theano.tensor.nnet.conv2d
.. autofunction:: theano.tensor.nnet.conv2d_transpose .. autofunction:: theano.tensor.nnet.conv2d_transpose
.. autofunction:: theano.tensor.nnet.conv3d .. autofunction:: theano.tensor.nnet.conv3d
.. autofunction:: theano.sandbox.cuda.fftconv.conv2d_fft
.. autofunction:: theano.tensor.nnet.Conv3D.conv3D .. autofunction:: theano.tensor.nnet.Conv3D.conv3D
.. autofunction:: theano.sandbox.cuda.fftconv.conv3d_fft
.. autofunction:: theano.tensor.nnet.conv3d2d.conv3d .. autofunction:: theano.tensor.nnet.conv3d2d.conv3d
.. autofunction:: theano.tensor.nnet.conv.conv2d .. autofunction:: theano.tensor.nnet.conv.conv2d
......
...@@ -107,14 +107,3 @@ Install and configure the GPU drivers (recommended) ...@@ -107,14 +107,3 @@ Install and configure the GPU drivers (recommended)
* Add the 'lib' subdirectory (and/or 'lib64' subdirectory if you have a * Add the 'lib' subdirectory (and/or 'lib64' subdirectory if you have a
64-bit OS) to your ``$LD_LIBRARY_PATH`` environment 64-bit OS) to your ``$LD_LIBRARY_PATH`` environment
variable. variable.
3. Set Theano's config flags
To use the GPU you need to define the *cuda root*. You can do it in one
of the following ways:
* Define a $CUDA_ROOT environment variable to equal the cuda root directory, as in ``CUDA_ROOT=/path/to/cuda/root``, or
* add a ``cuda.root`` flag to :envvar:`THEANO_FLAGS`, as in ``THEANO_FLAGS='cuda.root=/path/to/cuda/root'``, or
* add a [cuda] section to your .theanorc file containing the option ``root = /path/to/cuda/root``.
...@@ -120,7 +120,7 @@ some test fails on your machine, you are encouraged to tell us what went ...@@ -120,7 +120,7 @@ some test fails on your machine, you are encouraged to tell us what went
wrong on the ``theano-users@googlegroups.com`` mailing list. wrong on the ``theano-users@googlegroups.com`` mailing list.
.. warning:: .. warning::
Theano's test should **NOT** be run with ``device=cuda`` or ``device=gpu`` Theano's test should **NOT** be run with ``device=cuda``
or they will fail. The tests automatically use the gpu, if any, when or they will fail. The tests automatically use the gpu, if any, when
needed. If you don't want Theano to ever use the gpu when running tests, needed. If you don't want Theano to ever use the gpu when running tests,
you can set :attr:`config.device` to ``cpu`` and you can set :attr:`config.device` to ``cpu`` and
...@@ -137,24 +137,22 @@ CPU and GPU memory usage. ...@@ -137,24 +137,22 @@ CPU and GPU memory usage.
Could speed up and lower memory usage: Could speed up and lower memory usage:
- :ref:`cuDNN <libdoc_cuda_dnn>` default cuDNN convolution use less - :ref:`cuDNN <libdoc_gpuarray_dnn>` default cuDNN convolution use less
memory then Theano version. But some flags allow it to use more memory then Theano version. But some flags allow it to use more
memory. GPU only. memory. GPU only.
- Shortly avail, multi-GPU.
Could raise memory usage but speed up computation: Could raise memory usage but speed up computation:
- :attr:`config.gpuarray.preallocate` =1 # Preallocates the GPU memory for the new backend(:ref:`gpuarray`) - :attr:`config.gpuarray.preallocate` = 1 # Preallocates the GPU memory
and then manages it in a smart way. Does not raise much the memory usage, but if and then manages it in a smart way. Does not raise much the memory
you are at the limit of GPU memory available you might need to specify a usage, but if you are at the limit of GPU memory available you might
lower value. GPU only. need to specify a lower value. GPU only.
- :attr:`config.lib.cnmem` =1 # Equivalent on the old backend (:ref:`cuda`). GPU only.
- :attr:`config.allow_gc` =False - :attr:`config.allow_gc` =False
- :attr:`config.optimizer_excluding` =low_memory , GPU only for now. - :attr:`config.optimizer_excluding` =low_memory , GPU only for now.
Could lower the memory usage, but raise computation time: Could lower the memory usage, but raise computation time:
- :attr:`config.scan.allow_gc` =True # Probably not significant slowdown if config.lib.cnmem is used. - :attr:`config.scan.allow_gc` = True # Probably not significant slowdown on the GPU if memory cache is not disabled
- :attr:`config.scan.allow_output_prealloc` =False - :attr:`config.scan.allow_output_prealloc` =False
- Use :func:`batch_normalization() - Use :func:`batch_normalization()
<theano.tensor.nnet.bn.batch_normalization>`. It use less memory <theano.tensor.nnet.bn.batch_normalization>`. It use less memory
......
...@@ -211,15 +211,16 @@ be costly. Here are a few tips to ensure fast and efficient use of GPU memory a ...@@ -211,15 +211,16 @@ be costly. Here are a few tips to ensure fast and efficient use of GPU memory a
*Solution*: upgrade to a recent version of Theano (>0.3.0) and consider padding your source *Solution*: upgrade to a recent version of Theano (>0.3.0) and consider padding your source
data to make sure that every chunk is the same size. data to make sure that every chunk is the same size.
* It is also worth mentioning that, current GPU copying routines support only contiguous memory. * It is also worth mentioning that, current GPU copying routines
So Theano must make the value you provide *C-contiguous* prior to copying it. support only contiguous memory. So Theano must make the value you
This can require an extra copy of the data on the host. provide *C-contiguous* prior to copying it. This can require an
extra copy of the data on the host.
*Solution*: make sure that the value *Solution*: make sure that the value
you assign to a CudaNdarraySharedVariable is *already* *C-contiguous*. you assign to a GpuArraySharedVariable is *already* *C-contiguous*.
(Further information on the current implementation of the GPU version of ``set_value()`` can be found (Further information on the current implementation of the GPU version
here: :ref:`libdoc_cuda_var`) of ``set_value()`` can be found here: :ref:`libdoc_gpuarray_type`)
.. _borrowfunction: .. _borrowfunction:
......
...@@ -508,10 +508,9 @@ There are :ref:`other distributions implemented <libdoc_tensor_raw_random>`. ...@@ -508,10 +508,9 @@ There are :ref:`other distributions implemented <libdoc_tensor_raw_random>`.
Other Implementations Other Implementations
--------------------- ---------------------
There are 2 other implementations based on :ref:`MRG31k3p There is another implementations based on :ref:`MRG31k3p
<libdoc_rng_mrg>` and :class:`CURAND <theano.sandbox.cuda.rng_curand>`. <libdoc_rng_mrg>`.
The RandomStream only work on the CPU, MRG31k3p The RandomStream only work on the CPU, MRG31k3p work on the CPU and GPU.
work on the CPU and GPU. CURAND only work on the GPU.
.. note:: .. note::
......
.. _gpu_data_convert:
===================================
PyCUDA/CUDAMat/Gnumpy compatibility
===================================
PyCUDA
======
Currently, PyCUDA and Theano have different objects to store GPU
data. The two implementations do not support the same set of features.
Theano's implementation is called *CudaNdarray* and supports
*strides*. It also only supports the *float32* dtype. PyCUDA's implementation
is called *GPUArray* and doesn't support *strides*. However, it can deal with
all NumPy and CUDA dtypes.
We are currently working on having the same base object for both that will
also mimic Numpy. Until this is ready, here is some information on how to
use both objects in the same script.
Transfer
--------
You can use the ``theano.misc.pycuda_utils`` module to convert GPUArray to and
from CudaNdarray. The functions ``to_cudandarray(x, copyif=False)`` and
``to_gpuarray(x)`` return a new object that occupies the same memory space
as the original when possible. Because GPUArrays don't support strides, a
strided CudaNdarray cannot be wrapped directly: a *ValueError* is raised
in that case, unless ``copyif=True`` is passed to ``to_gpuarray``, in
which case a non-strided copy is made. The resulting GPUArray won't share
the same memory region.
Compiling with PyCUDA
---------------------
You can use PyCUDA to compile CUDA functions that work directly on
CudaNdarrays. Here is an example from the file ``theano/misc/tests/test_pycuda_theano_simple.py``:
.. code-block:: python
import sys
import numpy
import theano
import theano.sandbox.cuda as cuda_ndarray
import theano.misc.pycuda_init
import pycuda
import pycuda.driver as drv
import pycuda.gpuarray
def test_pycuda_theano():
"""Simple example with pycuda function and Theano CudaNdarray object."""
from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")
a = numpy.random.randn(100).astype(numpy.float32)
b = numpy.random.randn(100).astype(numpy.float32)
# Test with Theano object
ga = cuda_ndarray.CudaNdarray(a)
gb = cuda_ndarray.CudaNdarray(b)
dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
multiply_them(dest, ga, gb,
block=(400, 1, 1), grid=(1, 1))
assert (numpy.asarray(dest) == a * b).all()
Theano Op using a PyCUDA function
---------------------------------
You can use a GPU function compiled with PyCUDA in a Theano op:
.. code-block:: python
import numpy, theano
import theano.misc.pycuda_init
from pycuda.compiler import SourceModule
import theano.sandbox.cuda as cuda
class PyCUDADoubleOp(theano.Op):
__props__ = ()
def make_node(self, inp):
inp = cuda.basic_ops.gpu_contiguous(
cuda.basic_ops.as_cuda_ndarray_variable(inp))
assert inp.dtype == "float32"
return theano.Apply(self, [inp], [inp.type()])
def make_thunk(self, node, storage_map, _, _2, impl=None):
mod = SourceModule("""
__global__ void my_fct(float * i0, float * o0, int size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i<size){
o0[i] = i0[i] * 2;
}
}""")
pycuda_fct = mod.get_function("my_fct")
inputs = [ storage_map[v] for v in node.inputs]
outputs = [ storage_map[v] for v in node.outputs]
def thunk():
z = outputs[0]
if z[0] is None or z[0].shape!=inputs[0][0].shape:
z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
grid = (int(numpy.ceil(inputs[0][0].size / 512.)),1)
pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
block=(512, 1, 1), grid=grid)
thunk.lazy = False
return thunk
CUDAMat
=======
There are functions for conversion between CUDAMat objects and Theano's CudaNdArray objects.
They obey the same principles as Theano's PyCUDA functions and can be found in
``theano.misc.cudamat_utils.py``.
.. TODO: this statement is unclear:
WARNING: There is a peculiar problem associated with stride/shape with those converters.
In order to work, the test needs a *transpose* and *reshape*...
Gnumpy
======
There are conversion functions between Gnumpy *garray* objects and Theano CudaNdArray objects.
They are also similar to Theano's PyCUDA functions and can be found in ``theano.misc.gnumpy_utils.py``.
...@@ -70,7 +70,6 @@ Further readings ...@@ -70,7 +70,6 @@ Further readings
../extending/graphstructures ../extending/graphstructures
loading_and_saving loading_and_saving
gpu_data_convert
aliasing aliasing
python-memory-management python-memory-management
multi_cores multi_cores
......
...@@ -82,21 +82,6 @@ Here is an example output when we disable some Theano optimizations to ...@@ -82,21 +82,6 @@ Here is an example output when we disable some Theano optimizations to
give you a better idea of the difference between sections. With all give you a better idea of the difference between sections. With all
optimizations enabled, there would be only one op left in the graph. optimizations enabled, there would be only one op left in the graph.
.. note::
To profile the peak memory usage on the GPU you need to do::
* In the file theano/sandbox/cuda/cuda_ndarray.cu, set the macro
COMPUTE_GPU_MEM_USED to 1.
* Then call theano.sandbox.cuda.theano_allocated()
It return a tuple with two ints. The first is the current GPU
memory allocated by Theano. The second is the peak GPU memory
that was allocated by Theano.
Do not always enable this, as this slows down memory allocation and
free. As this slows down the computation, this will affect speed
profiling. So don't use both at the same time.
to run the example: to run the example:
THEANO_FLAGS=optimizer_excluding=fusion:inplace,profile=True python doc/tutorial/profiling_example.py THEANO_FLAGS=optimizer_excluding=fusion:inplace,profile=True python doc/tutorial/profiling_example.py
......
...@@ -14,16 +14,14 @@ about how to carry out those computations. One of the ways we take ...@@ -14,16 +14,14 @@ about how to carry out those computations. One of the ways we take
advantage of this flexibility is in carrying out calculations on a advantage of this flexibility is in carrying out calculations on a
graphics card. graphics card.
There are two ways currently to use a gpu, one that should support any OpenCL Using the GPU in Theano is as simple as setting the ``device``
device as well as NVIDIA cards (:ref:`gpuarray`), and the old backend that configuration flag to ``device=cuda``. You can optionally target a
only supports NVIDIA cards (:ref:`cuda`). specific gpu by specifying the number of the gpu as in
e.g. ``device=cuda2``. It is also encouraged to set the floating
Using the GPU in Theano is as simple as setting the ``device`` configuration point precision to float32 when working on the GPU as that is usually
flag to ``device=cuda`` (or ``device=gpu`` for the old backend). You can optionally target a specific gpu by specifying much faster. For example:
the number of the gpu as in e.g. ``device=cuda2``. You also need to set the ``THEANO_FLAGS='device=cuda,floatX=float32'``. You can also set these
default floating point precision. options in the .theanorc file's ``[global]`` section:
For example: ``THEANO_FLAGS='cuda.root=/path/to/cuda/root,device=cuda,floatX=float32'``.
You can also set these options in the .theanorc file's ``[global]`` section:
.. code-block:: cfg .. code-block:: cfg
...@@ -31,15 +29,10 @@ You can also set these options in the .theanorc file's ``[global]`` section: ...@@ -31,15 +29,10 @@ You can also set these options in the .theanorc file's ``[global]`` section:
device = cuda device = cuda
floatX = float32 floatX = float32
.. warning::
The old CUDA backend will be deprecated soon, in favor of the new libgpuarray
backend.
.. note:: .. note::
* If your computer has multiple GPUs and you use ``device=cuda``, the driver * If your computer has multiple GPUs and you use ``device=cuda``,
selects the one to use (usually gpu0). the driver selects the one to use (usually cuda0).
* You can use the program ``nvidia-smi`` to change this policy. * You can use the program ``nvidia-smi`` to change this policy.
* By default, when ``device`` indicates preference for GPU computations, * By default, when ``device`` indicates preference for GPU computations,
Theano will fall back to the CPU if there is a problem with the GPU. Theano will fall back to the CPU if there is a problem with the GPU.
...@@ -65,14 +58,8 @@ remainder of this section, whatever compute device you are using will ...@@ -65,14 +58,8 @@ remainder of this section, whatever compute device you are using will
be referred to as GPU. be referred to as GPU.
.. note:: .. note::
GpuArray backend uses ``config.gpuarray.preallocate`` for GPU memory allocation. GpuArray backend uses ``config.gpuarray.preallocate`` for GPU memory
For the old backend, please see ``config.lib.cnmem`` allocation.
.. warning::
If you want to use the new GpuArray backend, make sure to have the
development version of Theano installed. The 0.8.X releases have not
been optimized to work correctly with the new backend.
.. warning:: .. warning::
...@@ -140,9 +127,10 @@ input *x* is stored on the GPU. ...@@ -140,9 +127,10 @@ input *x* is stored on the GPU.
Used the cpu Used the cpu
$ THEANO_FLAGS=device=cuda0 python gpu_tutorial1.py $ THEANO_FLAGS=device=cuda0 python gpu_tutorial1.py
Mapped name None to device cuda0: GeForce GTX 680 (cuDNN version 5004) Using cuDNN version 5105 on context None
Mapped name None to device cuda0: GeForce GTX 750 Ti (0000:07:00.0)
[GpuElemwise{exp,no_inplace}(<GpuArrayType<None>(float64, (False,))>), HostFromGpu(gpuarray)(GpuElemwise{exp,no_inplace}.0)] [GpuElemwise{exp,no_inplace}(<GpuArrayType<None>(float64, (False,))>), HostFromGpu(gpuarray)(GpuElemwise{exp,no_inplace}.0)]
Looping 1000 times took 1.202734 seconds Looping 1000 times took 1.697514 seconds
Result is [ 1.23178032 1.61879341 1.52278065 ..., 2.20771815 2.29967753 Result is [ 1.23178032 1.61879341 1.52278065 ..., 2.20771815 2.29967753
1.62323285] 1.62323285]
Used the gpu Used the gpu
...@@ -197,9 +185,10 @@ The output is ...@@ -197,9 +185,10 @@ The output is
:options: +ELLIPSIS, +SKIP :options: +ELLIPSIS, +SKIP
$ THEANO_FLAGS=device=cuda0 python gpu_tutorial2.py $ THEANO_FLAGS=device=cuda0 python gpu_tutorial2.py
Mapped name None to device cuda0: GeForce GTX 680 (cuDNN version 5004) Using cuDNN version 5105 on context None
Mapped name None to device cuda0: GeForce GTX 750 Ti (0000:07:00.0)
[GpuElemwise{exp,no_inplace}(<GpuArrayType<None>(float64, (False,))>)] [GpuElemwise{exp,no_inplace}(<GpuArrayType<None>(float64, (False,))>)]
Looping 1000 times took 0.088381 seconds Looping 1000 times took 0.040277 seconds
Result is [ 1.23178032 1.61879341 1.52278065 ..., 2.20771815 2.29967753 Result is [ 1.23178032 1.61879341 1.52278065 ..., 2.20771815 2.29967753
1.62323285] 1.62323285]
Used the gpu Used the gpu
...@@ -208,9 +197,10 @@ The output is ...@@ -208,9 +197,10 @@ The output is
.. code-block:: none .. code-block:: none
$ THEANO_FLAGS=device=cuda0 python gpu_tutorial2.py $ THEANO_FLAGS=device=cuda0 python gpu_tutorial2.py
Mapped name None to device cuda0: GeForce GTX 680 (cuDNN version 5004) Using cuDNN version 5105 on context None
Mapped name None to device cuda0: GeForce GTX 750 Ti (0000:07:00.0)
[GpuElemwise{exp,no_inplace}(<GpuArrayType<None>(float64, (False,))>)] [GpuElemwise{exp,no_inplace}(<GpuArrayType<None>(float64, (False,))>)]
Looping 1000 times took 0.089194 seconds Looping 1000 times took 0.040277 seconds
Result is [ 1.23178032 1.61879341 1.52278065 ..., 2.20771815 2.29967753 Result is [ 1.23178032 1.61879341 1.52278065 ..., 2.20771815 2.29967753
1.62323285] 1.62323285]
Used the gpu Used the gpu
...@@ -238,8 +228,8 @@ device, and also as we refine our implementation: ...@@ -238,8 +228,8 @@ device, and also as we refine our implementation:
* In general, matrix multiplication, convolution, and large element-wise * In general, matrix multiplication, convolution, and large element-wise
operations can be accelerated a lot (5-50x) when arguments are large enough operations can be accelerated a lot (5-50x) when arguments are large enough
to keep 30 processors busy. to keep 30 processors busy.
* Indexing, dimension-shuffling and constant-time reshaping will be equally fast * Indexing, dimension-shuffling and constant-time reshaping will be
on GPU as on CPU. equally fast on GPU as on CPU.
* Summation over rows/columns of tensors can be a little slower on the * Summation over rows/columns of tensors can be a little slower on the
GPU than on the CPU. GPU than on the CPU.
* Copying of large quantities of data to and from a device is relatively slow, * Copying of large quantities of data to and from a device is relatively slow,
...@@ -273,23 +263,22 @@ Tips for Improving Performance on GPU ...@@ -273,23 +263,22 @@ Tips for Improving Performance on GPU
* Minimize transfers to the GPU device by using ``shared`` variables * Minimize transfers to the GPU device by using ``shared`` variables
to store frequently-accessed data (see :func:`shared()<shared.shared>`). to store frequently-accessed data (see :func:`shared()<shared.shared>`).
When using the GPU, tensor ``shared`` variables are stored on When using the GPU, tensor ``shared`` variables are stored on
the GPU by default to eliminate transfer time for GPU ops using those variables. the GPU by default to eliminate transfer time for GPU ops using those
* If you aren't happy with the performance you see, try running your script with variables.
``profile=True`` flag. This should print some timing information at program * If you aren't happy with the performance you see, try running your
termination. Is time being used sensibly? If an op or Apply is script with ``profile=True`` flag. This should print some timing
taking more time than its share, then if you know something about GPU information at program termination. Is time being used sensibly? If
programming, have a look at how it's implemented in theano.gpuarray. an op or Apply is taking more time than its share, then if you know
Check the line similar to *Spent Xs(X%) in cpu op, Xs(X%) in gpu op and something about GPU programming, have a look at how it's implemented
Xs(X%) in transfer op*. This can tell you if not enough of your graph is in theano.gpuarray. Check the line similar to *Spent Xs(X%) in cpu
on the GPU or if there is too much memory transfer. op, Xs(X%) in gpu op and Xs(X%) in transfer op*. This can tell you
if not enough of your graph is on the GPU or if there is too much
memory transfer.
* To investigate whether all the Ops in the computational graph are * To investigate whether all the Ops in the computational graph are
running on GPU, it is possible to debug or check your code by providing running on GPU, it is possible to debug or check your code by providing
a value to `assert_no_cpu_op` flag, i.e. `warn`, for warning, `raise` for a value to `assert_no_cpu_op` flag, i.e. `warn`, for warning, `raise` for
raising an error or `pdb` for putting a breakpoint in the computational raising an error or `pdb` for putting a breakpoint in the computational
graph if there is a CPU Op. graph if there is a CPU Op.
* Please note that ``config.lib.cnmem`` and ``config.gpuarray.preallocate``
controls GPU memory allocation when using (:ref:`cuda`) and
(:ref:`gpuarray`) as theano backends respectively.
.. _gpu_async: .. _gpu_async:
...@@ -311,9 +300,9 @@ when doing benchmarks. ...@@ -311,9 +300,9 @@ when doing benchmarks.
Changing the Value of Shared Variables Changing the Value of Shared Variables
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To change the value of a ``shared`` variable, e.g. to provide new data to processes, To change the value of a ``shared`` variable, e.g. to provide new data
use ``shared_variable.set_value(new_value)``. For a lot more detail about this, to processes, use ``shared_variable.set_value(new_value)``. For a lot
see :ref:`aliasing`. more detail about this, see :ref:`aliasing`.
Exercise Exercise
~~~~~~~~ ~~~~~~~~
...@@ -389,50 +378,22 @@ Consider again the logistic regression: ...@@ -389,50 +378,22 @@ Consider again the logistic regression:
prediction on D prediction on D
... ...
Modify and execute this example to run on GPU with ``floatX=float32`` and Modify and execute this example to run on GPU with ``floatX=float32``
time it using the command line ``time python file.py``. (Of course, you may use some of your answer and time it using the command line ``time python file.py``. (Of
to the exercise in section :ref:`Configuration Settings and Compiling Mode<using_modes>`.) course, you may use some of your answer to the exercise in section
:ref:`Configuration Settings and Compiling Mode<using_modes>`.)
Is there an increase in speed from CPU to GPU? Is there an increase in speed from CPU to GPU?
Where does it come from? (Use ``profile=True`` flag.) Where does it come from? (Use ``profile=True`` flag.)
What can be done to further increase the speed of the GPU version? Put your ideas to test. What can be done to further increase the speed of the GPU version? Put
your ideas to test.
:download:`Solution<using_gpu_solution_1.py>` :download:`Solution<using_gpu_solution_1.py>`
------------------------------------------- -------------------------------------------
.. _cuda:
CUDA backend
------------
If you have not done so already, you will need to install Nvidia's
GPU-programming toolchain (CUDA) and configure Theano to use it.
We provide installation instructions for :ref:`Linux <gpu_linux>`,
:ref:`MacOS <gpu_macos>` and :ref:`Windows <gpu_windows>`.
The old CUDA backend can be activated using the flags ``device=gpu`` or
``device=gpu{0,1,...}``
.. note::
* CUDA backend uses ``config.lib.cnmem`` for GPU memory allocation. For the new backend(:ref:`gpuarray`), please see ``config.gpuarray.preallocate``
* Only 32 bit floats are supported.
* ``Shared`` variables with *float32* dtype are by default moved to the GPU memory space.
* There is a limit of one GPU per process.
* Apply the Theano flag ``floatX=float32`` (through ``theano.config.floatX``) in your code.
* ``Cast`` inputs before storing them into a ``shared`` variable.
* Circumvent the automatic cast of *int32* with *float32* to *float64*:
* Insert manual cast in your code or use *[u]int{8,16}*.
* Insert manual cast around the mean operator (this involves division by length, which is an *int64*).
* Notice that a new casting mechanism is being developed.
-------------------------------------------
Software for Directly Programming a GPU Software for Directly Programming a GPU
--------------------------------------- ---------------------------------------
......
...@@ -64,8 +64,8 @@ defined. This will look like this: ...@@ -64,8 +64,8 @@ defined. This will look like this:
.. code-block:: bash .. code-block:: bash
$ THEANO_FLAGS="contexts=dev0->cuda0;dev1->cuda1" python -c 'import theano' $ THEANO_FLAGS="contexts=dev0->cuda0;dev1->cuda1" python -c 'import theano'
Mapped name dev0 to device cuda0: GeForce GTX TITAN X Mapped name dev0 to device cuda0: GeForce GTX TITAN X (0000:09:00.0)
Mapped name dev1 to device cuda1: GeForce GTX TITAN X Mapped name dev1 to device cuda1: GeForce GTX TITAN X (0000:06:00.0)
If you don't have enough GPUs for a certain model, you can assign the If you don't have enough GPUs for a certain model, you can assign the
......
...@@ -35,6 +35,7 @@ Programming Language :: Python :: 2.7 ...@@ -35,6 +35,7 @@ Programming Language :: Python :: 2.7
Programming Language :: Python :: 3 Programming Language :: Python :: 3
Programming Language :: Python :: 3.3 Programming Language :: Python :: 3.3
Programming Language :: Python :: 3.4 Programming Language :: Python :: 3.4
Programming Language :: Python :: 3.5
""" """
NAME = 'Theano' NAME = 'Theano'
MAINTAINER = "LISA laboratory, University of Montreal" MAINTAINER = "LISA laboratory, University of Montreal"
......
...@@ -126,17 +126,6 @@ else: ...@@ -126,17 +126,6 @@ else:
raise ImportError("The nose module is not installed." raise ImportError("The nose module is not installed."
" It is needed for Theano tests.") " It is needed for Theano tests.")
if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
import theano.sandbox.cuda
# We can't test the driver during import of theano.sandbox.cuda as
# this cause circular import dependency. So we also test it manually
# after the import
if theano.sandbox.cuda.cuda_available:
import theano.sandbox.cuda.tests.test_driver
if config.enable_initial_driver_test:
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if (config.device.startswith('cuda') or if (config.device.startswith('cuda') or
config.device.startswith('opencl') or config.device.startswith('opencl') or
config.init_gpu_device.startswith('cuda') or config.init_gpu_device.startswith('cuda') or
......
...@@ -1198,10 +1198,11 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1198,10 +1198,11 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
# To avoid circular imports # To avoid circular imports
from theano.tensor import TensorType from theano.tensor import TensorType
from theano.sandbox.cuda import cuda_available, CudaNdarrayType from theano.gpuarray import GpuArrayType
if cuda_available: try:
from theano.sandbox.cuda import CudaNdarray import pygpu
from theano.sandbox.cuda import dimshuffle as cuda_dimshuffle except ImportError:
pass
# TODO: Sparse? Scalar does not really make sense. # TODO: Sparse? Scalar does not really make sense.
...@@ -1240,7 +1241,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1240,7 +1241,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
for r in considered_outputs: for r in considered_outputs:
# There is no risk to overwrite inputs, since r does not work # There is no risk to overwrite inputs, since r does not work
# inplace. # inplace.
if isinstance(r.type, (TensorType, CudaNdarrayType)): if isinstance(r.type, (TensorType, GpuArrayType)):
reuse_outputs[r][...] = np.asarray( reuse_outputs[r][...] = np.asarray(
def_val).astype(r.type.dtype) def_val).astype(r.type.dtype)
...@@ -1250,15 +1251,14 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1250,15 +1251,14 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
del reuse_outputs del reuse_outputs
# c_cont_output: use a c-continuous array # c_cont_output: use a c-continuous array
# (for TensorType and CudaNdarray, else None) # (for TensorType, else None)
if 'c_contiguous' in prealloc_modes or 'ALL' in prealloc_modes: if 'c_contiguous' in prealloc_modes or 'ALL' in prealloc_modes:
c_cont_outputs = {} c_cont_outputs = {}
for r in considered_outputs: for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)): if isinstance(r.type, (TensorType, GpuArrayType)):
# Build a C-contiguous buffer # Build a C-contiguous buffer
new_buf = r.type.value_zeros(r_vals[r].shape) new_buf = r.type.value_zeros(r_vals[r].shape)
# CudaNdarray don't have flags field assert new_buf.flags["C_CONTIGUOUS"]
# assert new_buf.flags["C_CONTIGUOUS"]
new_buf[...] = np.asarray(def_val).astype(r.type.dtype) new_buf[...] = np.asarray(def_val).astype(r.type.dtype)
c_cont_outputs[r] = new_buf c_cont_outputs[r] = new_buf
...@@ -1272,18 +1272,14 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1272,18 +1272,14 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
if 'f_contiguous' in prealloc_modes or 'ALL' in prealloc_modes: if 'f_contiguous' in prealloc_modes or 'ALL' in prealloc_modes:
f_cont_outputs = {} f_cont_outputs = {}
for r in considered_outputs: for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)): if isinstance(r.type, (TensorType, GpuArrayType)):
new_buf = np.zeros( new_buf = np.zeros(
shape=r_vals[r].shape, shape=r_vals[r].shape,
dtype=r_vals[r].dtype, dtype=r_vals[r].dtype,
order='F') order='F')
new_buf[...] = def_val new_buf[...] = def_val
if isinstance(r.type, CudaNdarrayType): if isinstance(r.type, GpuArrayType):
# When the CudaNdarray is built, the underlying memory new_buf = pygpu.array(new_buf)
# is c-contiguous, so we transpose it before and after.
new_buf = CudaNdarray(new_buf.T)
new_buf = cuda_dimshuffle(
new_buf, reversed(list(range(new_buf.ndim))))
f_cont_outputs[r] = new_buf f_cont_outputs[r] = new_buf
...@@ -1305,7 +1301,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1305,7 +1301,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
max_ndim = 0 max_ndim = 0
rev_out_broadcastable = [] rev_out_broadcastable = []
for r in considered_outputs: for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)): if isinstance(r.type, (TensorType, GpuArrayType)):
if max_ndim < r.ndim: if max_ndim < r.ndim:
rev_out_broadcastable += [True] * (r.ndim - max_ndim) rev_out_broadcastable += [True] * (r.ndim - max_ndim)
max_ndim = r.ndim max_ndim = r.ndim
...@@ -1320,7 +1316,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1320,7 +1316,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
# Initial allocation # Initial allocation
init_strided = {} init_strided = {}
for r in considered_outputs: for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)): if isinstance(r.type, (TensorType, GpuArrayType)):
# Create a buffer twice as large in every dimension, # Create a buffer twice as large in every dimension,
# except if broadcastable, or for dimensions above # except if broadcastable, or for dimensions above
# config.DebugMode.check_preallocated_output_ndim # config.DebugMode.check_preallocated_output_ndim
...@@ -1399,7 +1395,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1399,7 +1395,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
name = 'wrong_size%s' % str(tuple(shape_diff)) name = 'wrong_size%s' % str(tuple(shape_diff))
for r in considered_outputs: for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)): if isinstance(r.type, (TensorType, GpuArrayType)):
r_shape_diff = shape_diff[:r.ndim] r_shape_diff = shape_diff[:r.ndim]
out_shape = [max((s + sd), 0) out_shape = [max((s + sd), 0)
for s, sd in zip(r_vals[r].shape, for s, sd in zip(r_vals[r].shape,
...@@ -1741,7 +1737,6 @@ class _VariableEquivalenceTracker(object): ...@@ -1741,7 +1737,6 @@ class _VariableEquivalenceTracker(object):
# List of default version of make thunk. # List of default version of make thunk.
# This is needed to know if the user overrided it. # This is needed to know if the user overrided it.
# The GpuOp will be added here when theano.sandbox.cuda is imported.
default_make_thunk = [get_unbound_function(theano.gof.Op.make_thunk)] default_make_thunk = [get_unbound_function(theano.gof.Op.make_thunk)]
......
...@@ -8,7 +8,6 @@ import numpy as np ...@@ -8,7 +8,6 @@ import numpy as np
import theano import theano
from theano.configparser import config from theano.configparser import config
import theano.tensor as T import theano.tensor as T
import theano.sandbox.cuda as cuda
from theano.compile import Mode from theano.compile import Mode
from .mode import get_mode from .mode import get_mode
...@@ -107,16 +106,6 @@ def contains_nan(arr, node=None, var=None): ...@@ -107,16 +106,6 @@ def contains_nan(arr, node=None, var=None):
""" """
if not _is_numeric_value(arr, var): if not _is_numeric_value(arr, var):
return False return False
elif cuda.cuda_available and isinstance(arr, cuda.CudaNdarray):
if (node and hasattr(theano.sandbox, 'rng_mrg') and
isinstance(
node.op,
# It store ints in float container
theano.sandbox.rng_mrg.GPU_mrg_uniform)):
return False
else:
compile_gpu_func(True, False, False)
return np.isnan(f_gpumin(arr.reshape(arr.size)))
elif pygpu_available and isinstance(arr, GpuArray): elif pygpu_available and isinstance(arr, GpuArray):
return np.isnan(f_gpua_min(arr.reshape(arr.size))) return np.isnan(f_gpua_min(arr.reshape(arr.size)))
...@@ -150,70 +139,12 @@ def contains_inf(arr, node=None, var=None): ...@@ -150,70 +139,12 @@ def contains_inf(arr, node=None, var=None):
""" """
if not _is_numeric_value(arr, var): if not _is_numeric_value(arr, var):
return False return False
elif cuda.cuda_available and isinstance(arr, cuda.CudaNdarray):
if (node and hasattr(theano.sandbox, 'rng_mrg') and
isinstance(
node.op,
# It store ints in float container
theano.sandbox.rng_mrg.GPU_mrg_uniform)):
return False
else:
compile_gpu_func(False, True, False)
return (np.isinf(f_gpumin(arr.reshape(arr.size))) or
np.isinf(f_gpumax(arr.reshape(arr.size))))
elif pygpu_available and isinstance(arr, GpuArray): elif pygpu_available and isinstance(arr, GpuArray):
return (np.isinf(f_gpua_min(arr.reshape(arr.size))) or return (np.isinf(f_gpua_min(arr.reshape(arr.size))) or
np.isinf(f_gpua_max(arr.reshape(arr.size)))) np.isinf(f_gpua_max(arr.reshape(arr.size))))
return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr)) return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
f_gpumin = None
f_gpumax = None
f_gpuabsmax = None
def compile_gpu_func(nan_is_error, inf_is_error, big_is_error):
""" compile utility function used by contains_nan and contains_inf
"""
global f_gpumin, f_gpumax, f_gpuabsmax
if not cuda.cuda_available:
return
guard_input = cuda.fvector('nan_guard')
cuda_compile_failed = False
if (nan_is_error or inf_is_error) and f_gpumin is None:
try:
f_gpumin = theano.function(
[guard_input], T.min(guard_input),
mode='FAST_RUN'
)
except RuntimeError:
# This can happen if cuda is available, but the
# device is in exclusive mode and used by another
# process.
cuda_compile_failed = True
if inf_is_error and not cuda_compile_failed and f_gpumax is None:
try:
f_gpumax = theano.function(
[guard_input], T.max(guard_input),
mode='FAST_RUN'
)
except RuntimeError:
# This can happen if cuda is available, but the
# device is in exclusive mode and used by another
# process.
cuda_compile_failed = True
if big_is_error and not cuda_compile_failed and f_gpuabsmax is None:
try:
f_gpuabsmax = theano.function(
[guard_input], T.max(T.abs_(guard_input)),
mode='FAST_RUN'
)
except RuntimeError:
# This can happen if cuda is available, but the
# device is in exclusive mode and used by another
# process.
cuda_compile_failed = True
def f_compute(op): def f_compute(op):
def result(inp): def result(inp):
...@@ -270,9 +201,6 @@ class NanGuardMode(Mode): ...@@ -270,9 +201,6 @@ class NanGuardMode(Mode):
assert nan_is_error or inf_is_error or big_is_error assert nan_is_error or inf_is_error or big_is_error
if cuda.cuda_enabled:
compile_gpu_func(nan_is_error, inf_is_error, big_is_error)
def do_check_on(value, nd, var=None): def do_check_on(value, nd, var=None):
""" """
Checks `value` for NaNs / Infs. If detected, raises an exception Checks `value` for NaNs / Infs. If detected, raises an exception
...@@ -304,9 +232,6 @@ class NanGuardMode(Mode): ...@@ -304,9 +232,6 @@ class NanGuardMode(Mode):
err = False err = False
if not _is_numeric_value(value, var): if not _is_numeric_value(value, var):
err = False err = False
elif cuda.cuda_available and isinstance(value, cuda.CudaNdarray):
compile_gpu_func(False, False, True)
err = (f_gpuabsmax(value.reshape(value.size)) > 1e10)
elif pygpu_available and isinstance(value, GpuArray): elif pygpu_available and isinstance(value, GpuArray):
err = (f_gpua_absmax(value.reshape(value.size)) > 1e10) err = (f_gpua_absmax(value.reshape(value.size)) > 1e10)
else: else:
......
...@@ -810,7 +810,7 @@ class SpecifyShape(gof.Op): ...@@ -810,7 +810,7 @@ class SpecifyShape(gof.Op):
We currently don't support specifying partial shape information. We currently don't support specifying partial shape information.
TODO : test this op with sparse and cuda ndarray. Do C code for them too. TODO : test this op with sparse. Do C code for them too.
""" """
......
...@@ -267,11 +267,8 @@ class ProfileStats(object): ...@@ -267,11 +267,8 @@ class ProfileStats(object):
def __init__(self, atexit_print=True, flag_time_thunks=None, def __init__(self, atexit_print=True, flag_time_thunks=None,
gpu_checks=True, **kwargs): gpu_checks=True, **kwargs):
if (gpu_checks and if (gpu_checks and
((hasattr(theano, 'sandbox') and (hasattr(theano, 'gpuarray') and
hasattr(theano.sandbox, 'cuda') and theano.gpuarray.pygpu_activated) and
theano.sandbox.cuda.cuda_enabled) or (
hasattr(theano, 'gpuarray') and
theano.gpuarray.pygpu_activated)) and
os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1'): os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1'):
msg = ( msg = (
"You are running the Theano profiler with CUDA enabled." "You are running the Theano profiler with CUDA enabled."
...@@ -290,9 +287,9 @@ class ProfileStats(object): ...@@ -290,9 +287,9 @@ class ProfileStats(object):
theano.gpuarray.pygpu_activated and theano.gpuarray.pygpu_activated and
not config.profiling.ignore_first_call): not config.profiling.ignore_first_call):
warnings.warn( warnings.warn(
"Theano flag profiling.ignore_first_call is False." "Theano flag profiling.ignore_first_call is False. "
" This cause bad profiling result in the new gpu" "This cause bad profiling result in the gpu "
" back-end, as sometimes we compile at the first call.") "back-end, as sometimes we compile at the first call.")
self.apply_callcount = {} self.apply_callcount = {}
self.output_size = {} self.output_size = {}
...@@ -514,8 +511,8 @@ class ProfileStats(object): ...@@ -514,8 +511,8 @@ class ProfileStats(object):
tot += t tot += t
ftot = tot * 100 / local_time ftot = tot * 100 / local_time
# Remove the useless start and end of the class name: # Remove the useless start and end of the class name:
# "<class 'theano.sandbox.cuda.blas.GpuDot22'>" -> # "<class 'theano.gpuarray.blas.GpuDot22'>" ->
# "theano.sandbox.cuda.blas.GpuDot22" # "theano.gpuarray.blas.GpuDot22"
class_name = str(a)[8:-2][:maxlen] class_name = str(a)[8:-2][:maxlen]
print(format_str % (f, ftot, t, t / nb_call, print(format_str % (f, ftot, t, t / nb_call,
impl, nb_call, impl, nb_call,
...@@ -832,7 +829,8 @@ class ProfileStats(object): ...@@ -832,7 +829,8 @@ class ProfileStats(object):
new allocation. new allocation.
""" """
from theano.sandbox.cuda import CudaNdarrayType from theano.gpuarray import GpuArrayType
# Initial Mem info values [CPU, GPU] # Initial Mem info values [CPU, GPU]
node_memory_size = [0, 0] node_memory_size = [0, 0]
running_memory_size = [0, 0] running_memory_size = [0, 0]
...@@ -882,7 +880,7 @@ class ProfileStats(object): ...@@ -882,7 +880,7 @@ class ProfileStats(object):
# allocated by the node # allocated by the node
idx2 = 0 idx2 = 0
for out in node.outputs: for out in node.outputs:
if isinstance(out.type, CudaNdarrayType): if isinstance(out.type, GpuArrayType):
cg = 1 cg = 1
else: else:
cg = 0 cg = 0
...@@ -924,7 +922,7 @@ class ProfileStats(object): ...@@ -924,7 +922,7 @@ class ProfileStats(object):
for ins in set(node.inputs): for ins in set(node.inputs):
assert not (ins in view_of and viewed_by[ins]) assert not (ins in view_of and viewed_by[ins])
# we trac the original var, so this shouldn't happen # we trac the original var, so this shouldn't happen
if isinstance(ins.type, CudaNdarrayType): if isinstance(ins.type, GpuArrayType):
cg = 1 cg = 1
else: else:
cg = 0 cg = 0
...@@ -1257,16 +1255,6 @@ class ProfileStats(object): ...@@ -1257,16 +1255,6 @@ class ProfileStats(object):
print("---", file=file) print("---", file=file)
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
hasattr(theano.sandbox.cuda, 'cuda_ndarray') and
hasattr(theano.sandbox.cuda.cuda_ndarray.cuda_ndarray,
'theano_allocated')):
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
_, gpu_max = cuda_ndarray.theano_allocated()
print(" Max Memory allocated on the GPU (for all functions): "
"%dKB" % int(round(gpu_max / 1024.)), file=file)
print("", file=file) print("", file=file)
if len(fct_memory) > 1: if len(fct_memory) > 1:
print(" This list is based on all functions in the profile", print(" This list is based on all functions in the profile",
...@@ -1469,7 +1457,6 @@ class ProfileStats(object): ...@@ -1469,7 +1457,6 @@ class ProfileStats(object):
printed_tip = True printed_tip = True
# tip 7 # tip 7
import theano.sandbox.cuda as cuda
from theano.tensor.nnet import LogSoftmax from theano.tensor.nnet import LogSoftmax
import theano.tensor.signal.pool as pool import theano.tensor.signal.pool as pool
import theano.gpuarray import theano.gpuarray
...@@ -1477,12 +1464,12 @@ class ProfileStats(object): ...@@ -1477,12 +1464,12 @@ class ProfileStats(object):
for a in self.apply_time: for a in self.apply_time:
node = a node = a
if (isinstance(node.op, pool.Pool)): if (isinstance(node.op, pool.Pool)):
if (not cuda.dnn.dnn_available() and not theano.gpuarray.dnn.dnn_present()): if not theano.gpuarray.dnn.dnn_present():
print("Install CuDNN to do pooling faster" print("Install CuDNN to do pooling faster"
"this allows the operation to run on GPU") "this allows the operation to run on GPU")
printed_tip = True printed_tip = True
if (isinstance(node.op, LogSoftmax)): if (isinstance(node.op, LogSoftmax)):
if (not cuda.dnn.dnn_available() and not theano.gpuarray.dnn.dnn_present()): if not theano.gpuarray.dnn.dnn_present():
print("Install CuDNN to do LogSoftmax faster" print("Install CuDNN to do LogSoftmax faster"
"this allows the operation to run on GPU") "this allows the operation to run on GPU")
printed_tip = True printed_tip = True
......
...@@ -713,7 +713,6 @@ class VecAsRowAndCol(gof.Op): ...@@ -713,7 +713,6 @@ class VecAsRowAndCol(gof.Op):
if (c[0] is None) or (c[0].shape != (lv, 1)): if (c[0] is None) or (c[0].shape != (lv, 1)):
c[0] = node.outputs[1].type.value_zeros((lv, 1)) c[0] = node.outputs[1].type.value_zeros((lv, 1))
# Python loop because CudaNdarrays do not support newaxis
for i in range(lv): for i in range(lv):
r[0][0, i] = v[i] r[0][0, i] = v[i]
c[0][i, 0] = v[i] c[0][i, 0] = v[i]
...@@ -794,24 +793,3 @@ class Test_preallocated_output(unittest.TestCase): ...@@ -794,24 +793,3 @@ class Test_preallocated_output(unittest.TestCase):
v_val = self.rng.randn(5).astype('float32') v_val = self.rng.randn(5).astype('float32')
f(v_val) f(v_val)
def test_output_broadcast_cuda(self):
from theano.sandbox import cuda
if not cuda.cuda_available:
raise SkipTest("Optional package Cuda disabled")
if cuda.use.device_number is None:
# We should normally set VecAsRowAndCol as a GPUOp But we
# don't want to do this here as this will disable others
# tests in this file. So we manually init the GPU if
# needed to remove warning.
cuda.use("gpu",
force=True,
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False)
v = cuda.fvector('v')
c, r = VecAsRowAndCol()(v)
f = theano.function([v], [c, r])
v_val = cuda.CudaNdarray(self.rng.randn(5).astype('float32'))
f(v_val)
...@@ -89,19 +89,20 @@ class DeviceParam(ConfigParam): ...@@ -89,19 +89,20 @@ class DeviceParam(ConfigParam):
self.default = default self.default = default
def filter(val): def filter(val):
if val == self.default or val.startswith('gpu') \ if (val == self.default or
or val.startswith('opencl') or val.startswith('cuda'): val.startswith('opencl') or
val.startswith('cuda')):
return val return val
else: else:
raise ValueError(('Invalid value ("%s") for configuration ' raise ValueError(('Invalid value ("%s") for configuration '
'variable "%s". Valid options start with ' 'variable "%s". Valid options start with '
'one of "%s", "gpu", "opencl", "cuda"' 'one of "%s", "opencl", "cuda"'
% (self.default, val, self.fullname))) % (self.default, val, self.fullname)))
over = kwargs.get("allow_override", True) over = kwargs.get("allow_override", True)
super(DeviceParam, self).__init__(default, filter, over) super(DeviceParam, self).__init__(default, filter, over)
def __str__(self): def __str__(self):
return '%s (%s, gpu*, opencl*, cuda*) ' % (self.fullname, self.default) return '%s (%s, opencl*, cuda*) ' % (self.fullname, self.default)
AddConfigVar( AddConfigVar(
'device', 'device',
...@@ -175,88 +176,6 @@ AddConfigVar( ...@@ -175,88 +176,6 @@ AddConfigVar(
in_c_key=False) in_c_key=False)
AddConfigVar(
'enable_initial_driver_test',
"Tests the nvidia driver when a GPU device is initialized.",
BoolParam(True, allow_override=False),
in_c_key=False)
def default_cuda_root():
v = os.getenv('CUDA_ROOT', "")
if v:
return v
s = os.getenv("PATH")
if not s:
return ''
for dir in s.split(os.path.pathsep):
if os.path.exists(os.path.join(dir, "nvcc")):
return os.path.dirname(os.path.abspath(dir))
return ''
AddConfigVar(
'cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
""",
StrParam(default_cuda_root),
in_c_key=False)
AddConfigVar(
'cuda.enabled',
'If false, C code in old backend is not compiled.',
BoolParam(True),
in_c_key=False)
def filter_nvcc_flags(s):
assert isinstance(s, str)
flags = [flag for flag in s.split(' ') if flag]
if any([f for f in flags if not f.startswith("-")]):
raise ValueError(
"Theano nvcc.flags support only parameter/value pairs without"
" space between them. e.g.: '--machine 64' is not supported,"
" but '--machine=64' is supported. Please add the '=' symbol."
" nvcc.flags value is '%s'" % s)
return ' '.join(flags)
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
ConfigParam("", filter_nvcc_flags),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory",
StrParam(""),
in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('nvcc.cudafe',
"If 'always' (the default), cudafe will be called for every GPU"
" Op compilation. If 'heuristic', it will only be called if the"
" source code appears to contain CUDA code. This can speed up"
" compilation and importing theano, but might fail to compile"
" some custom GPU Ops.",
EnumStr('always', 'heuristic'),
# Not needed in c key, does not affect the compilation result.
in_c_key=False)
AddConfigVar('gpuarray.sync', AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before """If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution, returning. Setting this to True will slow down execution,
...@@ -391,11 +310,25 @@ AddConfigVar('dnn.conv.precision', ...@@ -391,11 +310,25 @@ AddConfigVar('dnn.conv.precision',
in_c_key=False) in_c_key=False)
def get_cuda_root():
v = os.getenv('CUDA_ROOT', "")
if v:
return v
s = os.getenv("PATH")
if not s:
return ''
for dir in s.split(os.path.pathsep):
if os.path.exists(os.path.join(dir, "nvcc")):
return os.path.dirname(os.path.abspath(dir))
return ''
def default_dnn_path(suffix): def default_dnn_path(suffix):
def f(suffix=suffix): def f(suffix=suffix):
if theano.config.cuda.root == '': cuda_root = get_cuda_root()
if cuda_root == '':
return '' return ''
return os.path.join(theano.config.cuda.root, suffix) return os.path.join(cuda_root, suffix)
return f return f
AddConfigVar('dnn.include_path', AddConfigVar('dnn.include_path',
...@@ -656,8 +589,8 @@ AddConfigVar( ...@@ -656,8 +589,8 @@ AddConfigVar(
in_c_key=False) in_c_key=False)
AddConfigVar('experimental.unpickle_gpu_on_cpu', AddConfigVar('experimental.unpickle_gpu_on_cpu',
"Allow unpickling of pickled CudaNdarrays as numpy.ndarrays." "Allow unpickling of pickled GpuArrays as numpy.ndarrays."
"This is useful, if you want to open a CudaNdarray without " "This is useful, if you want to open a GpuArray without "
"having cuda installed." "having cuda installed."
"If you have cuda installed, this will force unpickling to" "If you have cuda installed, this will force unpickling to"
"be done on the cpu to numpy.ndarray." "be done on the cpu to numpy.ndarray."
...@@ -1518,38 +1451,6 @@ AddConfigVar('scan.debug', ...@@ -1518,38 +1451,6 @@ AddConfigVar('scan.debug',
BoolParam(False), BoolParam(False),
in_c_key=False) in_c_key=False)
AddConfigVar('pycuda.init',
"""If True, always initialize PyCUDA when Theano want to
initilize the GPU. Currently, we must always initialize
PyCUDA before Theano do it. Setting this flag to True,
ensure that, but always import PyCUDA. It can be done
manually by importing theano.misc.pycuda_init before theano
initialize the GPU device.
""",
BoolParam(False),
in_c_key=False)
AddConfigVar('cublas.lib',
"""Name of the cuda blas library for the linker.""",
StrParam('cublas'),
# Added elsewhere in the c key only when needed.
in_c_key=False)
AddConfigVar('lib.cnmem',
"""Do we enable CNMeM or not (a faster CUDA memory allocator).
The parameter represent the start size (in MB or % of
total GPU memory) of the memory pool.
0: not enabled.
0 < N <= 1: % of the total GPU memory (clipped to .985 for driver memory)
> 0: use that number of MB of memory.
""",
# We should not mix both allocator, so we can't override
FloatParam(0, lambda i: i >= 0, allow_override=False),
in_c_key=False)
AddConfigVar('compile.wait', AddConfigVar('compile.wait',
"""Time to wait before retrying to aquire the compile lock.""", """Time to wait before retrying to aquire the compile lock.""",
IntParam(5, lambda i: i > 0, allow_override=False), IntParam(5, lambda i: i > 0, allow_override=False),
......
...@@ -751,8 +751,6 @@ class CLinker(link.Linker): ...@@ -751,8 +751,6 @@ class CLinker(link.Linker):
# This ensures that, when defining functions in support code, # This ensures that, when defining functions in support code,
# we cannot have two different functions, in different modules, # we cannot have two different functions, in different modules,
# that have the same name. # that have the same name.
# It was problematic, in particular, on Mac OS X (10.6 and 10.7)
# when defining CUDA kernels (with Cuda 4.2 and 5.0). See gh-1172.
name = "node_<<<<HASH_PLACEHOLDER>>>>_%i" % node_num name = "node_<<<<HASH_PLACEHOLDER>>>>_%i" % node_num
isyms = [symbol[r] for r in node.inputs] isyms = [symbol[r] for r in node.inputs]
osyms = [symbol[r] for r in node.outputs] osyms = [symbol[r] for r in node.outputs]
......
...@@ -796,12 +796,6 @@ class ModuleCache(object): ...@@ -796,12 +796,6 @@ class ModuleCache(object):
msg='broken cache directory [EOF]', msg='broken cache directory [EOF]',
level=logging.WARNING) level=logging.WARNING)
continue continue
except ValueError:
# This can happen when we have bad config value
# in the cuda.nvcc_compiler.py file.
# We should not hide it here, as this will cause
# an unrelated error to appear.
raise
except Exception: except Exception:
unpickle_failure() unpickle_failure()
if delete_if_problem: if delete_if_problem:
...@@ -1323,7 +1317,7 @@ class ModuleCache(object): ...@@ -1323,7 +1317,7 @@ class ModuleCache(object):
to -1 in order to delete all unversioned cached modules regardless to -1 in order to delete all unversioned cached modules regardless
of their age. of their age.
clear_base_files : bool clear_base_files : bool
If True, then delete base directories 'cuda_ndarray', 'cutils_ext', If True, then delete base directories 'cutils_ext',
'lazylinker_ext' and 'scan_perform' if they are present. 'lazylinker_ext' and 'scan_perform' if they are present.
If False, those directories are left intact. If False, those directories are left intact.
delete_if_problem delete_if_problem
...@@ -1340,8 +1334,8 @@ class ModuleCache(object): ...@@ -1340,8 +1334,8 @@ class ModuleCache(object):
def clear_base_files(self): def clear_base_files(self):
""" """
Remove base directories 'cuda_ndarray', 'cutils_ext', 'lazylinker_ext' Remove base directories 'cutils_ext', 'lazylinker_ext' and
and 'scan_perform' if present. 'scan_perform' if present.
Note that we do not delete them outright because it may not work on Note that we do not delete them outright because it may not work on
some systems due to these modules being currently in use. Instead we some systems due to these modules being currently in use. Instead we
...@@ -1350,8 +1344,7 @@ class ModuleCache(object): ...@@ -1350,8 +1344,7 @@ class ModuleCache(object):
""" """
with compilelock.lock_ctx(): with compilelock.lock_ctx():
for base_dir in ('cuda_ndarray', 'cutils_ext', 'lazylinker_ext', for base_dir in ('cutils_ext', 'lazylinker_ext', 'scan_perform'):
'scan_perform'):
to_delete = os.path.join(self.dirname, base_dir + '.delete.me') to_delete = os.path.join(self.dirname, base_dir + '.delete.me')
if os.path.isdir(to_delete): if os.path.isdir(to_delete):
try: try:
......
...@@ -216,7 +216,7 @@ class Apply(Node): ...@@ -216,7 +216,7 @@ class Apply(Node):
strict : bool strict : bool
If True, the type fields of all the inputs must be equal If True, the type fields of all the inputs must be equal
to the current ones (or compatible, for instance Tensor / to the current ones (or compatible, for instance Tensor /
CudaNdarray of the same dtype and broadcastable patterns, GpuArray of the same dtype and broadcastable patterns,
in which case they will be converted into current Type), and in which case they will be converted into current Type), and
returned outputs are guaranteed to have the same types as returned outputs are guaranteed to have the same types as
self.outputs. If False, then there's no guarantee that the self.outputs. If False, then there's no guarantee that the
...@@ -308,7 +308,7 @@ class Variable(Node): ...@@ -308,7 +308,7 @@ class Variable(Node):
- `SparseVariable` subclass of Variable that represents - `SparseVariable` subclass of Variable that represents
a scipy.sparse.{csc,csr}_matrix object. a scipy.sparse.{csc,csr}_matrix object.
- `CudaNdarrayVariable` subclass of Variable that represents our object on - `GpuArrayVariable` subclass of Variable that represents our object on
the GPU that is a subset of numpy.ndarray. the GPU that is a subset of numpy.ndarray.
- `RandomVariable`. - `RandomVariable`.
......
...@@ -15,8 +15,6 @@ from theano.gof.graph import ( ...@@ -15,8 +15,6 @@ from theano.gof.graph import (
is_same_graph, Variable) is_same_graph, Variable)
from theano.gof.op import Op from theano.gof.op import Op
from theano.gof.type import Type from theano.gof.type import Type
from theano.sandbox.cuda.var import (
CudaNdarrayVariable, CudaNdarrayConstant, CudaNdarraySharedVariable)
def as_variable(x): def as_variable(x):
...@@ -386,22 +384,6 @@ class TestAutoName: ...@@ -386,22 +384,6 @@ class TestAutoName:
assert r2.auto_name == "auto_" + str(autoname_id + 1) assert r2.auto_name == "auto_" + str(autoname_id + 1)
assert r3.auto_name == "auto_" + str(autoname_id + 2) assert r3.auto_name == "auto_" + str(autoname_id + 2)
def test_cudandarrayvariable(self):
# Get counter value
autoname_id = next(Variable.__count__)
Variable.__count__ = count(autoname_id)
mytype = tensor.TensorType(dtype='int32', broadcastable=())
r1 = CudaNdarrayVariable(type='int32')
r2 = CudaNdarrayVariable(type='int32')
r3 = CudaNdarrayConstant(type=mytype,
data=1)
r4 = CudaNdarraySharedVariable(name='x', type=mytype,
value=1, strict=False)
assert r1.auto_name == "auto_" + str(autoname_id)
assert r2.auto_name == "auto_" + str(autoname_id + 1)
assert r3.auto_name == "auto_" + str(autoname_id + 2)
assert r4.auto_name == "auto_" + str(autoname_id + 3)
def test_randomvariable(self): def test_randomvariable(self):
# Get counter value # Get counter value
autoname_id = next(Variable.__count__) autoname_id = next(Variable.__count__)
......
...@@ -279,21 +279,6 @@ if run_memory_usage_tests: ...@@ -279,21 +279,6 @@ if run_memory_usage_tests:
# these are not normal unit tests, do not run them as part of standard # these are not normal unit tests, do not run them as part of standard
# suite. I ran them while looking at top, and stopped when memory usage # suite. I ran them while looking at top, and stopped when memory usage
# was stable. # was stable.
def test_leak2():
import theano.sandbox.cuda as cuda
for i in xrange(1000000):
n = np.asarray([2.3, 4.5], dtype='f')
c = sys.getrefcount(n)
a = cuda.CudaNdarray(n)
a.sum()
assert c == sys.getrefcount(n)
del a
if not i % 1000:
print('.', end=' ')
print(gc.collect(), end=' ')
print(gc.collect())
sys.stdout.flush()
def test_no_leak_many_graphs(): def test_no_leak_many_graphs():
# Verify no memory leaks when creating and deleting a lot of functions # Verify no memory leaks when creating and deleting a lot of functions
......
...@@ -326,8 +326,9 @@ class PureType(object): ...@@ -326,8 +326,9 @@ class PureType(object):
Convert a symbolic variable into this Type, if compatible. Convert a symbolic variable into this Type, if compatible.
For the moment, the only Types compatible with one another are For the moment, the only Types compatible with one another are
TensorType and CudaNdarrayType, provided they have the same TensorType and GpuArrayType, provided they have the same
number of dimensions, same broadcasting pattern, and same dtype. number of dimensions, same broadcasting pattern, and same
dtype.
If Types are not compatible, a TypeError should be raised. If Types are not compatible, a TypeError should be raised.
......
...@@ -28,9 +28,9 @@ except ImportError: ...@@ -28,9 +28,9 @@ except ImportError:
try: try:
import skcuda import skcuda
from skcuda import fft from skcuda import fft
scikits_cuda_available = True skcuda_available = True
except (ImportError, Exception): except (ImportError, Exception):
scikits_cuda_available = False skcuda_available = False
class CuRFFTOp(Op): class CuRFFTOp(Op):
...@@ -51,7 +51,7 @@ class CuRFFTOp(Op): ...@@ -51,7 +51,7 @@ class CuRFFTOp(Op):
# the shape given to the plan, so padding will have to be done in the op. # the shape given to the plan, so padding will have to be done in the op.
# The effect of padding on gradients has yet to be investigated. # The effect of padding on gradients has yet to be investigated.
if not scikits_cuda_available: if not skcuda_available:
raise RuntimeError("skcuda is needed for CuFFTOp") raise RuntimeError("skcuda is needed for CuFFTOp")
if not pygpu_available: if not pygpu_available:
...@@ -175,7 +175,7 @@ class CuIRFFTOp(Op): ...@@ -175,7 +175,7 @@ class CuIRFFTOp(Op):
# the shape given to the plan, so padding will have to be done in the op. # the shape given to the plan, so padding will have to be done in the op.
# The effect of padding on gradients has yet to be investigated. # The effect of padding on gradients has yet to be investigated.
if not scikits_cuda_available: if not skcuda_available:
raise RuntimeError("skcuda is needed for CuIFFTOp") raise RuntimeError("skcuda is needed for CuIFFTOp")
if not pygpu_available: if not pygpu_available:
...@@ -370,7 +370,7 @@ def _unitary(norm): ...@@ -370,7 +370,7 @@ def _unitary(norm):
"'no norm'" % norm) "'no norm'" % norm)
return norm return norm
if scikits_cuda_available: if skcuda_available:
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.fft.RFFTOp]) @op_lifter([theano.tensor.fft.RFFTOp])
@register_opt2([theano.tensor.fft.RFFTOp], 'fast_compile') @register_opt2([theano.tensor.fft.RFFTOp], 'fast_compile')
......
差异被折叠。
...@@ -1155,26 +1155,6 @@ def test_version(): ...@@ -1155,26 +1155,6 @@ def test_version():
assert isinstance(dnn.version(), int) assert isinstance(dnn.version(), int)
def test_nvcc_compiler_bindir_and_flags():
# This tests if the options nvcc.compiler_bindir and nvcc.flags from
# the old sandbox.cuda backend are not passed to the g++ compiler in
# the new backend. (Regression test for issues GH-4978 and GH-5373.)
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
old_nvcc_compiler_bindir = theano.config.nvcc.compiler_bindir
old_nvcc_flags = theano.config.nvcc.flags
try:
theano.config.nvcc.compiler_bindir = "/usr/bin"
theano.config.nvcc.flags = "--cuda"
# compiling should still work, which means that the options
# have not been passed to the compiler
ret, msg = dnn._dnn_check_compile()
assert ret, msg
finally:
theano.config.nvcc.compiler_bindir = old_nvcc_compiler_bindir
theano.config.nvcc.flags = old_nvcc_flags
class test_SoftMax(test_nnet.test_SoftMax): class test_SoftMax(test_nnet.test_SoftMax):
gpu_op = dnn.GpuDnnSoftmax gpu_op = dnn.GpuDnnSoftmax
gpu_grad_op = dnn.GpuDnnSoftmaxGrad gpu_grad_op = dnn.GpuDnnSoftmaxGrad
......
...@@ -12,11 +12,11 @@ from .config import mode_with_gpu ...@@ -12,11 +12,11 @@ from .config import mode_with_gpu
# Skip tests if pygpu is not available. # Skip tests if pygpu is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
from theano.gpuarray.fft import pygpu_available, scikits_cuda_available, pycuda_available from theano.gpuarray.fft import pygpu_available, skcuda_available, pycuda_available
if not pygpu_available: # noqa if not pygpu_available: # noqa
raise SkipTest('Optional package pygpu not available') raise SkipTest('Optional package pygpu not available')
if not scikits_cuda_available: # noqa if not skcuda_available: # noqa
raise SkipTest('Optional package scikits.cuda not available') raise SkipTest('Optional package scikit-cuda not available')
if not pycuda_available: # noqa if not pycuda_available: # noqa
raise SkipTest('Optional package pycuda not available') raise SkipTest('Optional package pycuda not available')
......
from __future__ import absolute_import, print_function, division
from .config import test_ctx_name, mode_with_gpu
from ..basic_ops import (HostFromGpu, GpuFromHost)
from ..type import (get_context, GpuArrayType, GpuArraySharedVariable,
gpuarray_shared_constructor)
import pygpu
import numpy as np
from theano.misc.tests.test_may_share_memory import may_share_memory_core
from theano.misc.pkl_utils import dump, load
from theano.tensor.tests import test_opt
class test_fusion(test_opt.test_fusion):
    # Re-run the CPU elemwise-fusion test suite on the GPU backend by
    # overriding the compilation mode and the shared-variable constructor.
    mode = mode_with_gpu
    _shared = staticmethod(gpuarray_shared_constructor)
    # Transfer ops are excluded from the topology checks so the fused-node
    # counts match the CPU reference graphs.
    topo_exclude = (GpuFromHost, HostFromGpu)
def test_may_share_memory():
    """Run the shared-memory detection checks on two distinct GpuArrays."""
    gpu_ctx = get_context(test_ctx_name)
    first, second = (pygpu.empty((5, 4), context=gpu_ctx) for _ in range(2))
    may_share_memory_core(first, second)
def test_dump_load():
    """Round-trip a GpuArraySharedVariable through dump/load.

    Serializes the shared variable to a temporary file, reloads it, and
    checks that both its name and its value survive pickling.
    """
    import os
    import tempfile

    x = GpuArraySharedVariable('x',
                               GpuArrayType('float32', (1, 1), name='x',
                                            context_name=test_ctx_name),
                               [[1]], False)

    # Use a unique temporary file instead of a fixed name in the current
    # working directory, and remove it even if an assertion fails, so the
    # test leaves no artifact behind and cannot collide with other runs.
    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'wb') as f:
            dump(x, f)
        with open(path, 'rb') as f:
            x = load(f)
    finally:
        os.remove(path)

    assert x.name == 'x'
    np.testing.assert_allclose(x.get_value(), [[1]])
from __future__ import absolute_import, print_function, division
import functools
import numpy as np
import theano
from theano import tensor
from theano.sandbox import rng_mrg
from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.sandbox.tests.test_rng_mrg import java_samples, rng_mrg_overflow
from theano.tests import unittest_tools as utt
from .config import mode_with_gpu as mode
from ..type import gpuarray_shared_constructor
from ..rng_mrg import GPUA_mrg_uniform
# Seed the unittest-tools RNG once at import so every test in this module
# draws reproducible random numbers.
utt.seed_rng()
def test_consistency_GPUA_serial():
    # Verify that the random numbers generated by GPUA_mrg_uniform, serially,
    # are the same as the reference (Java) implementation by L'Ecuyer et al.
    #
    # The collection order matters: samples are appended substream by
    # substream within each stream, and the final flattened array is
    # compared element-wise against the reference `java_samples`.
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7
    samples = []
    # An MRG31k3p state is six 32-bit integers; start every component
    # at the same seed value.
    curr_rstate = np.array([seed] * 6, dtype='int32')
    for i in range(n_streams):
        stream_rstate = curr_rstate.copy()
        for j in range(n_substreams):
            # One-row state matrix: this generator draws a single
            # substream at a time (the "serial" part of the test).
            substream_rstate = np.array([stream_rstate.copy()],
                                        dtype='int32')
            # Transfer to device
            rstate = gpuarray_shared_constructor(substream_rstate)
            new_rstate, sample = GPUA_mrg_uniform.new(rstate,
                                                      ndim=None,
                                                      dtype='float32',
                                                      size=(1,))
            # Advance the device-side state on every function call so
            # successive calls produce successive numbers.
            rstate.default_update = new_rstate
            # Not really necessary, just mimicking
            # rng_mrg.MRG_RandomStreams' behavior
            sample.rstate = rstate
            sample.update = (rstate, new_rstate)
            # We need the sample back in the main memory
            cpu_sample = tensor.as_tensor_variable(sample)
            f = theano.function([], cpu_sample, mode=mode)
            # Draw n_samples consecutive values from this substream.
            for k in range(n_samples):
                s = f()
                samples.append(s)
            # next substream
            stream_rstate = rng_mrg.ff_2p72(stream_rstate)
        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)
    samples = np.array(samples).flatten()
    assert(np.allclose(samples, java_samples))
def test_consistency_GPUA_parallel():
    # Verify that the random numbers generated by GPUA_mrg_uniform, in
    # parallel, are the same as the reference (Java) implementation by
    # L'Ecuyer et al.
    #
    # Unlike the serial test, all substreams of a stream are stacked into
    # one state matrix and sampled in a single call, so each draw returns
    # n_substreams values at once.
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7  # 7 samples will be drawn in parallel
    samples = []
    curr_rstate = np.array([seed] * 6, dtype='int32')
    for i in range(n_streams):
        stream_samples = []
        # Build one state row per substream by repeatedly fast-forwarding
        # by 2**72 steps from the stream's starting state.
        rstate = [curr_rstate.copy()]
        for j in range(1, n_substreams):
            rstate.append(rng_mrg.ff_2p72(rstate[-1]))
        rstate = np.asarray(rstate)
        rstate = gpuarray_shared_constructor(rstate)
        new_rstate, sample = GPUA_mrg_uniform.new(rstate, ndim=None,
                                                  dtype='float32',
                                                  size=(n_substreams,))
        # Advance all substream states on every call.
        rstate.default_update = new_rstate
        # Not really necessary, just mimicking
        # rng_mrg.MRG_RandomStreams' behavior
        sample.rstate = rstate
        sample.update = (rstate, new_rstate)
        # We need the sample back in the main memory
        cpu_sample = tensor.as_tensor_variable(sample)
        f = theano.function([], cpu_sample, mode=mode)
        for k in range(n_samples):
            s = f()
            stream_samples.append(s)
        # Transpose before flattening so the parallel draws interleave
        # into the same serial order as the reference samples.
        samples.append(np.array(stream_samples).T.flatten())
        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)
    samples = np.array(samples).flatten()
    assert(np.allclose(samples, java_samples))
def test_GPUA_full_fill():
    # Make sure the whole sample buffer is filled. Also make sure
    # large samples are consistent with CPU results.
    # NOTE(review): this bare import appears to be for its side effects
    # (setting up the GPU test context) — confirm before removing.
    import theano.gpuarray.tests.config
    from theano.gpuarray.type import gpuarray_shared_constructor
    # This needs to be large to trigger the problem on GPU
    size = (10, 1000)
    R = MRG_RandomStreams(234)
    uni = R.uniform(size, nstreams=60 * 256)
    f_cpu = theano.function([], uni)
    # Seed the GPU generator from the exact state the CPU stream object
    # recorded, so both functions draw from the same stream positions.
    rstate_gpu = gpuarray_shared_constructor(R.state_updates[-1][0].get_value())
    new_rstate, sample = GPUA_mrg_uniform.new(rstate_gpu, ndim=None,
                                              dtype='float32',
                                              size=size)
    rstate_gpu.default_update = new_rstate
    f_gpu = theano.function([], sample)
    # If the GPU kernel left part of the buffer unfilled, the comparison
    # against the CPU result would fail.
    utt.assert_allclose(f_cpu(), f_gpu())
def test_overflow_gpu_new_backend():
    """Check size validation in GPUA_mrg_uniform.new.

    Sizes whose element count overflows must raise, while small sizes
    (including ones given as np.int32) must be accepted.
    """
    from theano.gpuarray.tests.test_basic_ops import \
        mode_with_gpu as mode
    from theano.gpuarray.type import gpuarray_shared_constructor
    seed = 12345
    n_substreams = 7
    # Chain substream states with the 2**72 fast-forward, starting from a
    # state made of six copies of the seed.
    base_state = np.array([seed] * 6, dtype='int32')
    states = [base_state.copy()]
    while len(states) < n_substreams:
        states.append(rng_mrg.ff_2p72(states[-1]))
    shared_state = gpuarray_shared_constructor(np.asarray(states))
    sampler = functools.partial(GPUA_mrg_uniform.new, shared_state,
                                ndim=None, dtype='float32')
    # should raise error as the size overflows
    rng_mrg_overflow([(2**31, ), (2**32, ), (2**15, 2**16,),
                      (2, 2**15, 2**15)],
                     sampler, mode, should_raise_error=True)
    # should not raise error
    rng_mrg_overflow([(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)],
                     sampler, mode, should_raise_error=False)
    # should support int32 sizes
    rng_mrg_overflow([(np.int32(2**10), ),
                      (np.int32(2), np.int32(2**10), np.int32(2**10))],
                     sampler, mode, should_raise_error=False)
def test_validate_input_types_gpuarray_backend():
    """mrg_uniform.new must accept a GpuArray shared state even when
    compute_test_value is set to 'raise'."""
    from theano.sandbox.rng_mrg import mrg_uniform
    from theano.gpuarray.type import gpuarray_shared_constructor
    from theano.configparser import change_flags
    with change_flags(compute_test_value="raise"):
        # Building the shared variable inside the flag context exercises
        # test-value computation on the GPU-backed state as well.
        zero_state = gpuarray_shared_constructor(
            np.zeros((7, 6), dtype="int32"))
        mrg_uniform.new(zero_state, ndim=None, dtype="float32", size=(3,))
...@@ -9,6 +9,9 @@ from theano import config ...@@ -9,6 +9,9 @@ from theano import config
from theano.compile import DeepCopyOp from theano.compile import DeepCopyOp
from theano.misc.pkl_utils import CompatUnpickler from theano.misc.pkl_utils import CompatUnpickler
# Disabled for now
# from theano.tensor.tests.test_sharedvar import makeSharedTester
from .config import test_ctx_name from .config import test_ctx_name
from .test_basic_ops import rand_gpuarray from .test_basic_ops import rand_gpuarray
from ..type import GpuArrayType, gpuarray_shared_constructor from ..type import GpuArrayType, gpuarray_shared_constructor
...@@ -76,3 +79,45 @@ def test_unpickle_gpuarray_as_numpy_ndarray_flag0(): ...@@ -76,3 +79,45 @@ def test_unpickle_gpuarray_as_numpy_ndarray_flag0():
assert np.asarray(mat)[0] == -42.0 assert np.asarray(mat)[0] == -42.0
finally: finally:
config.experimental.unpickle_gpu_on_cpu = oldflag config.experimental.unpickle_gpu_on_cpu = oldflag
# These tests are disabled because they expect the impossible
"""
@makeSharedTester(
shared_constructor_=gpuarray_shared_constructor,
dtype_=theano.config.floatX,
get_value_borrow_true_alias_=True,
shared_borrow_true_alias_=True,
set_value_borrow_true_alias_=True,
set_value_inplace_=True,
set_cast_value_inplace_=False,
shared_constructor_accept_ndarray_=True,
internal_type_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
cls=pygpu._array.ndgpuarray),
test_internal_type_=lambda a: isinstance(a, pygpu.gpuarray.GpuArray),
theano_fct_=theano.tensor.exp,
ref_fct_=np.exp,
cast_value_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
cls=pygpu._array.ndgpuarray))
class test_shared_options(object):
pass
@makeSharedTester(
shared_constructor_=gpuarray_shared_constructor,
dtype_=theano.config.floatX,
get_value_borrow_true_alias_=False,
shared_borrow_true_alias_=False,
set_value_borrow_true_alias_=False,
set_value_inplace_=True,
set_cast_value_inplace_=True,
shared_constructor_accept_ndarray_=True,
internal_type_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
cls=pygpu._array.ndgpuarray),
test_internal_type_=lambda a: isinstance(a, pygpu.gpuarray.GpuArray),
theano_fct_=theano.tensor.exp,
ref_fct_=np.exp,
cast_value_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
cls=pygpu._array.ndgpuarray))
class test_shared_options2(object):
pass
"""
...@@ -218,7 +218,7 @@ def Rop(f, wrt, eval_points): ...@@ -218,7 +218,7 @@ def Rop(f, wrt, eval_points):
str(eval_point.type.ndim)) str(eval_point.type.ndim))
except AttributeError: except AttributeError:
# wrt_elem and eval_point don't always have ndim like random type # wrt_elem and eval_point don't always have ndim like random type
# Tensor, Sparse and CudaNdArray have the ndim attribute # Tensor, Sparse and GpuArray have the ndim attribute
pass pass
seen_nodes = OrderedDict() seen_nodes = OrderedDict()
......
...@@ -168,8 +168,8 @@ class IfElse(Op): ...@@ -168,8 +168,8 @@ class IfElse(Op):
) )
c = theano.tensor.as_tensor_variable(c) c = theano.tensor.as_tensor_variable(c)
if not self.gpu: if not self.gpu:
# When gpu is true, we are given only cuda ndarrays, and we want # When gpu is true, we are given only gpuarrays, and we want
# to keep them be cuda ndarrays # to keep them as gpuarrays
nw_args = [] nw_args = []
for x in args: for x in args:
if hasattr(x, '_as_TensorVariable'): if hasattr(x, '_as_TensorVariable'):
......
...@@ -11,7 +11,6 @@ import os ...@@ -11,7 +11,6 @@ import os
import sys import sys
import time import time
from optparse import OptionParser from optparse import OptionParser
import subprocess
import numpy as np import numpy as np
import theano import theano
...@@ -51,12 +50,6 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, ...@@ -51,12 +50,6 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
print('Numpy dot module:', np.dot.__module__) print('Numpy dot module:', np.dot.__module__)
print('Numpy location:', np.__file__) print('Numpy location:', np.__file__)
print('Numpy version:', np.__version__) print('Numpy version:', np.__version__)
if (theano.config.device.startswith("gpu") or
theano.config.init_gpu_device.startswith("gpu")):
print('nvcc version:')
subprocess.call((theano.sandbox.cuda.nvcc_compiler.nvcc_path,
"--version"))
print()
a = theano.shared(np.ones((M, N), dtype=theano.config.floatX, a = theano.shared(np.ones((M, N), dtype=theano.config.floatX,
order=order)) order=order))
...@@ -88,17 +81,15 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, ...@@ -88,17 +81,15 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
f() # Ignore first function call to get representative time. f() # Ignore first function call to get representative time.
if execute: if execute:
sync = (hasattr(theano, "sandbox") and sync = (hasattr(theano, "gpuarray") and
hasattr(theano.sandbox, "cuda") and
isinstance(c, theano.sandbox.cuda.CudaNdarraySharedVariable))
sync2 = (hasattr(theano, "gpuarray") and
isinstance(c, theano.gpuarray.GpuArraySharedVariable)) isinstance(c, theano.gpuarray.GpuArraySharedVariable))
if sync:
# Make sure we don't include the time from the first call
c.get_value(borrow=True, return_internal_type=True).sync()
t0 = time.time() t0 = time.time()
for i in range(iters): for i in range(iters):
f() f()
if sync: if sync:
theano.sandbox.cuda.synchronize()
if sync2:
c.get_value(borrow=True, return_internal_type=True).sync() c.get_value(borrow=True, return_internal_type=True).sync()
t1 = time.time() t1 = time.time()
return t1 - t0, impl return t1 - t0, impl
...@@ -199,85 +190,30 @@ if __name__ == "__main__": ...@@ -199,85 +190,30 @@ if __name__ == "__main__":
goto2 1.13/8 1.94s goto2 1.13/8 1.94s
goto2 1.13/16 3.16s goto2 1.13/16 3.16s
Test time in float32 Test time in float32. There were 10 executions of gemm in
float32 with matrices of shape 5000x5000 (M=N=K=5000)
cuda version 6.5 6.0 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu
K6000/NOECC 0.06s 0.06s
K40 0.07s
K20m/ECC 0.08s 0.08s 0.07s
K20/NOECC 0.07s
M2090 0.19s
C2075 0.25s
M2075 0.25s
M2070 0.25s 0.27s 0.32s
M2070-Q 0.48s 0.27s 0.32s
M2050(Amazon) 0.25s
C1060 0.46s
K600 1.04s
GTX Titan Black 0.05s
GTX Titan(D15U-50) 0.06s 0.06s don't work
GTX 780 0.06s
GTX 980 0.06s
GTX 970 0.08s
GTX 680 0.11s 0.12s 0.154s 0.218s
GRID K520 0.14s
GTX 580 0.16s 0.16s 0.164s 0.203s
GTX 480 0.19s 0.19s 0.192s 0.237s 0.27s
GTX 750 Ti 0.20s
GTX 470 0.23s 0.23s 0.238s 0.297s 0.34s
GTX 660 0.18s 0.20s 0.23s
GTX 560 0.30s
GTX 650 Ti 0.27s
GTX 765M 0.27s
GTX 460 0.37s 0.45s
GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
750M 0.49s
GT 610 2.38s
GTX 550 Ti 0.57s
GT 520 2.68s 3.06s
GT 520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
GT 220 3.80s
GT 210 6.35s
8500 GT 10.68s
Results for larger matrices.
There were 10 executions of gemm in float32
with matrices of shape 5000x5000 (M=N=K=5000).
All memory layout was in C order. All memory layout was in C order.
cuda version 7.5 7.0 6.5
cuda version 8.0 7.5 7.0
gpu gpu
M40 0.47s M40 0.45s 0.47s
k80 0.96s k80 0.92s 0.96s
K6000/NOECC 0.69s K6000/NOECC 0.71s 0.69s
K40 0.88s P6000/NOECC 0.25s
K20m/ECC
K20/NOECC Titan X (Pascal) 0.28s
M2090 GTX Titan X 0.45s 0.45s 0.47s
C2075 GTX Titan Black 0.66s 0.64s 0.64s
M2075 GTX 1080 0.35s
M2070
M2070-Q
M2050(Amazon)
C1060
K600
GTX Titan X 0.45s 0.47s
GTX Titan Black 0.64s 0.64s
GTX Titan(D15U-50)
GTX 780
GTX 980 Ti 0.41s GTX 980 Ti 0.41s
GTX 980
GTX 970 0.66s GTX 970 0.66s
GTX 680 1.57s GTX 680 1.57s
GRID K520
GTX 750 Ti 2.01s 2.01s GTX 750 Ti 2.01s 2.01s
GTX 750 2.46s 2.37s GTX 750 2.46s 2.37s
GTX 660 2.32s 2.32s GTX 660 2.32s 2.32s
GTX 580 2.42s 2.47s GTX 580 2.42s
GTX 480 2.87s 2.88s GTX 480 2.87s
TX1 7.6s (float32 storage and computation) TX1 7.6s (float32 storage and computation)
GT 610 33.5s GT 610 33.5s
""") """)
......
"""
This code can only work if cudamat and theano are initialized on the
same gpu as theano.
WARNING: In the test of this file there is a transpose that is used...
So there can be problem with shape and stride order...
"""
from __future__ import absolute_import, print_function, division
import six
try:
import cudamat
cudamat_available = True
import theano.sandbox.cuda as cuda
if cuda.cuda_available is False:
raise ImportError('Optional theano package cuda disabled')
if six.PY3:
long = int
def cudandarray_to_cudamat(x, copyif=False):
""" take a CudaNdarray and return a cudamat.CUDAMatrix object.
:type x: CudaNdarray
:param x: The array to transform to cudamat.CUDAMatrix.
:type copyif: bool
:param copyif: If False, raise an error if x is not c contiguous.
If it is c contiguous, we return a GPUArray that share
the same memory region as x.
If True, copy x if it is no c contiguous, so the return won't
shape the same memory region. If c contiguous, the return
will share the same memory region.
We need to do this as GPUArray don't fully support strided memory.
:return type: cudamat.CUDAMatrix
"""
if not isinstance(x, cuda.CudaNdarray):
raise ValueError("We can transfer only CudaNdarray to cudamat.CUDAMatrix")
elif x.ndim != 2:
raise TypeError("cudandarray_to_cudamat: input must be 2-d (has %s dims). That's "
"because cudamat arrays are always 2-dimensional")
else:
# Check if it is c contiguous
size = 1
c_contiguous = True
for i in range(x.ndim - 1, -1, -1):
if x.shape[i] == 1:
continue
if x._strides[i] != size:
c_contiguous = False
break
size *= x.shape[i]
if not c_contiguous:
if copyif:
x = x.copy()
else:
raise ValueError("We where asked to don't copy memory, but the memory is not c contiguous.")
# Now x is always c contiguous.
# the next step is to create a CUDAMatrix object. We do so by first creating
# a cudamat object with no data_host.
cm_mat = cudamat.cudamat()
cm_mat.size[0] = x.shape[0]
cm_mat.size[1] = x.shape[1]
cm_mat.on_host = 0
cm_mat.on_device = 1
cm_mat.is_trans = 0
cm_mat.owns_data = 0 # <-- note: cm_mat dosen't owe the data; x does. So x will delete it.
# x.gpudata is a long. We need a pointer to a float. cast.
import ctypes
cm_mat.data_device = ctypes.cast(x.gpudata, ctypes.POINTER(ctypes.c_float))
px = cudamat.CUDAMatrix(cm_mat)
px._base = x # x won't be __del__'ed as long as px is around.
# let cudamat know that we don't have a numpy array attached.
px.mat_on_host = False
return px
def cudamat_to_cudandarray(x):
""" take a cudamat.CUDAMatrix and make a CudaNdarray that point to its memory
"""
if not isinstance(x, cudamat.CUDAMatrix):
raise ValueError("We can transfer only cudamat.CUDAMatrix to CudaNdarray")
# elif x.dtype != "float32":
# raise ValueError("CudaNdarray support only float32")
# We don't need this, because cudamat is always float32.
else:
strides = [1]
for i in x.shape[::-1][:-1]:
strides.append(strides[-1] * i)
strides = tuple(strides[::-1])
import ctypes
ptr_long = long(ctypes.cast(x.mat.data_device, ctypes.c_void_p).value)
# seems legit.
z = cuda.from_gpu_pointer(ptr_long, x.shape, strides, x)
return z
except (ImportError, OSError):
cudamat_available = False
"""
This code can only work if gnumpy and theano are initialized on the
same gpu as theano.
"""
from __future__ import absolute_import, print_function, division
import six
from six.moves import reduce
try:
import gnumpy
import cudamat
gnumpy_available = True
___const_garray = gnumpy.rand(1)
import theano.sandbox.cuda as cuda
if cuda.cuda_available is False:
raise ImportError('Optional theano package cuda disabled')
if six.PY3:
long = int
def cudandarray_to_garray(x, copyif=False):
""" take a CudaNdarray and return a gnumpy.garray object.
:type x: CudaNdarray
:param x: The array to transform to gnumpy.garray.
:type copyif: bool
:param copyif: If False, raise an error if x is not c contiguous.
If it is c contiguous, we return a GPUArray that share
the same memory region as x.
If True, copy x if it is no c contiguous, so the return won't
shape the same memory region. If c contiguous, the return
will share the same memory region.
We need to do this as GPUArray don't fully support strided memory.
:return type: cudamat.CUDAMatrix
"""
if not isinstance(x, cuda.CudaNdarray):
raise ValueError("We can transfer only CudaNdarray to cudamat.CUDAMatrix")
else:
# Check if it is c contiguous
size = 1
c_contiguous = True
for i in range(x.ndim - 1, -1, -1):
if x.shape[i] == 1:
continue
if x._strides[i] != size:
c_contiguous = False
break
size *= x.shape[i]
if not c_contiguous:
if copyif:
x = x.copy()
else:
raise ValueError("We where asked to don't copy memory, but the memory is not c contiguous.")
# Now x is always c contiguous.
# the next step is to create a CUDAMatrix object. We do so by first creating
# a cudamat object with no data_host.
cm_mat = cudamat.cudamat()
cm_mat.size[0] = reduce(lambda x, y: x * y, x.shape, 1)
cm_mat.size[1] = 1
cm_mat.on_host = 0
cm_mat.on_device = 1
cm_mat.is_trans = 0
cm_mat.owns_data = 0 # <-- note: cm_mat dosen't owe the data; x does. So x will delete it.
# x.gpudata is a long. We need a pointer to a float. cast.
import ctypes
cm_mat.data_device = ctypes.cast(x.gpudata, ctypes.POINTER(ctypes.c_float))
px = cudamat.CUDAMatrix(cm_mat)
px._base = x # x won't be freed if the cudamat object isn't freed.
# let cudamat know that we don't have a numpy array attached.
px.mat_on_host = False
# Note how gnumpy tracks its cudamat objects: it moves things to the
# _cmsReuseCache when the gnumpy array is deleted, thus the arrays
# returned by theano will never be deleted.
# However, if the garray thinks that the object is a view, then it won't
# move the _base to the _cmsResueCache; so the cudamat object will be deleted,
# and we won't overpump the world with memory.
_is_alias_of = ___const_garray
ans = gnumpy.garray(px,
x.shape,
_is_alias_of)
return ans
def garray_to_cudandarray(x):
""" take a gnumpy.garray and make a CudaNdarray that point to its memory
"""
if not isinstance(x, gnumpy.garray):
raise ValueError("We can transfer only gnumpy.garray to CudaNdarray")
# elif x.dtype != "float32":
# raise ValueError("CudaNdarray support only float32")
# We don't need this, because cudamat is always float32.
else:
strides = [1]
for i in x.shape[::-1][:-1]:
strides.append(strides[-1] * i)
strides = strides[::-1]
for i in range(len(strides)):
if x.shape[i] == 1:
strides[i] = 0
strides = tuple(strides)
import ctypes
ptr_long = long(ctypes.cast(x._base.mat.data_device, ctypes.c_void_p).value)
# seems legit.
z = cuda.from_gpu_pointer(ptr_long, x.shape, strides, x._base)
return z
except (ImportError, OSError):
gnumpy_available = False
""" """
Function to detect memory sharing for ndarray AND sparse type AND CudaNdarray. Function to detect memory sharing for ndarray AND sparse type AND GpuArray.
numpy version support only ndarray. numpy version support only ndarray.
""" """
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
...@@ -14,25 +14,12 @@ try: ...@@ -14,25 +14,12 @@ try:
def _is_sparse(a): def _is_sparse(a):
return scipy.sparse.issparse(a) return scipy.sparse.issparse(a)
except ImportError: except ImportError:
# scipy not imported, their can be only ndarray and cudandarray # scipy not imported, their can be only ndarray and gpuarray
def _is_sparse(a): def _is_sparse(a):
return False return False
from theano.sandbox import cuda
from theano import gpuarray from theano import gpuarray
if cuda.cuda_available:
from theano.sandbox.cuda.type import CudaNdarrayType
def _is_cuda(a):
return isinstance(a, cuda.CudaNdarray)
else:
def _is_cuda(a):
return False
__docformat__ = "restructuredtext en"
if gpuarray.pygpu: if gpuarray.pygpu:
def _is_gpua(a): def _is_gpua(a):
return isinstance(a, gpuarray.pygpu.gpuarray.GpuArray) return isinstance(a, gpuarray.pygpu.gpuarray.GpuArray)
...@@ -40,16 +27,14 @@ else: ...@@ -40,16 +27,14 @@ else:
def _is_gpua(a): def _is_gpua(a):
return False return False
__docformat__ = "restructuredtext en"
def may_share_memory(a, b, raise_other_type=True): def may_share_memory(a, b, raise_other_type=True):
a_ndarray = isinstance(a, np.ndarray) a_ndarray = isinstance(a, np.ndarray)
b_ndarray = isinstance(b, np.ndarray) b_ndarray = isinstance(b, np.ndarray)
if a_ndarray and b_ndarray: if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b) return TensorType.may_share_memory(a, b)
a_cuda = _is_cuda(a)
b_cuda = _is_cuda(b)
if a_cuda and b_cuda:
return CudaNdarrayType.may_share_memory(a, b)
a_gpua = _is_gpua(a) a_gpua = _is_gpua(a)
b_gpua = _is_gpua(b) b_gpua = _is_gpua(b)
if a_gpua and b_gpua: if a_gpua and b_gpua:
...@@ -57,13 +42,13 @@ def may_share_memory(a, b, raise_other_type=True): ...@@ -57,13 +42,13 @@ def may_share_memory(a, b, raise_other_type=True):
a_sparse = _is_sparse(a) a_sparse = _is_sparse(a)
b_sparse = _is_sparse(b) b_sparse = _is_sparse(b)
if (not(a_ndarray or a_sparse or a_cuda or a_gpua) or if (not(a_ndarray or a_sparse or a_gpua) or
not(b_ndarray or b_sparse or b_cuda or b_gpua)): not(b_ndarray or b_sparse or b_gpua)):
if raise_other_type: if raise_other_type:
raise TypeError("may_share_memory support only ndarray" raise TypeError("may_share_memory support only ndarray"
" and scipy.sparse, CudaNdarray or GpuArray type") " and scipy.sparse or GpuArray type")
return False return False
if a_cuda or b_cuda or a_gpua or b_gpua: if a_gpua or b_gpua:
return False return False
return SparseType.may_share_memory(a, b) return SparseType.may_share_memory(a, b)
...@@ -26,11 +26,6 @@ from theano import config ...@@ -26,11 +26,6 @@ from theano import config
from theano.compat import PY3 from theano.compat import PY3
from six import string_types from six import string_types
from theano.compile.sharedvalue import SharedVariable from theano.compile.sharedvalue import SharedVariable
try:
from theano.sandbox.cuda import cuda_ndarray
except ImportError:
cuda_ndarray = None
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
__authors__ = "Pascal Lamblin" __authors__ = "Pascal Lamblin"
...@@ -202,21 +197,28 @@ class PersistentNdarrayID(object): ...@@ -202,21 +197,28 @@ class PersistentNdarrayID(object):
return self.seen[id(obj)] return self.seen[id(obj)]
class PersistentCudaNdarrayID(PersistentNdarrayID): class PersistentGpuArrayID(PersistentNdarrayID):
def __call__(self, obj): def __call__(self, obj):
if (cuda_ndarray is not None and from theano.gpuarray.type import _name_for_ctx
type(obj) is cuda_ndarray.cuda_ndarray.CudaNdarray): try:
import pygpu
except ImportError:
pygpu = None
if (pygpu and
isinstance(obj, pygpu.gpuarray.GpuArray)):
if id(obj) not in self.seen: if id(obj) not in self.seen:
def write_array(f): def write_array(f):
pickle.dump(_name_for_ctx(obj.context), f, 2)
np.lib.format.write_array(f, np.asarray(obj)) np.lib.format.write_array(f, np.asarray(obj))
name = self._resolve_name(obj) name = self._resolve_name(obj)
zipadd(write_array, self.zip_file, name) zipadd(write_array, self.zip_file, name)
self.seen[id(obj)] = 'cuda_ndarray.{0}'.format(name) self.seen[id(obj)] = 'gpuarray.{0}'.format(name)
return self.seen[id(obj)] return self.seen[id(obj)]
return super(PersistentCudaNdarrayID, self).__call__(obj) return super(PersistentGpuArrayID, self).__call__(obj)
class PersistentSharedVariableID(PersistentCudaNdarrayID): class PersistentSharedVariableID(PersistentGpuArrayID):
"""Uses shared variable names when persisting to zip file. """Uses shared variable names when persisting to zip file.
If a shared variable has a name, this name is used as the name of the If a shared variable has a name, this name is used as the name of the
...@@ -282,26 +284,29 @@ class PersistentNdarrayLoad(object): ...@@ -282,26 +284,29 @@ class PersistentNdarrayLoad(object):
self.cache = {} self.cache = {}
def __call__(self, persid): def __call__(self, persid):
from theano.gpuarray.type import get_context
from theano.gpuarray import pygpu
array_type, name = persid.split('.') array_type, name = persid.split('.')
if name in self.cache: if name in self.cache:
return self.cache[name] return self.cache[name]
ret = None ret = None
array = np.lib.format.read_array(self.zip_file.open(name)) if array_type == 'gpuarray':
if array_type == 'cuda_ndarray': with self.zip_file.open(name) as f:
ctx_name = pickle.load(f)
array = np.lib.format.read_array(f)
if config.experimental.unpickle_gpu_on_cpu: if config.experimental.unpickle_gpu_on_cpu:
# directly return numpy array # directly return numpy array
warnings.warn("config.experimental.unpickle_gpu_on_cpu is set " warnings.warn("config.experimental.unpickle_gpu_on_cpu is set "
"to True. Unpickling CudaNdarray as " "to True. Unpickling GpuArray as numpy.ndarray")
"numpy.ndarray")
ret = array ret = array
elif cuda_ndarray: elif pygpu:
ret = cuda_ndarray.cuda_ndarray.CudaNdarray(array) ret = pygpu.array(array, context=get_context(ctx_name))
else: else:
raise ImportError("Cuda not found. Cannot unpickle " raise ImportError("pygpu not found. Cannot unpickle GpuArray")
"CudaNdarray")
else: else:
ret = array with self.zip_file.open(name) as f:
ret = np.lib.format.read_array(f)
self.cache[name] = ret self.cache[name] = ret
return ret return ret
......
差异被折叠。
# Initialize PyCUDA so that it shares the GPU (and, when possible, the
# CUDA context) already selected by Theano.
from __future__ import absolute_import, print_function, division
import os
import warnings

import theano
import theano.sandbox.cuda
from theano import config


def set_gpu_from_theano():
    """
    This set the GPU used by PyCUDA to the same as the one used by Theano.
    """
    # Transfer the theano gpu binding to pycuda, for consistency.
    # config.device / config.init_gpu_device look like "gpuN"; the digits
    # after "gpu" select the device for pycuda.autoinit via CUDA_DEVICE.
    if config.device.startswith("gpu") and len(config.device) > 3:
        os.environ["CUDA_DEVICE"] = theano.config.device[3:]
    elif (config.init_gpu_device.startswith("gpu") and
          len(config.init_gpu_device) > 3):
        os.environ["CUDA_DEVICE"] = theano.config.init_gpu_device[3:]


set_gpu_from_theano()

pycuda_available = False
# If theano.sandbox.cuda don't exist, it is because we are importing
# it and it try to import this file! This mean we must init the device.
if (not hasattr(theano.sandbox, 'cuda') or
        theano.sandbox.cuda.use.device_number is None):
    try:
        import pycuda
        import pycuda.autoinit
        pycuda_available = True
    except (ImportError, RuntimeError):
        # presumably, the user wanted to use pycuda, else they wouldn't
        # have imported this module, so issue a warning that the import
        # failed.
        warnings.warn("PyCUDA import failed in theano.misc.pycuda_init")
    except pycuda._driver.LogicError:
        # The chosen CUDA_DEVICE could not be initialized.
        # NOTE(review): the nesting of the retry below was reconstructed
        # from a whitespace-stripped source -- confirm against the
        # original file.
        if theano.config.force_device:
            raise
        else:
            if "CUDA_DEVICE" in os.environ:
                del os.environ["CUDA_DEVICE"]
            import pycuda.autoinit
            pycuda_available = True
else:
    # Theano already initialized the GPU: attach to its context instead
    # of creating a new one.
    try:
        import pycuda.driver
        pycuda_available = True
    except ImportError:
        pass
    if pycuda_available:
        if hasattr(pycuda.driver.Context, "attach"):
            pycuda.driver.Context.attach()
            import atexit
            # Make sure the attached context is popped at interpreter exit.
            atexit.register(pycuda.driver.Context.pop)
        else:
            # Now we always import this file when we call
            # theano.sandbox.cuda.use. So this should not happen
            # normally.
            # TODO: make this an error.
            warnings.warn("For some unknow reason, theano.misc.pycuda_init was"
                          " not imported before Theano initialized the GPU and"
                          " your PyCUDA version is 2011.2.2 or earlier."
                          " To fix the problem, import theano.misc.pycuda_init"
                          " manually before using/initializing the GPU, use the"
                          " Theano flag pycuda.init=True or use a"
                          " more recent version of PyCUDA.")
from __future__ import absolute_import, print_function, division
import pycuda.gpuarray
from theano.sandbox import cuda
if cuda.cuda_available is False:
raise ImportError('Optional theano package cuda disabled')
def to_gpuarray(x, copyif=False):
    """
    Take a CudaNdarray and return a pycuda.gpuarray.GPUArray that shares
    (or, optionally, copies) its device memory.

    :type x: CudaNdarray
    :param x: The array to transform to pycuda.gpuarray.GPUArray.
    :type copyif: bool
    :param copyif: If False, raise an error if x is not c contiguous.
                   If it is c contiguous, we return a GPUArray that shares
                   the same memory region as x.
                   If True, copy x if it is not c contiguous, so the return
                   won't share the same memory region.  If c contiguous,
                   the return will share the same memory region.
                   We need to do this as GPUArray don't fully support
                   strided memory.
    :return type: pycuda.gpuarray.GPUArray
    """
    if not isinstance(x, cuda.CudaNdarray):
        raise ValueError("We can transfer only CudaNdarray to pycuda.gpuarray.GPUArray")
    else:
        # Check if it is c contiguous: walk dims from last to first and
        # verify each stride (in elements) matches the running size.
        size = 1
        c_contiguous = True
        for i in range(x.ndim - 1, -1, -1):
            if x.shape[i] == 1:
                continue
            if x._strides[i] != size:
                c_contiguous = False
                break
            size *= x.shape[i]
        if not c_contiguous:
            if copyif:
                x = x.copy()
            else:
                raise ValueError("We were asked to not copy memory, but the memory is not c contiguous.")

        # Now x is always c contiguous.  base=x keeps the CudaNdarray
        # alive as long as the GPUArray view exists.
        px = pycuda.gpuarray.GPUArray(x.shape, x.dtype, base=x, gpudata=x.gpudata)
        return px
def to_cudandarray(x):
    """
    Take a pycuda.gpuarray.GPUArray and make a CudaNdarray that points
    to its memory.

    :note: CudaNdarray support only float32, so only float32 GPUArray
        are accepted
    """
    # Guard clauses: reject anything that is not a float32 GPUArray.
    if not isinstance(x, pycuda.gpuarray.GPUArray):
        raise ValueError("We can transfer only pycuda.gpuarray.GPUArray to CudaNdarray")
    if x.dtype != "float32":
        raise ValueError("CudaNdarray support only float32")

    # Rebuild C-contiguous strides (in elements): accumulate products of
    # the trailing dimensions, then put them back in forward order.
    elem_strides = [1]
    for dim in x.shape[:0:-1]:
        elem_strides.append(elem_strides[-1] * dim)
    strides = tuple(reversed(elem_strides))

    ptr = int(x.gpudata)  # in pycuda trunk, y.ptr also works, which is a little cleaner
    return cuda.from_gpu_pointer(ptr, x.shape, strides, x)
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
from theano.misc.cudamat_utils import cudamat_available
if not cudamat_available: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest("gnumpy not installed. Skip test of theano op with pycuda "
"code.")
from theano.misc.cudamat_utils import (cudandarray_to_cudamat,
cudamat_to_cudandarray)
def test(shape=(3, 4)):
    """
    Make sure that the cudamat conversion is exact.

    Compares three ways of computing ``A + 1``: via a Theano GPU
    function, via in-place CudaNdarray arithmetic, and via cudamat.
    """
    gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    U = gpu(theano.tensor.fmatrix('U'))
    ii = theano.function([U], gpu(U + 1))

    A_cpu = np.asarray(np.random.rand(*shape), dtype="float32")
    A_cnd = theano.sandbox.cuda.CudaNdarray(A_cpu)
    A_cmat = cudandarray_to_cudamat(A_cnd)

    # NOTE(review): this first B_cnd is immediately overwritten below; it
    # only exercises that the cudamat->CudaNdarray conversion runs.
    B_cnd = cudamat_to_cudandarray(A_cmat)
    B_cnd = ii(A_cnd)

    # u: reference A + 1 computed with in-place CudaNdarray arithmetic.
    u = A_cnd.copy()
    u += theano.sandbox.cuda.CudaNdarray(np.asarray([[1]], dtype='float32'))
    u = np.asarray(u)
    v = np.asarray(B_cnd)
    # w: A + 1 computed on the cudamat side (cudamat views the data
    # transposed, hence the .T below).
    w = A_cmat.add(1).asarray()

    assert abs(u - v).max() == 0
    assert abs(u - w.T.reshape(u.shape)).max() == 0
# Module-level guard: skip the whole test module when gnumpy is
# unavailable.
from __future__ import absolute_import, print_function, division
import numpy as np

import theano
from theano.misc.gnumpy_utils import gnumpy_available

if not gnumpy_available:  # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest("gnumpy not installed. Skip test related to it.")
from theano.misc.gnumpy_utils import (garray_to_cudandarray,
                                      cudandarray_to_garray)
import gnumpy
def test(shape=(3, 4, 5)):
    """
    Make sure that the gnumpy conversion is exact from garray to
    CudaNdarray back to garray.
    """
    gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    U = gpu(theano.tensor.ftensor3('U'))
    ii = theano.function([U], gpu(U + 1))

    A = gnumpy.rand(*shape)
    A_cnd = garray_to_cudandarray(A)
    assert A_cnd.shape == A.shape
    # dtype always float32
    # garray don't have strides

    B_cnd = ii(A_cnd)
    B = cudandarray_to_garray(B_cnd)
    # NOTE(review): this repeats the assertion above; it was probably
    # meant to check B_cnd.shape == B.shape.
    assert A_cnd.shape == A.shape

    # Compare A + 1 computed three ways: gnumpy, garray round-trip, and
    # the raw CudaNdarray result.
    u = (A + 1).asarray()
    v = B.asarray()
    w = np.array(B_cnd)
    assert (u == v).all()
    assert (u == w).all()
def test2(shape=(3, 4, 5)):
    """
    Make sure that the gnumpy conversion is exact from CudaNdarray to
    garray back to CudaNdarray.
    """
    gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    U = gpu(theano.tensor.ftensor3('U'))
    theano.function([U], gpu(U + 1))

    A = np.random.rand(*shape).astype('float32')
    A_cnd = theano.sandbox.cuda.CudaNdarray(A)
    A_gar = cudandarray_to_garray(A_cnd)
    assert A_cnd.shape == A_gar.shape
    # dtype always float32
    # garray don't have strides

    B = garray_to_cudandarray(A_gar)
    assert A_cnd.shape == B.shape
    # dtype always float32
    # The round-trip must point at the very same device memory with the
    # same layout, not a copy.
    assert A_cnd._strides == B._strides
    assert A_cnd.gpudata == B.gpudata

    v = np.asarray(B)
    assert (v == A).all()
def test_broadcast_dims():
    """
    Test with some dimensions being 1.

    CudaNdarray use 0 for strides for those dimensions.
    """
    shapes = ((1, 2, 3), (2, 1, 3), (2, 3, 1))
    # Same call order as before: all test() round-trips first, then all
    # test2() round-trips.
    for shp in shapes:
        test(shp)
    for shp in shapes:
        test2(shp)
""" """
test the tensor and sparse type. The CudaNdarray type is tested in test the tensor and sparse type. (gpuarray is tested in the gpuarray folder).
sandbox/cuda/tests/test_tensor_op.py.test_may_share_memory_cuda
""" """
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import numpy as np import numpy as np
...@@ -15,9 +14,7 @@ except ImportError: ...@@ -15,9 +14,7 @@ except ImportError:
from theano.misc.may_share_memory import may_share_memory from theano.misc.may_share_memory import may_share_memory
def test_may_share_memory(): def may_share_memory_core(a, b):
a = np.random.rand(5, 4)
b = np.random.rand(5, 4)
va = a.view() va = a.view()
vb = b.view() vb = b.view()
ra = a.reshape((4, 5)) ra = a.reshape((4, 5))
...@@ -51,6 +48,13 @@ def test_may_share_memory(): ...@@ -51,6 +48,13 @@ def test_may_share_memory():
except TypeError: except TypeError:
pass pass
def test_may_share_memory():
a = np.random.rand(5, 4)
b = np.random.rand(5, 4)
may_share_memory_core(a, b)
if scipy_imported: if scipy_imported:
def test_may_share_memory_scipy(): def test_may_share_memory_scipy():
a = scipy.sparse.csc_matrix(scipy.sparse.eye(5, 3)) a = scipy.sparse.csc_matrix(scipy.sparse.eye(5, 3))
......
...@@ -5,13 +5,9 @@ import unittest ...@@ -5,13 +5,9 @@ import unittest
from tempfile import mkdtemp from tempfile import mkdtemp
import numpy as np import numpy as np
from nose.plugins.skip import SkipTest
import theano import theano
import theano.sandbox.cuda as cuda_ndarray
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.var import CudaNdarraySharedVariable
from theano.sandbox.rng_mrg import MRG_RandomStreams from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.misc.pkl_utils import dump, load, StripPickler from theano.misc.pkl_utils import dump, load, StripPickler
...@@ -29,24 +25,8 @@ class T_dump_load(unittest.TestCase): ...@@ -29,24 +25,8 @@ class T_dump_load(unittest.TestCase):
if self.tmpdir is not None: if self.tmpdir is not None:
shutil.rmtree(self.tmpdir) shutil.rmtree(self.tmpdir)
def test_dump_load(self):
if not cuda_ndarray.cuda_enabled:
raise SkipTest('Optional package cuda disabled')
x = CudaNdarraySharedVariable('x', CudaNdarrayType((1, 1), name='x'),
[[1]], False)
with open('test', 'wb') as f:
dump(x, f)
with open('test', 'rb') as f:
x = load(f)
assert x.name == 'x'
np.testing.assert_allclose(x.get_value(), [[1]])
def test_dump_load_mrg(self): def test_dump_load_mrg(self):
rng = MRG_RandomStreams(use_cuda=cuda_ndarray.cuda_enabled) rng = MRG_RandomStreams()
with open('test', 'wb') as f: with open('test', 'wb') as f:
dump(rng, f) dump(rng, f)
......
# Module-level guards: skip the whole test module when pycuda or the
# cuda backend is unavailable.
from __future__ import absolute_import, print_function, division
import numpy as np

import theano
import theano.misc.pycuda_init

if not theano.misc.pycuda_init.pycuda_available:  # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest("Pycuda not installed. Skip test of theano op"
                   " with pycuda code.")

import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:  # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest('Optional package cuda disabled')

import theano.tensor as T

from theano.misc.pycuda_example import (PycudaElemwiseSourceModuleOp,
                                        PycudaElemwiseSourceModuleMakeThunkOp)

# Build GPU-enabled and GPU-free compilation modes.  When the configured
# mode is FAST_COMPILE, base them on FAST_RUN instead -- presumably
# FAST_COMPILE lacks the optimizations these tests need (confirm).
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode(
        'FAST_RUN').excluding('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_pycuda_elemwise_source_module():
    """
    Check that the pycuda elemwise ops (SourceModule and make_thunk
    variants) compute the same results as Theano's own GpuElemwise,
    over several shapes and for both mul and add.
    """
    for shape in [(5, 5), (10, 49), (50, 49), (500, 501)]:
        for op in [theano.scalar.basic.mul, theano.scalar.basic.add]:
            x = T.fmatrix('x')
            y = T.fmatrix('y')
            elemwise_op = theano.tensor.Elemwise(op)
            pycuda_op = PycudaElemwiseSourceModuleOp(op)
            pycuda_op_thunk = PycudaElemwiseSourceModuleMakeThunkOp(op)
            # f: reference graph compiled with Theano's GPU elemwise.
            f = theano.function([x, y], elemwise_op(x, y), mode=mode_with_gpu)
            # f2: pycuda SourceModule op inserted by hand.
            f2 = theano.function([x, y],
                                 theano.sandbox.cuda.host_from_gpu(
                                     pycuda_op(x, y)),
                                 mode=mode_with_gpu)
            # f3: pycuda op inserted by the local_pycuda_gpu_elemwise
            # optimization instead of by hand.
            mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise")
            f3 = theano.function([x, y], elemwise_op(x, y),
                                 mode=mode_pycuda)
            # f4: make_thunk variant inserted by hand.
            f4 = theano.function([x, y],
                                 theano.sandbox.cuda.host_from_gpu(
                                     pycuda_op_thunk(x, y)),
                                 mode=mode_with_gpu)

            # Make sure each compiled function really contains the
            # expected op in its graph.
            assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
                        for node in f.maker.fgraph.toposort()])
            assert any([isinstance(node.op, PycudaElemwiseSourceModuleOp)
                        for node in f2.maker.fgraph.toposort()])
            assert any([isinstance(node.op, PycudaElemwiseSourceModuleOp)
                        for node in f3.maker.fgraph.toposort()])
            assert any([isinstance(node.op,
                                   PycudaElemwiseSourceModuleMakeThunkOp)
                        for node in f4.maker.fgraph.toposort()])

            val1 = np.asarray(np.random.rand(*shape), dtype='float32')
            val2 = np.asarray(np.random.rand(*shape), dtype='float32')
            assert np.allclose(f(val1, val2), f2(val1, val2))
            assert np.allclose(f(val1, val2), f3(val1, val2))
            assert np.allclose(f(val1, val2), f4(val1, val2))
            # print f(val1,val2)
            # print f2(val1,val2)
"""
#commented as it work only with old pycuda version.
def test_pycuda_elemwise_kernel():
x = T.fmatrix('x')
y = T.fmatrix('y')
f = theano.function([x, y], x + y, mode=mode_with_gpu)
print(f.maker.fgraph.toposort())
mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel")
f2 = theano.function([x, y], x + y, mode=mode_pycuda)
print(f2.maker.fgraph.toposort())
assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
for node in f.maker.fgraph.toposort()])
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f2.maker.fgraph.toposort()])
val1 = np.asarray(np.random.rand(5, 5), dtype='float32')
val2 = np.asarray(np.random.rand(5, 5), dtype='float32')
#val1 = np.ones((5,5))
#val2 = np.arange(25).reshape(5,5)
assert (f(val1, val2) == f2(val1, val2)).all()
print(f(val1, val2))
print(f2(val1, val2))
x3 = T.ftensor3('x')
y3 = T.ftensor3('y')
z3 = T.ftensor3('y')
f4 = theano.function([x3, y3, z3], x3 * y3 + z3, mode=mode_pycuda)
print(f4.maker.fgraph.toposort())
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f4.maker.fgraph.toposort()])
val1 = np.random.rand(2, 2, 2)
print(val1)
print(f4(val1, val1, val1))
assert np.allclose(f4(val1, val1, val1), val1 * val1 + val1)
"""
"""
This file is an example of view the memory allocated by pycuda in a GpuArray
in a CudaNdarray to be able to use it in Theano.
This also serve as a test for the function: cuda_ndarray.from_gpu_pointer
"""
from __future__ import absolute_import, print_function, division
import sys
import numpy as np
import theano
import theano.sandbox.cuda as cuda_ndarray
import theano.misc.pycuda_init
if not theano.misc.pycuda_init.pycuda_available: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed."
" We skip tests of Theano Ops with pycuda code.")
if cuda_ndarray.cuda_available is False: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest('Optional theano package cuda disabled')
import pycuda
import pycuda.driver as drv
import pycuda.gpuarray
def test_pycuda_only():
    """Run pycuda only example to test that pycuda works."""
    from pycuda.compiler import SourceModule
    # Elementwise product kernel; one thread per element.
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
      const int i = threadIdx.x;
      dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = np.random.randn(100).astype(np.float32)
    b = np.random.randn(100).astype(np.float32)
    dest = np.zeros_like(a)
    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(400, 1, 1), grid=(1, 1))

    assert (dest == a * b).all()
def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object.

    Builds a CUDA kernel with pycuda and launches it directly on
    CudaNdarray inputs/outputs, then verifies the result on the host.
    """
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")
    multiply_them = mod.get_function("multiply_them")

    a = np.random.randn(100).astype(np.float32)
    b = np.random.randn(100).astype(np.float32)

    # Test with Theano object
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    # BUG FIX: launch exactly one thread per element.  block=(400, 1, 1)
    # made threads 100..399 read and write out of bounds of the
    # 100-element buffers.
    multiply_them(dest, ga, gb,
                  block=(a.shape[0], 1, 1), grid=(1, 1))
    assert (np.asarray(dest) == a * b).all()
def test_pycuda_memory_to_theano():
    """Check that GPU memory allocated by pycuda (a GpuArray) can be
    wrapped in a Theano CudaNdarray via ``cuda_ndarray.from_gpu_pointer``
    without copying, and that the pycuda object's reference count
    correctly tracks every CudaNdarray view created on top of it."""
    # Test that we can use the GpuArray memory space in pycuda in a CudaNdarray
    y = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
    print(sys.getrefcount(y))
    # This increase the ref count with newer pycuda. Do pycuda also
    # cache ndarray?
    # print y.get()
    initial_refcount = sys.getrefcount(y)
    print("gpuarray ref count before creating a CudaNdarray", end=' ')
    print(sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount
    rand = np.random.randn(*y.shape).astype(np.float32)
    cuda_rand = cuda_ndarray.CudaNdarray(rand)
    # Compute C-contiguous strides for y's shape, expressed in number of
    # elements (not bytes) -- the layout from_gpu_pointer expects.
    strides = [1]
    for i in y.shape[::-1][:-1]:
        strides.append(strides[-1] * i)
    strides = tuple(strides[::-1])
    print('strides', strides)
    # Sanity check: a freshly-created CudaNdarray of the same shape must
    # use exactly these strides.
    assert cuda_rand._strides == strides, (cuda_rand._strides, strides)
    # in pycuda trunk, y.ptr also works, which is a little cleaner
    y_ptr = int(y.gpudata)
    # Wrap y's raw device pointer; z holds a reference to y (as z.base),
    # so y's refcount must go up by exactly one.
    z = cuda_ndarray.from_gpu_pointer(y_ptr, y.shape, strides, y)
    print("gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount + 1
    assert (np.asarray(z) == 0).all()
    assert z.base is y
    # Test that we can take a view from this cuda view on pycuda memory
    zz = z.view()
    # The view keeps y (not z) as its base, adding one more reference.
    assert sys.getrefcount(y) == initial_refcount + 2
    assert zz.base is y
    del zz
    assert sys.getrefcount(y) == initial_refcount + 1
    # In-place ops on z must write into y's memory (broadcasted add of 1).
    cuda_ones = cuda_ndarray.CudaNdarray(np.asarray([[[1]]],
                                                    dtype='float32'))
    z += cuda_ones
    assert (np.asarray(z) == np.ones(y.shape)).all()
    assert (np.asarray(z) == 1).all()
    assert cuda_rand.shape == z.shape
    assert cuda_rand._strides == z._strides, (cuda_rand._strides, z._strides)
    assert (np.asarray(cuda_rand) == rand).all()
    z += cuda_rand
    assert (np.asarray(z) == (rand + 1)).all()
    # Check that the ref count to the gpuarray is right.
    del z
    print("gpuarray ref count after deleting the CudaNdarray", end=' ')
    print(sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount
from __future__ import absolute_import, print_function, division
import numpy as np
import theano.sandbox.cuda as cuda
import theano.misc.pycuda_init
if not theano.misc.pycuda_init.pycuda_available: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda "
"code.")
if cuda.cuda_available is False: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest('Optional theano package cuda disabled')
from theano.misc.pycuda_utils import to_gpuarray, to_cudandarray
import pycuda.gpuarray
def test_to_gpuarray():
    """Check to_gpuarray: a contiguous CudaNdarray shares device memory
    with the returned GPUArray; a strided one is copied when
    copyif=True and raises ValueError otherwise."""
    # Contiguous case: the GPUArray must alias the CudaNdarray's memory.
    cnda = cuda.CudaNdarray.zeros((5, 4))
    garr = to_gpuarray(cnda)
    assert isinstance(garr, pycuda.gpuarray.GPUArray)
    cnda[0, 0] = np.asarray(1, dtype="float32")
    # Check that they share the same memory space: the write through
    # cnda is visible through garr.
    assert garr.gpudata == cnda.gpudata
    assert np.asarray(cnda[0, 0]) == 1
    assert np.allclose(np.asarray(cnda), garr.get())
    assert garr.dtype == cnda.dtype
    assert garr.shape == cnda.shape
    # CudaNdarray strides are in elements, GPUArray strides in bytes.
    assert all(np.asarray(cnda._strides) * 4 == garr.strides)

    # Strided case with copyif=True: a fresh contiguous copy is made.
    cnda = cnda[::2, ::]
    garr = to_gpuarray(cnda, copyif=True)
    assert isinstance(garr, pycuda.gpuarray.GPUArray)
    cnda[0, 0] = np.asarray(2, dtype="float32")
    # Check that they do not share the same memory space: the later
    # write must not propagate into the copy.
    assert garr.gpudata != cnda.gpudata
    assert np.asarray(cnda[0, 0]) == 2
    assert not np.allclose(np.asarray(cnda), garr.get())
    assert garr.dtype == cnda.dtype
    assert garr.shape == cnda.shape
    assert not all(np.asarray(cnda._strides) * 4 == garr.strides)

    # Strided case without copyif: must refuse with ValueError.
    raised = False
    try:
        garr = to_gpuarray(cnda)
    except ValueError:
        raised = True
    assert raised
def test_to_cudandarray():
    """Check to_cudandarray: wraps a float32 GPUArray without copying,
    and rejects float64 GPUArrays and host ndarrays with ValueError."""
    garr = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
    cnda = to_cudandarray(garr)
    assert isinstance(cnda, cuda.CudaNdarray)
    assert np.allclose(garr.get(), np.asarray(cnda))
    assert garr.dtype == cnda.dtype
    assert garr.shape == cnda.shape
    # CudaNdarray strides are in elements, GPUArray strides in bytes.
    assert all(np.asarray(cnda._strides) * 4 == garr.strides)

    # float64 storage is not supported by CudaNdarray.
    rejected = False
    try:
        garr = pycuda.gpuarray.zeros((3, 4, 5), 'float64')
        to_cudandarray(garr)
    except ValueError:
        rejected = True
    assert rejected

    # Host numpy arrays are rejected outright.
    rejected = False
    try:
        to_cudandarray(np.zeros(4))
    except ValueError:
        rejected = True
    assert rejected
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论