提交 27146e2e authored 作者: ruslanagit's avatar ruslanagit

Merge remote-tracking branch 'refs/remotes/Theano/master'

......@@ -37,4 +37,5 @@ Theano.suo
.ipynb_checkpoints
.pydevproject
.ropeproject
core
\ No newline at end of file
core
.idea
#!/bin/bash
# Jenkins continuous-integration pre-testing script: run the quick
# checks (flake8 style and documentation) before the heavy test jobs.
# Any failing check aborts the job with a non-zero exit status.

# Trace every command for a readable CI log.
set -x

# Prefer the Anaconda (miniconda2) python.
export PATH="/usr/local/miniconda2/bin:${PATH}"

# Style check via the flake8 test wrapper.
echo "===== Testing flake8"
if ! bin/theano-nose theano/tests/test_flake8.py; then
    exit 1
fi

# Documentation: first the HTML build, then the embedded code snippets.
echo "===== Testing documentation build"
if ! python doc/scripts/docgen.py --nopdf --check; then
    exit 1
fi
echo "===== Testing documentation code snippets"
if ! python doc/scripts/docgen.py --test --check; then
    exit 1
fi
#!/bin/bash
# Jenkins continuous-integration script: run the Theano core test suite
# (everything except the cuda and gpuarray GPU backends).

# Trace every command for a readable CI log.
set -x

# Prefer the Anaconda (miniconda2) python.
export PATH="/usr/local/miniconda2/bin:${PATH}"

echo "===== Testing theano core"
# Select the whole 'theano' package, excluding the GPU backends, and
# have the timer plugin report the 10 slowest tests.
TEST_SELECTION="theano -e cuda -e gpuarray"
NOSE_ARGS="${TEST_SELECTION} --with-timer --timer-top-n 10"
THEANO_FLAGS="mode=FAST_RUN,floatX=float32" bin/theano-nose ${NOSE_ARGS}
#!/bin/bash
# Script for Jenkins continuous integration testing of gpu backends.
# Part 1 tests the legacy theano.sandbox.cuda backend; part 2 builds
# libgpuarray/pygpu from source and tests the newer gpuarray backend.

# Print commands as they are executed (easier to follow in the CI log)
set -x

# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH

# CUDA toolchain: nvcc on PATH, CUDA libraries visible at both
# link time (LIBRARY_PATH) and run time (LD_LIBRARY_PATH)
export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH

echo "===== Testing old theano.sandbox.cuda backend"
# Test selection for the old backend: the cuda test package plus the
# GPU-dependent tests scattered through the rest of the code base.
THEANO_CUDA_TESTS="theano/sandbox/cuda/tests \
theano/misc/tests/test_pycuda_example.py \
theano/misc/tests/test_pycuda_theano_simple.py \
theano/misc/tests/test_pycuda_utils.py \
theano/tensor/tests/test_opt.py:TestCompositeCodegen \
theano/tensor/tests/test_opt.py:test_shapeoptimizer \
theano/tensor/tests/test_opt.py:test_fusion \
theano/compile/tests/test_debugmode.py:Test_preallocated_output \
theano/sparse/tests/test_basic.py:DotTests \
theano/sandbox/tests/test_multinomial.py:test_gpu_opt \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPU_serial \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPU_parallel \
theano/sandbox/tests/test_rng_mrg.py:test_GPU_nstreams_limit \
theano/sandbox/tests/test_rng_mrg.py:test_overflow_gpu_old_backend \
theano/scan_module/tests/test_scan.py:T_Scan_Cuda"
THEANO_PARAM="${THEANO_CUDA_TESTS} --with-timer --timer-top-n 10"
# init_gpu_device=gpu activates the old backend for every test
FLAGS="mode=FAST_RUN,init_gpu_device=gpu,floatX=float32"
THEANO_FLAGS=${FLAGS} bin/theano-nose ${THEANO_PARAM}

echo "===== Testing gpuarray backend"
GPUARRAY_CONFIG="Release"
DEVICE=cuda0
# Scratch prefix where libgpuarray/pygpu get installed for this run
LIBDIR=~/tmp/local

# Make fresh clones of libgpuarray (with no history since we don't need it)
rm -rf libgpuarray
git clone --depth 1 "https://github.com/Theano/libgpuarray.git"

# Clean up previous installs (to make sure no old files are left)
rm -rf $LIBDIR
mkdir $LIBDIR

# Build libgpuarray
mkdir libgpuarray/build
(cd libgpuarray/build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX=$LIBDIR && make)

# Finally install
(cd libgpuarray/build && make install)

# Export paths so the freshly built library is found at compile time
# (CPATH/LIBRARY_PATH) and at run time (LD_LIBRARY_PATH); both lib and
# lib64 are listed since the install layout varies by platform
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib64/
export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib64/
export CPATH=$CPATH:$LIBDIR/include
export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib

# Build the pygpu modules
(cd libgpuarray && python setup.py build_ext --inplace -I$LIBDIR/include -L$LIBDIR/lib)
ls $LIBDIR
# --home installs put python modules under $LIBDIR/lib/python; make the
# directory and put it on PYTHONPATH so the tests can import pygpu
mkdir $LIBDIR/lib/python
export PYTHONPATH=${PYTHONPATH}:$LIBDIR/lib/python
# Then install
(cd libgpuarray && python setup.py install --home=$LIBDIR)

# Testing theano (the gpuarray parts)
THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_serial \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_parallel \
theano/scan_module/tests/test_scan.py:T_Scan_Gpuarray"
FLAGS="init_gpu_device=$DEVICE,gpuarray.preallocate=1000,mode=FAST_RUN"
THEANO_FLAGS=${FLAGS} time nosetests -v ${THEANO_GPUARRAY_TESTS}
......@@ -30,10 +30,14 @@
function build_vswitch() {
// Build HTML string for version selector, based on ReadTheDocs theme's versions.html
var vlabel = current_version.replace("theano_versions/", "");
if (vlabel == 'theano') {
vlabel = 'release';
}
var vswitch = ['<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions" align=left>'];
vswitch.push('<span class="rst-current-version" data-toggle="rst-current-version">');
vswitch.push('<span class="fa fa-book"></span>');
vswitch.push('v: ', current_version.replace("theano_versions/", ""), ' ');
vswitch.push('v: ', vlabel, ' ');
vswitch.push('<span class="fa fa-caret-down"></span>');
vswitch.push('</span>');
......
.. _css:
.. raw:: html
<style> .black {color:black} </style>
<style> .blue {color:blue} </style>
<style> .red {color:red} </style>
<style> .green {color:green} </style>
<style> .pink {color:pink} </style>
.. role:: blue
.. role:: red
.. role:: green
.. role:: pink
.. role:: black
......@@ -10,14 +10,14 @@ Contributing
You want to contribute to Theano? That is great! This page explains our
workflow and some resources for doing so.
Looking for an idea for a first contribution? Check `github issue
<https://github.com/Theano/Theano/issues?q=is%3Aopen+is%3Aissue+label%3A%22Easy+fix%22>`
Looking for an idea for a first contribution? Check the `github issues
<https://github.com/Theano/Theano/issues?q=is%3Aopen+is%3Aissue+label%3A%22Easy+fix%22>`_
with a label ``easy fix``. They are good starters. It is recommended
that you comment on the issue you want to work on. This helps make
sure it is up to date and lets us check that nobody else is working on it. Also,
we can sometimes provide more information about it. There is also
the label `NeedSomeoneToFinish
<https://github.com/Theano/Theano/labels/NeedSomeoneToFinish>` that is
<https://github.com/Theano/Theano/labels/NeedSomeoneToFinish>`_ that is
interesting to check. The difficulty level is variable.
Resources
......@@ -85,8 +85,8 @@ make sure there are no global impacts.
Also, if you are changing GPU code, travis doesn't test that, because
there are no GPUs on the test nodes.
To run the test suite with the default options, you can follow the
instructions of :ref:`testing_installation`.
To run the test suite with the default options, see
:ref:`test_theano`.
Each night we execute all the unit tests automatically, with several
sets of options. The result is sent by email to the `theano-buildbot`_
......@@ -126,7 +126,11 @@ To setup VIM:
#. Install flake8 (if not already installed) with::
pip install flake8
pip install "flake8<3"
.. warning:: Starting with version 3.0.0, flake8 changed its dependencies and
moved its Python API to a legacy module, breaking Theano's flake8 tests.
We recommend using a version prior to 3.
.. note:: You can use ``easy_install`` instead of ``pip``, and ``pep8``
instead of ``flake8`` if you prefer. The important thing is that the
......@@ -150,6 +154,8 @@ To setup VIM:
Plugin 'scrooloose/syntastic'
Plugin 'jimf/vim-pep8-text-width'
call vundle#end()
" Syntastic settings
" You can run checkers explicitly by calling :SyntasticCheck <checker
let g:syntastic_python_checkers = ['flake8'] "use one of the following checkers:
......@@ -360,7 +366,7 @@ You can choose another name than "central" to reference Theano/Theano
to "central."
You can then test your installation of Theano by following the steps of
:ref:`testing_installation`.
:ref:`test_theano`.
Using your local copy
......
......@@ -872,9 +872,9 @@ To understand this profile here is some explanation of how optimizations work:
0.131s for callback
time - (name, class, index) - validate time
Then it will print, with some additional indentation, each sub-optimizer's profile
information. These sub-profiles are ordered by the time they took to execute,
not by their execution order.
Then it will print, with some additional indentation, each sub-optimizer's profile
information. These sub-profiles are ordered by the time they took to execute,
not by their execution order.
* ``OPT_FAST_RUN`` is the name of the optimizer
* 1.152s is the total time spent in that optimizer
......
......@@ -10,21 +10,6 @@ Does Theano support Python 3?
------------------------------
We support both Python 2 >= 2.6 and Python 3 >= 3.3.
TypeError: object of type 'TensorVariable' has no len()
-------------------------------------------------------
If you receive the following error, it is because the Python function *__len__* cannot
be implemented on Theano variables:
.. code-block:: python
TypeError: object of type 'TensorVariable' has no len()
Python requires that *__len__* returns an integer, yet it cannot be done as Theano's variables are symbolic. However, `var.shape[0]` can be used as a workaround.
This error message cannot be made more explicit because the relevant aspects of Python's
internals cannot be modified.
Output slight numerical difference
----------------------------------
......@@ -39,7 +24,6 @@ Every Computer Scientist Should Know About Floating-Point Arithmetic
<https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html>`_.
Faster gcc optimization
-----------------------
......@@ -179,33 +163,6 @@ but requires that all nodes in the graph have a C implementation:
f(10.)
Out of memory... but not really
-------------------------------
Occasionally Theano may fail to allocate memory when there appears to be more
than enough reporting:
Error allocating X bytes of device memory (out of memory). Driver report Y
bytes free and Z total.
where X is far less than Y and Z (i.e. X << Y < Z).
This scenario arises when an operation requires allocation of a large contiguous
block of memory but no blocks of sufficient size are available.
GPUs do not have virtual memory and as such all allocations must be assigned to
a contiguous memory region. CPUs do not have this limitation because of their
support for virtual memory. Multiple allocations on a GPU can result in memory
fragmentation, which can make it more difficult to find contiguous regions
of memory of sufficient size during subsequent memory allocations.
A known example is related to writing data to shared variables. When updating a
shared variable Theano will allocate new space if the size of the data does not
match the size of the space already assigned to the variable. This can lead to
memory fragmentation, which means that a contiguous block of memory of
sufficient capacity may not be available even if the free memory overall is
large enough.
Related Projects
----------------
......@@ -226,55 +183,3 @@ Here is a list of some of the known limitations:
interact with the rest of the graph).
- Neither *goto* nor *recursion* is supported or planned within expression graphs.
"float32 / int{32, 64} gives float64"
-------------------------------------
It should be noted that using float32 and int{32, 64} together
inside a function would provide float64 as output.
Since the GPU can't compute this kind of output, it would be
preferable not to use those dtypes together.
To help you find where float64 are created, see the
:attr:`warn_float64` Theano flag.
Theano memory/speed trade-off
-----------------------------
There are a few things you can easily do to change the trade-off
between speed and memory usage. If nothing is said, this affects
both CPU and GPU memory usage.
Could speed up and lower memory usage:
- :ref:`cuDNN <libdoc_cuda_dnn>`: the default cuDNN convolution uses less
memory than the Theano version. But some flags allow it to use more
memory. GPU only.
- Shortly avail, multi-GPU.
Could raise memory usage but speed up computation:
- :attr:`config.lib.cnmem` =1 # Do not raise much memory usage, but if you are at the limit of GPU memory available. GPU only.
- :attr:`config.allow_gc` =False
- :attr:`config.optimizer_excluding` =low_memory , GPU only for now.
Could lower the memory usage, but raise computation time:
- :attr:`config.scan.allow_gc` =True # Probably not significant slowdown if config.lib.cnmem is used.
- :attr:`config.scan.allow_output_prealloc` =False
- Use :func:`batch_normalization()
<theano.tensor.nnet.bn.batch_normalization>`. It uses less memory
than building a corresponding Theano graph.
- Disable one or more scan optimizations:
- ``optimizer_excluding=scanOp_pushout_seqs_ops``
- ``optimizer_excluding=scan_pushout_dot1``
- ``optimizer_excluding=scanOp_pushout_output``
- Disable all optimization tagged as raising memory usage:
``optimizer_excluding=more_mem`` (currently only the 3 scan optimizations above)
- `float16 <https://github.com/Theano/Theano/issues/2908>`_.
If you want to analyze the memory usage during computation, the
simplest is to let the memory error happen during Theano execution and
use the Theano flags :attr:`exception_verbosity=high`.
......@@ -17,17 +17,18 @@ shapes = [
('col', (False, True)),
('matrix', (False,False)),
('tensor3', (False,False,False)),
('tensor4', (False,False,False,False)),]
('tensor4', (False,False,False,False)),
('tensor5', (False,False,False,False,False)),]
hdr = '============ =========== ==== =========== ================================='
hdr = '============ =========== ==== ============ ==================================='
print(hdr)
print('Constructor dtype ndim shape broadcastable')
print('Constructor dtype ndim shape broadcastable')
print(hdr)
for letter in letters:
for shape in shapes:
suff = ',)' if len(shape[1])==1 else ')'
s = '(' + ','.join('1' if b else '?' for b in shape[1]) + suff
print('%s%-10s %-10s %-4s %-10s %-20s' %(
print('%s%-10s %-10s %-4s %-11s %-20s' %(
letter[0], shape[0], letter[1], len(shape[1]), s, shape[1]
))
print(hdr)
......@@ -125,6 +125,7 @@ Roughly in order of what you'll want to check out:
* :ref:`install` -- How to install Theano.
* :ref:`introduction` -- What is Theano?
* :ref:`tutorial` -- Learn the basics.
* :ref:`troubleshooting` -- Tips and tricks for common debugging.
* :ref:`libdoc` -- Theano's functionality, module by module.
* :ref:`faq` -- A set of commonly asked questions.
* :ref:`optimizations` -- Guide to Theano's graph optimizations.
......@@ -237,12 +238,15 @@ StackOverflow, follow their guidance for `answering questions <http://stackoverf
NEWS
introduction
requirements
install
updating
tutorial/index
extending/index
dev_start_guide
optimizations
library/index
troubleshooting
glossary
links
internal/index
......
差异被折叠。
:orphan:
.. include:: css.inc
.. _install_centos6:
CentOS 6 Installation Instructions
##################################
Easy Installation of an optimized Theano on CentOS 6
====================================================
.. warning::
If you want to install the bleeding-edge or development version of Theano
from GitHub, please make sure you are reading `the latest version of this
page <http://deeplearning.net/software/theano_versions/dev/install_centos6.html>`_.
.. note::
.. include:: requirements.txt
It is possible to have a faster installation of Theano than the one these
instructions will provide, but this will make the installation more
complicated and/or may require that you buy software. This is a simple set
of installation instructions that will leave you with a relatively
well-optimized version that uses only free software. With more work or by
investing money (i.e. buying a license to a proprietary BLAS
implementation), it is possible to gain further performance.
.. include:: install_generic.inc
:start-line: 5
.. note::
If you are behind a proxy, you must do some extra configuration steps
before starting the installation. You must set the environment
variable ``http_proxy`` to the proxy address. Using bash this is
accomplished with the command
``export http_proxy="http://user:pass@my.site:port/"``
You can also provide the ``--proxy=[user:pass@]url:port`` parameter
to pip. The ``[user:pass@]`` portion is optional.
.. note::
We use ``pip`` for 2 reasons. First, it allows "``import module;
module.test()``" to work correctly. Second, the installation of NumPy
1.6 or 1.6.1 with ``easy_install`` raises an ImportError at the end of
the installation. To my knowledge we can ignore this error, but
this is not completely safe. ``easy_install`` with NumPy 1.5.1 does not
raise this error.
Installation steps
~~~~~~~~~~~~~~~~~~
1) ``sudo yum install python-devel python-nose python-setuptools gcc
gcc-gfortran gcc-c++ blas-devel lapack-devel atlas-devel``
2) ``sudo easy_install pip``
3) ``sudo pip install numpy==1.6.1``
4) ``sudo pip install scipy==0.10.1``
5) ``sudo pip install Theano``
Test the newly installed packages
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1) NumPy (~30s): ``python -c "import numpy; numpy.test()"``
2) SciPy (~1m): ``python -c "import scipy; scipy.test()"``
3) Theano (~30m): ``python -c "import theano; theano.test()"``
Speed test Theano/BLAS
~~~~~~~~~~~~~~~~~~~~~~
It is recommended to test your Theano/BLAS integration. There are many versions
of BLAS that exist and there can be up to 10x speed difference between them.
Also, having Theano link directly against BLAS instead of using NumPy/SciPy as
an intermediate layer reduces the computational overhead. This is
important for BLAS calls to ``ger``, ``gemv`` and small ``gemm`` operations
(automatically called when needed when you use ``dot()``). To run the
Theano/BLAS speed test:
Requirements through System Packages (not recommended)
------------------------------------------------------
.. code-block:: bash
python /usr/lib/python2.*/site-packages/theano/misc/check_blas.py
This will print a table with different versions of BLAS/numbers of
threads on multiple CPUs and GPUs. It will also print some Theano/NumPy
configuration information. Then, it will print the running time of the same
benchmarks for your installation. Try to find a CPU similar to yours in
the table, and check that the single-threaded timings are roughly the same.
Updating Theano
~~~~~~~~~~~~~~~
If you followed these installation instructions, you can execute this command
to update only Theano:
.. code-block:: bash
sudo pip install --upgrade --no-deps theano
If you want to also update NumPy/SciPy, you can run this:
.. code-block:: bash
sudo pip install --upgrade theano
Bleeding edge
~~~~~~~~~~~~~
Do like in the section "Updating Theano", but use
``git+git://github.com/Theano/Theano.git`` instead of ``theano``.
sudo yum install python-devel python-nose python-setuptools gcc gcc-gfortran gcc-c++ blas-devel lapack-devel atlas-devel
sudo easy_install pip
.. include:: css.inc
.. _install_generic:
Installation
============
Stable Installation
-------------------
Install the latest stable version of Theano with:
.. raw:: html
<div class="highlight"><pre><span class="red">&lt;sudo&gt;</span> pip install <span class="blue">&lt;--user&gt;</span> Theano[test, doc]</pre></div>
- Any argument between <...> is optional.
- Use :red:`sudo` for a root installation.
- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
- [test] will install the requirements for testing.
- [doc] will install the requirements in order to generate the documentation.
If you encountered any trouble, head to the :ref:`troubleshooting` page.
libgpuarray
^^^^^^^^^^^
For the stable version of Theano you need a specific version of libgpuarray,
that has been tagged ``v-9998``.
Download it with:
.. raw:: html
<div class='highlight'><pre>
git clone https://github.com/Theano/libgpuarray.git --tags
git checkout origin/v-9998
cd libgpuarray
</pre></div>
and then follow the `Step-by-step instructions <http://deeplearning.net/software/libgpuarray/installation.html#step-by-step-install>`__.
Bleeding-Edge Installation (recommended)
----------------------------------------
Install the latest, bleeding-edge, development version of Theano with:
.. raw:: html
<div class='highlight'><pre><span class="red">&lt;sudo&gt;</span> pip install <span class="blue">&lt;--user&gt;</span> <span class="pink">&lt;--no-deps&gt;</span> git+https://github.com/Theano/Theano.git#egg=Theano</pre></div>
- Any argument between <...> is optional.
- Use :red:`sudo` for a root installation.
- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
- Use :pink:`no-deps` when you don't want the dependencies of Theano to be installed through pip. This is important when they have already been installed as system packages.
If you encountered any trouble, head to the :ref:`troubleshooting` page.
libgpuarray
^^^^^^^^^^^
Install the latest, development version of libgpuarray following the
`Step-by-step instructions <http://deeplearning.net/software/libgpuarray/installation.html#step-by-step-install>`__.
Developer Installation
----------------------
Install the developer version of Theano with:
.. raw:: html
<div class="highlight"><pre>git clone git://github.com/Theano/Theano.git
cd Theano
<span class="red">&lt;sudo&gt;</span> pip install <span class="blue">&lt;--user&gt;</span> <span class="pink">&lt;--no-deps&gt;</span> <span class="green">-e .</span></pre></div>
- Any argument between <...> is optional.
- Use :red:`sudo` for a root installation.
- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
- Use :pink:`no-deps` when you don't want the dependencies of Theano to be installed through pip. This is important when they have already been installed as system packages.
- :green:`-e` makes your installation *editable*, i.e., it links it to your
source directory.
If you encountered any trouble, head to the :ref:`troubleshooting` page.
libgpuarray
^^^^^^^^^^^
Install the latest, development version of libgpuarray following the
`Step-by-step instructions <http://deeplearning.net/software/libgpuarray/installation.html#step-by-step-install>`__.
.. include:: css.inc
.. _install_macos:
Mac OS Installation Instructions
################################
.. warning::
If you want to install the bleeding-edge or development version of Theano
from GitHub, please make sure you are reading `the latest version of this
page <http://deeplearning.net/software/theano_versions/dev/install_macos.html>`_.
There are various ways to install Theano dependencies on a Mac. Here
we describe the process in detail with Anaconda, Homebrew or MacPorts
but if you did it differently and it worked, please let us know the
details on the `theano-users`_ mailing-list, so that we can add
alternative instructions here.
.. _theano-users: http://groups.google.com/group/theano-users?pli=1
.. include:: requirements.txt
.. _gpu_macos:
.. attention::
For MacOS you should be able to follow the above instructions to
setup CUDA, but be aware of the following caveats:
* If you want to compile the CUDA SDK code, you may need to temporarily
revert back to Apple's gcc (``sudo port select gcc``) as their Makefiles
are not compatible with MacPort's gcc.
* If CUDA seems unable to find a CUDA-capable GPU, you may need to manually
toggle your GPU on, which can be done with
`gfxCardStatus <http://codykrieger.com/gfxCardStatus>`__.
.. attention::
Theano officially supports only clang on OS X. This can be installed
by getting XCode from the App Store and running it once to install the
command-line tools.
.. include:: install_generic.inc
:start-line: 5
Requirements through Homebrew (not recommended)
-----------------------------------------------
Install python with homebrew:
.. code-block:: bash
$ brew install python # or python3 if you prefer
This will install pip. Then use pip to install numpy, scipy:
.. code-block:: bash
$ pip install numpy scipy
If you want to use openblas instead of Accelerate, you have to install
numpy and scipy with homebrew:
.. code-block:: bash
$ brew tap homebrew/python
$ brew install numpy --with-openblas
$ brew install scipy --with-openblas
Requirements through MacPorts (not recommended)
-----------------------------------------------
Using `MacPorts <http://www.macports.org/>`__ to install all required
Theano dependencies is easy, but be aware that it will take a long time
(a few hours) to build and install everything.
- MacPorts requires installing XCode first (which can be found in the
Mac App Store), if you do not have it already.
If you can't install it from the App Store, look in your MacOS X installation
DVD for an old version. Then update your Mac to update XCode.
- Download and install `MacPorts <http://www.macports.org/>`__, then
ensure its package list is up-to-date with ``sudo port selfupdate``.
- Then, in order to install one or more of the required libraries, use
``port install``, e.g. as follows:
.. code-block:: bash
$ sudo port install py27-numpy +atlas py27-scipy +atlas py27-pip
This will install all the required Theano dependencies. gcc will
be automatically installed (since it is a SciPy dependency), but be
aware that it takes a long time to compile (hours)!
Having NumPy and SciPy linked with ATLAS (an optimized BLAS
implementation) is not mandatory, but recommended if you care about
performance.
- You might have some different versions of gcc, SciPy, NumPy, Python installed
on your system, perhaps via Xcode. It is a good idea to use **either** the
MacPorts version of everything **or** some other set of compatible versions
(e.g. provided by Xcode or Fink). The advantages of MacPorts are the
transparency with which everything can be installed and the fact that
packages are updated quite frequently. The following steps describe how to
make sure you are using the MacPorts version of these packages.
- In order to use the MacPorts version of Python, you will probably
need to explicitly select it with ``sudo port select python python27``. The
reason this is necessary is because you may have an Apple-provided Python
(via, for example, an Xcode installation). After performing this step, you
should check that the symbolic link provided by ``which python`` points to
the MacPorts python. For instance, on MacOS X Lion with MacPorts 2.0.3,
the output of ``which python`` is ``/opt/local/bin/python`` and this symbolic
link points to ``/opt/local/bin/python2.7``. When executing ``sudo
port select python python27-apple`` (which you should **not** do), the link
points to ``/usr/bin/python2.7``.
- Similarly, make sure that you are using the MacPorts-provided gcc:
use ``sudo port select gcc`` to see which gcc installs you have on the
system. Then execute for instance ``sudo port select gcc mp-gcc44``
to create a symlink that points to the correct (MacPorts) gcc (version 4.4
in this case).
- At this point, if you have not done so already, it may be a good idea to
close and restart your terminal, to make sure all configuration changes
are properly taken into account.
- Afterwards, please check that the ``scipy`` module that is imported in
Python is the right one (and is a recent one). For instance, ``import
scipy`` followed by ``print scipy.__version__`` and ``print scipy.__path__``
should result in a version number of at least 0.7.0 and a path that starts
with ``/opt/local`` (the path where MacPorts installs its packages). If this
is not the case, then you might have some old installation of ``scipy`` in your
``PYTHONPATH`` so you should edit ``PYTHONPATH`` accordingly.
- Please follow the same procedure with ``numpy``.
- This is covered in the MacPorts installation process, but make sure that
your ``PATH`` environment variable contains ``/opt/local/bin`` and
``/opt/local/sbin`` before any other paths (to ensure that the Python and
gcc binaries that you installed with MacPorts are visible first).
- MacPorts does not create automatically ``nosetests`` and ``pip`` symlinks
pointing to the MacPorts version, so you can add them yourself with
.. code-block:: bash
$ sudo ln -s /opt/local/bin/nosetests-2.7 /opt/local/bin/nosetests
$ sudo ln -s /opt/local/bin/pip-2.7 /opt/local/bin/pip
.. _install_others:
Other Platform-specific Installations
=====================================
.. warning::
These instructions are not kept up to date.
NVIDIA Jetson TX1 embedded platform
-----------------------------------
.. code-block:: bash
sudo apt-get install python-numpy python-scipy python-dev python-pip python-nose g++ libblas-dev git
pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git --user # Need Theano 0.8 or more recent
Gentoo
------
Brian Vandenberg emailed `installation instructions on Gentoo
<http://groups.google.com/d/msg/theano-dev/-8WCMn2FMR0/bJPasoZXaqoJ>`_,
focusing on how to install the appropriate dependencies.
Nicolas Pinto provides `ebuild scripts <https://github.com/npinto/sekyfsr-gentoo-overlay/tree/master/sci-libs/Theano>`_.
Docker images
-------------
Builds of Theano are available as `Docker <https://www.docker.com/whatisdocker>`_ images:
`Theano Docker (CPU) <https://hub.docker.com/r/kaixhin/theano/>`_ or `Theano Docker (CUDA) <https://hub.docker.com/r/kaixhin/cuda-theano/>`_.
These are updated on a weekly basis with bleeding-edge builds of Theano. Examples of running bash in a Docker container
are as follows:
.. code-block:: bash
sudo docker run -it kaixhin/theano
sudo docker run -it --device /dev/nvidiactl --device /dev/nvidia-uvm --device /dev/nvidia0 kaixhin/cuda-theano:7.0
For a guide to Docker, see the `official docs <https://docs.docker.com/userguide/>`_. For more details on how to use the
Theano Docker images, including requirements for CUDA support, consult the `source project <https://github.com/Kaixhin/dockerfiles>`_.
差异被折叠。
:orphan:
.. _install_windows:
Windows Installation Instructions
=================================
.. warning::
If you want to install the bleeding-edge or development version of Theano
from GitHub, please make sure you are reading `the latest version of this
page <http://deeplearning.net/software/theano_versions/dev/install_windows.html>`_.
.. warning::
Installation of Theano on Windows
==================================
Theano is mainly developed and tested on Linux machines.
These instructions show step-by-step how to install Theano and
required dependencies on a 32- or 64-bit system using freely available
......@@ -26,6 +32,8 @@ C/C++ (for Python 2.7 family this has to be Microsoft Visual Studio
version supporting Visual Studio 2008), and GCC (for non-CUDA C code
generated by Theano).
.. _gpu_windows:
Visual Studio and CUDA
######################
......@@ -37,7 +45,6 @@ Studio installation to proceed). Afterwards, the Visual Studio 2010
can be safely removed. If someone knows how to install CUDA 5.5
without a proper Visual Studio installation, please let us know.
First we need to install Microsoft Visual Studio 2010 Express, which
is required to install CUDA. You can download it from
`Visual Studio Express
......@@ -79,7 +86,7 @@ The package will be installed to ``C:\Program Files
(x86)\Common Files\Microsoft\Visual C++ for Python\9.0``.
Finally download the ``stdint.h`` header from
`here <http://msinttypes.googlecode.com/svn/trunk/stdint.h>`_ and save it as
`here <https://sourceforge.net/p/mspgcc/msp430-libc/ci/master/tree/include/stdint.h>`_ and save it as
``C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for
Python\9.0\VC\include\stdint.h``.
......@@ -619,6 +626,3 @@ follows:
dependencies. In the case where it is a dependency, you can use the
`Dependency Walker <http://www.dependencywalker.com/>`__ utility to figure out
which one.
.. _gpu_windows:
......@@ -174,7 +174,8 @@ Reference
list is not used in the graph. Possible values are 'raise',
'warn', and 'ignore'.
:rtype: Function instance
:rtype: :class:`Function <theano.compile.function_module.Function>`
instance
:returns: a callable object that will compute the outputs (given the inputs)
and update the implicit function arguments according to the `updates`.
......
......@@ -487,6 +487,21 @@ import theano and print the config variable, as in:
automatically to get more memory. But this can cause
fragmentation, see note above.
.. attribute:: config.gpuarray.sched
String value: ``'default'``, ``'multi'``, ``'single'``
Default: ``'default'``
Control the stream mode of contexts.
The sched parameter passed for context creation to pygpu. With
CUDA, using "multi" means using the parameter
cudaDeviceScheduleYield. This is useful to lower the CPU overhead
when waiting for the GPU. One user found that it sped up his other
processes that were doing data augmentation.
.. attribute:: config.gpuarray.single_stream
Boolean value
......
......@@ -61,7 +61,7 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
usage
* ``none`` : use a slower implementation with minimal memory usage
* ``large`` : use a sometimes faster implementation with large memory usage
* ``fft`` : use the Fast Fourrier Transform implementation of convolution
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to cuDNN's heuristics and reused
......@@ -83,7 +83,7 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
* ``none`` (default) : use the default non-deterministic convolution
implementation
* ``deterministic`` : use a slower but deterministic implementation
* ``fft`` : use the Fast Fourrier Transform implementation of convolution
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to cuDNN's heuristics and reused
......
......@@ -38,7 +38,7 @@ There are also some top-level imports that you might find more convenient:
.. function:: function(...)
Alias for :func:`function.function`
Alias for :func:`theano.compile.function.function`
.. function:: function_dump(...)
......
......@@ -64,9 +64,9 @@ get an error when cuDNN can not be used with them, use this flag:
usage
* ``none`` : use a slower implementation with minimal memory usage
* ``large`` : use a sometimes faster implementation with large memory usage
* ``fft`` : use the Fast Fourrier Transform implementation of convolution
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``fft_tiling`` : use the Fast Fourrier Transform implementation of convolution
* ``fft_tiling`` : use the Fast Fourier Transform implementation of convolution
with tiling (high memory usage, but less than fft)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to cuDNN's heuristics and reused
......@@ -89,7 +89,7 @@ get an error when cuDNN can not be used with them, use this flag:
* ``none`` (default) : use the default non-deterministic convolution
implementation
* ``deterministic`` : use a slower but deterministic implementation
* ``fft`` : use the Fast Fourrier Transform implementation of convolution
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to cuDNN's heuristics and reused
......@@ -104,7 +104,7 @@ get an error when cuDNN can not be used with them, use this flag:
implementation selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
* (algo_bwd_data only) ``fft_tiling`` : use the Fast Fourrier
* (algo_bwd_data only) ``fft_tiling`` : use the Fast Fourier
Transform implementation of convolution with tiling (high memory
usage, but less than fft)
......
......@@ -85,6 +85,10 @@ floating-point precision.
Return a Variable for a 4-dimensional ndarray
.. function:: tensor5(name=None, dtype=config.floatX)
Return a Variable for a 5-dimensional ndarray
.. #COMMENT
Each of the types described above can be constructed by two methods:
a singular version (e.g., :ref:`dmatrix <libdoc_tensor_creation>`)
......@@ -112,66 +116,74 @@ They are all callable, and accept an optional ``name`` argument. So for example
table generated by
$ python Theano/doc/generate_dtype_tensor_table.py
============ =========== ==== =========== =================================
Constructor dtype ndim shape broadcastable
============ =========== ==== =========== =================================
bscalar int8 0 () ()
bvector int8 1 (?,) (False,)
brow int8 2 (1,?) (True, False)
bcol int8 2 (?,1) (False, True)
bmatrix int8 2 (?,?) (False, False)
btensor3 int8 3 (?,?,?) (False, False, False)
btensor4 int8 4 (?,?,?,?) (False, False, False, False)
wscalar int16 0 () ()
wvector int16 1 (?,) (False,)
wrow int16 2 (1,?) (True, False)
wcol int16 2 (?,1) (False, True)
wmatrix int16 2 (?,?) (False, False)
wtensor3 int16 3 (?,?,?) (False, False, False)
wtensor4 int16 4 (?,?,?,?) (False, False, False, False)
iscalar int32 0 () ()
ivector int32 1 (?,) (False,)
irow int32 2 (1,?) (True, False)
icol int32 2 (?,1) (False, True)
imatrix int32 2 (?,?) (False, False)
itensor3 int32 3 (?,?,?) (False, False, False)
itensor4 int32 4 (?,?,?,?) (False, False, False, False)
lscalar int64 0 () ()
lvector int64 1 (?,) (False,)
lrow int64 2 (1,?) (True, False)
lcol int64 2 (?,1) (False, True)
lmatrix int64 2 (?,?) (False, False)
ltensor3 int64 3 (?,?,?) (False, False, False)
ltensor4 int64 4 (?,?,?,?) (False, False, False, False)
dscalar float64 0 () ()
dvector float64 1 (?,) (False,)
drow float64 2 (1,?) (True, False)
dcol float64 2 (?,1) (False, True)
dmatrix float64 2 (?,?) (False, False)
dtensor3 float64 3 (?,?,?) (False, False, False)
dtensor4 float64 4 (?,?,?,?) (False, False, False, False)
fscalar float32 0 () ()
fvector float32 1 (?,) (False,)
frow float32 2 (1,?) (True, False)
fcol float32 2 (?,1) (False, True)
fmatrix float32 2 (?,?) (False, False)
ftensor3 float32 3 (?,?,?) (False, False, False)
ftensor4 float32 4 (?,?,?,?) (False, False, False, False)
cscalar complex64 0 () ()
cvector complex64 1 (?,) (False,)
crow complex64 2 (1,?) (True, False)
ccol complex64 2 (?,1) (False, True)
cmatrix complex64 2 (?,?) (False, False)
ctensor3 complex64 3 (?,?,?) (False, False, False)
ctensor4 complex64 4 (?,?,?,?) (False, False, False, False)
zscalar complex128 0 () ()
zvector complex128 1 (?,) (False,)
zrow complex128 2 (1,?) (True, False)
zcol complex128 2 (?,1) (False, True)
zmatrix complex128 2 (?,?) (False, False)
ztensor3 complex128 3 (?,?,?) (False, False, False)
ztensor4 complex128 4 (?,?,?,?) (False, False, False, False)
============ =========== ==== =========== =================================
============ =========== ==== ============ ===================================
Constructor dtype ndim shape broadcastable
============ =========== ==== ============ ===================================
bscalar int8 0 () ()
bvector int8 1 (?,) (False,)
brow int8 2 (1,?) (True, False)
bcol int8 2 (?,1) (False, True)
bmatrix int8 2 (?,?) (False, False)
btensor3 int8 3 (?,?,?) (False, False, False)
btensor4 int8 4 (?,?,?,?) (False, False, False, False)
btensor5 int8 5 (?,?,?,?,?) (False, False, False, False, False)
wscalar int16 0 () ()
wvector int16 1 (?,) (False,)
wrow int16 2 (1,?) (True, False)
wcol int16 2 (?,1) (False, True)
wmatrix int16 2 (?,?) (False, False)
wtensor3 int16 3 (?,?,?) (False, False, False)
wtensor4 int16 4 (?,?,?,?) (False, False, False, False)
wtensor5 int16 5 (?,?,?,?,?) (False, False, False, False, False)
iscalar int32 0 () ()
ivector int32 1 (?,) (False,)
irow int32 2 (1,?) (True, False)
icol int32 2 (?,1) (False, True)
imatrix int32 2 (?,?) (False, False)
itensor3 int32 3 (?,?,?) (False, False, False)
itensor4 int32 4 (?,?,?,?) (False, False, False, False)
itensor5 int32 5 (?,?,?,?,?) (False, False, False, False, False)
lscalar int64 0 () ()
lvector int64 1 (?,) (False,)
lrow int64 2 (1,?) (True, False)
lcol int64 2 (?,1) (False, True)
lmatrix int64 2 (?,?) (False, False)
ltensor3 int64 3 (?,?,?) (False, False, False)
ltensor4 int64 4 (?,?,?,?) (False, False, False, False)
ltensor5 int64 5 (?,?,?,?,?) (False, False, False, False, False)
dscalar float64 0 () ()
dvector float64 1 (?,) (False,)
drow float64 2 (1,?) (True, False)
dcol float64 2 (?,1) (False, True)
dmatrix float64 2 (?,?) (False, False)
dtensor3 float64 3 (?,?,?) (False, False, False)
dtensor4 float64 4 (?,?,?,?) (False, False, False, False)
dtensor5 float64 5 (?,?,?,?,?) (False, False, False, False, False)
fscalar float32 0 () ()
fvector float32 1 (?,) (False,)
frow float32 2 (1,?) (True, False)
fcol float32 2 (?,1) (False, True)
fmatrix float32 2 (?,?) (False, False)
ftensor3 float32 3 (?,?,?) (False, False, False)
ftensor4 float32 4 (?,?,?,?) (False, False, False, False)
ftensor5 float32 5 (?,?,?,?,?) (False, False, False, False, False)
cscalar complex64 0 () ()
cvector complex64 1 (?,) (False,)
crow complex64 2 (1,?) (True, False)
ccol complex64 2 (?,1) (False, True)
cmatrix complex64 2 (?,?) (False, False)
ctensor3 complex64 3 (?,?,?) (False, False, False)
ctensor4 complex64 4 (?,?,?,?) (False, False, False, False)
ctensor5 complex64 5 (?,?,?,?,?) (False, False, False, False, False)
zscalar complex128 0 () ()
zvector complex128 1 (?,) (False,)
zrow complex128 2 (1,?) (True, False)
zcol complex128 2 (?,1) (False, True)
zmatrix complex128 2 (?,?) (False, False)
ztensor3 complex128 3 (?,?,?) (False, False, False)
ztensor4 complex128 4 (?,?,?,?) (False, False, False, False)
ztensor5 complex128 5 (?,?,?,?,?) (False, False, False, False, False)
============ =========== ==== ============ ===================================
Plural Constructors
--------------------------
......@@ -220,11 +232,11 @@ If you would like to construct a tensor variable with a non-standard
broadcasting pattern, or a larger number of dimensions you'll need to create
your own :class:`TensorType` instance. You create such an instance by passing
the dtype and broadcasting pattern to the constructor. For example, you
can create your own 5-dimensional tensor type
can create your own 6-dimensional tensor type
>>> dtensor5 = TensorType('float64', (False,)*5)
>>> x = dtensor5()
>>> z = dtensor5('z')
>>> dtensor6 = TensorType('float64', (False,)*6)
>>> x = dtensor6()
>>> z = dtensor6('z')
You can also redefine some of the provided types and they will interact
correctly:
......@@ -1095,13 +1107,11 @@ Indexing
Like NumPy, Theano distinguishes between *basic* and *advanced* indexing.
Theano fully supports basic indexing
(see `NumPy's indexing <http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html>`_).
`Integer advanced indexing
<http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#integer>`_
will be supported in 0.6rc4 (or the development version). We do not
support boolean masks, as Theano does not have a boolean type (we use
int8 for the output of logic operators).
(see `NumPy's indexing <http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html>`_)
and `integer advanced indexing
<http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#integer>`_. We do not
support boolean masks, as Theano does not have a boolean type (we use int8 for the output of
logic operators).
.. testsetup:: indexing
......
Requirements
============
.. note::
We only support the installation of the requirements through conda.
.. _BLAS: http://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms
.. _Python: http://www.python.org/
Python_ >= 2.6 or >= 3.3
The development package (python-dev or python-devel on most Linux distributions) is recommended (see just below). Python 2.4 was supported up to and including the release 0.6. Python 3 is supported past the 3.3 release.
`NumPy <http://numpy.scipy.org/>`_ >= 1.7.1 < 1.11.1
Earlier versions could work, but we don't test them.
`SciPy <http://scipy.org>`_ >= 0.11 < 0.17.1
Only currently required for sparse matrix and special functions support, but highly recommended. SciPy >=0.8 could work, but earlier versions have known bugs with sparse matrices.
`BLAS`_ installation (with Level 3 functionality)
* **Recommended**: MKL, which is free through Conda.
* Alternatively, we suggest to install OpenBLAS, with the development headers (``-dev``, ``-devel``, depending on your Linux distribution).
**Optional requirements**
``python-dev``, ``g++`` >= 4.2
**Highly recommended.** Theano can fall back on a NumPy-based Python execution model, but a C compiler allows for vastly faster execution.
`nose <http://nose.readthedocs.io/en/latest/>`_ >= 1.3.0
Recommended, to run Theano's test-suite.
`Sphinx <http://sphinx.pocoo.org/>`_ >= 0.5.1, `pygments <http://pygments.org/>`_
For building the documentation. LaTeX_ and dvipng_ are also necessary for math to show up as images.
`pydot-ng <https://github.com/pydot/pydot-ng>`_
To handle large pictures for gifs/images.
`NVIDIA CUDA drivers and SDK`_
**Highly recommended** Required for GPU code generation/execution on NVIDIA gpus. See instruction below.
`libgpuarray`_
Required for GPU/CPU code generation on CUDA and OpenCL devices (see: :ref:`gpuarray`.)
Requirements installation through Conda (recommended)
-----------------------------------------------------
Install Miniconda
^^^^^^^^^^^^^^^^^
Follow this `link <http://conda.pydata.org/miniconda.html>`__ to install Miniconda.
.. note::
If you want fast compiled code (recommended), make sure you have g++ (Windows/Linux) or Clang (OS X) installed.
Install requirements and optional packages
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: bash
conda install numpy scipy mkl <nose> <sphinx> <pydot-ng>
* Arguments between <...> are optional.
Install and configure the GPU drivers (recommended)
---------------------------------------------------
.. warning::
OpenCL support is still minimal for now.
1. Install CUDA drivers
* Follow `this link <https://developer.nvidia.com/cuda-downloads>`__
to install the CUDA driver and the CUDA Toolkit.
* You must reboot the computer after the driver installation.
* Test that it was loaded correctly after the reboot, executing the
command `nvidia-smi` from the command line.
.. note::
Sanity check: The *bin* subfolder should contain an *nvcc*
program. This folder is called the *cuda root* directory.
2. Fix 'lib' path
* Add the 'lib' subdirectory (and/or 'lib64' subdirectory if you have a
64-bit OS) to your ``$LD_LIBRARY_PATH`` environment
variable.
3. Set Theano's config flags
To use the GPU you need to define the *cuda root*. You can do it in one
of the following ways:
* Define a $CUDA_ROOT environment variable to equal the cuda root directory, as in ``CUDA_ROOT=/path/to/cuda/root``, or
* add a ``cuda.root`` flag to :envvar:`THEANO_FLAGS`, as in ``THEANO_FLAGS='cuda.root=/path/to/cuda/root'``, or
* add a [cuda] section to your .theanorc file containing the option ``root = /path/to/cuda/root``.
.. _LaTeX: http://www.latex-project.org/
.. _dvipng: http://savannah.nongnu.org/projects/dvipng/
.. _NVIDIA CUDA drivers and SDK: http://developer.nvidia.com/object/gpucomputing.html
.. _libgpuarray: http://deeplearning.net/software/libgpuarray/installation.html
......@@ -54,6 +54,10 @@ if __name__ == '__main__':
pythonpath = os.pathsep.join([throot, pythonpath])
sys.path[0:0] = [throot] # We must not use os.environ.
# Make sure we don't use gpu to compile documentation
env_th_flags = os.environ.get('THEANO_FLAGS', '')
os.environ['THEANO_FLAGS'] = 'device=cpu,force_device=True'
def call_sphinx(builder, workdir):
import sphinx
if options['--check']:
......@@ -99,3 +103,6 @@ if __name__ == '__main__':
# To go back to the original current directory.
os.chdir(currentdir)
# Reset THEANO_FLAGS
os.environ['THEANO_FLAGS'] = env_th_flags
差异被折叠。
......@@ -175,13 +175,13 @@ by :ref:`broadcasting <libdoc_tensor_broadcastable>`.
The following types are available:
* **byte**: ``bscalar, bvector, bmatrix, brow, bcol, btensor3, btensor4``
* **16-bit integers**: ``wscalar, wvector, wmatrix, wrow, wcol, wtensor3, wtensor4``
* **32-bit integers**: ``iscalar, ivector, imatrix, irow, icol, itensor3, itensor4``
* **64-bit integers**: ``lscalar, lvector, lmatrix, lrow, lcol, ltensor3, ltensor4``
* **float**: ``fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4``
* **double**: ``dscalar, dvector, dmatrix, drow, dcol, dtensor3, dtensor4``
* **complex**: ``cscalar, cvector, cmatrix, crow, ccol, ctensor3, ctensor4``
* **byte**: ``bscalar, bvector, bmatrix, brow, bcol, btensor3, btensor4, btensor5``
* **16-bit integers**: ``wscalar, wvector, wmatrix, wrow, wcol, wtensor3, wtensor4, wtensor5``
* **32-bit integers**: ``iscalar, ivector, imatrix, irow, icol, itensor3, itensor4, itensor5``
* **64-bit integers**: ``lscalar, lvector, lmatrix, lrow, lcol, ltensor3, ltensor4, ltensor5``
* **float**: ``fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4, ftensor5``
* **double**: ``dscalar, dvector, dmatrix, drow, dcol, dtensor3, dtensor4, dtensor5``
* **complex**: ``cscalar, cvector, cmatrix, crow, ccol, ctensor3, ctensor4, ctensor5``
The previous list is not exhaustive and a guide to all types compatible
with NumPy arrays may be found here: :ref:`tensor creation<libdoc_tensor_creation>`.
......
.. _using_gpu:
=============
......@@ -19,11 +18,33 @@ There are two ways currently to use a gpu, one that should support any OpenCL
device as well as NVIDIA cards (:ref:`gpuarray`), and the old backend that
only supports NVIDIA cards (:ref:`cuda`).
Using the GPU in Theano is as simple as setting the ``device`` configuration
flag to ``device=cuda`` (or ``device=gpu`` for the old backend). You can optionally target a specific gpu by specifying
the number of the gpu as in e.g. ``device=cuda2``. You also need to set the
default floating point precision.
For example: ``THEANO_FLAGS='cuda.root=/path/to/cuda/root,device=cuda,floatX=float32'``.
You can also set these options in the .theanorc file's ``[global]`` section:
.. code-block:: cfg
[global]
device = cuda
floatX = float32
.. warning::
If you want to use the new GpuArray backend, make sure to have the
development version of Theano installed. The 0.8.X releases have not
been optimized to work correctly with the new backend.
The old CUDA backend will be deprecated soon, in favor of the new libgpuarray
backend.
.. note::
* If your computer has multiple GPUs and you use ``device=cuda``, the driver
selects the one to use (usually gpu0).
* You can use the program ``nvidia-smi`` to change this policy.
* By default, when ``device`` indicates preference for GPU computations,
Theano will fall back to the CPU if there is a problem with the GPU.
You can use the flag ``force_device=True`` to instead raise an error when
Theano cannot use the GPU.
.. _gpuarray:
......@@ -31,19 +52,32 @@ GpuArray Backend
----------------
If you have not done so already, you will need to install libgpuarray
as well as at least one computing toolkit. Instructions for doing so
are provided at `libgpuarray <http://deeplearning.net/software/libgpuarray/installation.html>`_.
as well as at least one computing toolkit (CUDA or OpenCL). Detailed
instructions to accomplish that are provided at
`libgpuarray <http://deeplearning.net/software/libgpuarray/installation.html>`_.
To install Nvidia's GPU-programming toolchain (CUDA) and configure
Theano to use it, see the installation instructions for
:ref:`Linux <gpu_linux>`, :ref:`MacOS <gpu_macos>` and :ref:`Windows <gpu_windows>`.
While all types of devices are supported if using OpenCL, for the
remainder of this section, whatever compute device you are using will
be referred to as GPU.
.. warning::
If you want to use the new GpuArray backend, make sure to have the
development version of Theano installed. The 0.8.X releases have not
been optimized to work correctly with the new backend.
.. warning::
The backend was designed to support OpenCL, however current support is
incomplete. A lot of very useful ops still do not support it because they
were ported from the old backend with minimal change.
.. _testing_the_gpu:
Testing Theano with GPU
~~~~~~~~~~~~~~~~~~~~~~~
......@@ -150,7 +184,7 @@ the GPU object directly. The following code is modified to do just that.
Here ``tensor.exp(x).transfer(None)`` means "copy ``exp(x)`` to the GPU",
with ``None`` the default GPU context when not explicitly given.
For information on how to set GPU contexts, see :ref:`tut_using_multi_gpu`.
For information on how to set GPU contexts, see :ref:`tut_using_multi_gpu`.
The output is
......@@ -227,10 +261,10 @@ Tips for Improving Performance on GPU
``.theanorc`` file if you plan to do a lot of GPU work.
* The GPU backend supports *float64* variables, but they are still slower
to compute than *float32*. The more *float32*, the better GPU performance
you will get.
* Prefer constructors like ``matrix``, ``vector`` and ``scalar`` (which
you will get.
* Prefer constructors like ``matrix``, ``vector`` and ``scalar`` (which
follow the type set in ``floatX``) to ``dmatrix``, ``dvector`` and
``dscalar``. The latter enforce double precision (*float64* on most
``dscalar``. The latter enforce double precision (*float64* on most
machines), which slows down GPU computations on current hardware.
* Minimize transfers to the GPU device by using ``shared`` variables
to store frequently-accessed data (see :func:`shared()<shared.shared>`).
......
.. include:: css.inc
.. _updating:
Updating Theano
===============
Follow one of these three sections depending on how you installed Theano.
You should update frequently, bugs are fixed on a very regular basis, and features are
added even more frequently!
Stable Installation
-------------------
The following command will update only Theano:
.. raw:: html
<pre><span class="red">&#60;sudo&#62;</span> pip install <span class="blue">&#60;--user&#62;</span> <span class="pink">&#60;--no-deps&#62;</span> theano</pre>
- Use :red:`sudo` for a root installation.
- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
- Use :pink:`no-deps` when you do not want pip to install the dependencies of Theano. This is important when they have already been installed as system packages.
.. warning::
If you installed NumPy/SciPy with yum/apt-get, updating NumPy/SciPy
with pip/easy_install is not always a good idea. This can make Theano
crash due to problems with BLAS. The versions of
NumPy/SciPy in the distribution are sometimes linked against faster
versions of BLAS. Installing NumPy/SciPy with
yum/apt-get/pip/easy_install won't install the development package
needed to recompile it with the fast version.
To fix a possible crash, you can clear
the Theano cache like this:
.. code-block:: bash
theano-cache clear
Bleeding-Edge Installation
--------------------------
The following command will update your bleeding-edge version of Theano
.. raw:: html
<div style="width:100%"><pre><span class="red">&#60;sudo&#62;</span> pip install <span class="blue">&#60;--user&#62;</span> <span class="pink">&#60;--no-deps&#62;</span> git+https://github.com/Theano/Theano.git#egg=Theano</pre></div>
- Use :red:`sudo` for a root installation.
- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
- Use :pink:`no-deps` when you do not want pip to install the dependencies of Theano. This is important when they have already been installed as system packages.
.. warning::
If you installed NumPy/SciPy with yum/apt-get, updating NumPy/SciPy
with pip/easy_install is not always a good idea. This can make Theano
crash due to problems with BLAS. The versions of
NumPy/SciPy in the distribution are sometimes linked against faster
versions of BLAS. Installing NumPy/SciPy with
yum/apt-get/pip/easy_install won't install the development package
needed to recompile it with the fast version.
To fix a possible crash, you can clear
the Theano cache like this:
.. code-block:: bash
theano-cache clear
Developer Installation
----------------------
To update your library to the latest revision, change directory (``cd``)
to your ``Theano`` folder and execute the following command:
.. warning::
The following assumes you have knowledge of git and know how to do a rebase.
.. code-block:: bash
git pull --rebase
......@@ -166,7 +166,7 @@ def do_setup():
install_requires=['numpy>=1.7.1', 'scipy>=0.11', 'six>=1.9.0'],
# pygments is a dependency for Sphinx code highlight
extras_require={
'test': ['nose>=1.3.0', 'nose-parameterized>=0.5.0'],
'test': ['nose>=1.3.0', 'nose-parameterized>=0.5.0', 'flake8<3'],
'doc': ['Sphinx>=0.5.1', 'pygments']
},
package_data={
......
......@@ -147,7 +147,7 @@ class BadThunkOutput(DebugModeError):
print(" thunk2 :", self.thunk2, file=sio)
# Don't import it at the top of the file to prevent circular import.
utt = theano.tests.unittest_tools
import theano.tests.unittest_tools as utt
print(utt.str_diagnostic(self.val1, self.val2, None, None), file=sio)
ret = sio.getvalue()
return ret
......@@ -1769,12 +1769,13 @@ class _Linker(gof.link.LocalLinker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
assert type(self) is _Linker
return type(self)(maker=self.maker).accept(fgraph, no_recycling)
return type(self)(maker=self.maker).accept(
fgraph, no_recycling, profile)
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......
......@@ -28,7 +28,7 @@ def function_dump(filename, inputs, outputs=None, mode=None, updates=None,
on_unused_input=None,
extra_tag_to_remove=None):
"""
This is helpful to make a reproducable case for problem during Theano
This is helpful to make a reproducible case for problems during Theano
compilation.
Ex:
......@@ -36,13 +36,13 @@ def function_dump(filename, inputs, outputs=None, mode=None, updates=None,
replace `theano.function(...)` by
`theano.function_dump('filename.pkl', ...)`.
If you see this, you where probably asked to use this function to
If you see this, you were probably asked to use this function to
help debug a particular case during the compilation of a Theano
function. `function_dump` allows to easily reproduce your
compilation without asking any code. It pickle all the objects and
function. `function_dump` allows you to easily reproduce your
compilation without generating any code. It pickles all the objects and
parameters needed to reproduce a call to `theano.function()`. This
include shared variables and there values. If you do not want
that, you can set to replace shared variables values by zeros by
includes shared variables and their values. If you do not want
that, you can choose to replace shared variables values with zeros by
calling set_value(...) on them before calling `function_dump`.
To load such a dump and do the compilation:
......@@ -53,9 +53,9 @@ def function_dump(filename, inputs, outputs=None, mode=None, updates=None,
>>> f = theano.function(**d) # doctest: +SKIP
Note:
The parameter extra_tag_to_remove, is passed to the StripPickler used.
The parameter `extra_tag_to_remove` is passed to the StripPickler used.
To pickle graph made by Blocks, it must be:
['annotations', 'replacement_of', 'aggregation_scheme', 'roles']
`['annotations', 'replacement_of', 'aggregation_scheme', 'roles']`
"""
assert isinstance(filename, string_types)
......@@ -78,7 +78,8 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
rebuild_strict=True, allow_input_downcast=None, profile=None,
on_unused_input=None):
"""
Return a callable object that will calculate `outputs` from `inputs`.
Return a :class:`callable object <theano.compile.function_module.Function>`
that will calculate `outputs` from `inputs`.
Parameters
----------
......@@ -100,6 +101,10 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
If True, do not perform any automatic update on Variables. If False
(default), perform them all. Else, perform automatic updates on all
Variables that are neither in "updates" nor in "no_default_updates".
accept_inplace : bool
True iff the graph can contain inplace operations prior to the
optimization phase (default is False). *Note* this parameter is unsupported,
and its use is not recommended.
name : str
An optional name for this function. The profile mode will print the time
spent in this function.
......@@ -115,10 +120,10 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
Ops in the graph, an Exception will be raised.
allow_input_downcast: bool or None
True means that the values passed as inputs when calling the function
can be silently downcasted to fit the dtype of the corresponding
can be silently down-casted to fit the dtype of the corresponding
Variable, which may lose precision. False means that it will only be
cast to a more general, or precise, type. None (default) is almost like
False, but allows downcasting of Python float scalars to floatX.
False, but allows down-casting of Python float scalars to floatX.
profile: None, True, or ProfileStats instance
Accumulate profiling information into a given ProfileStats instance.
If argument is `True` then a new ProfileStats instance will be used.
......@@ -131,7 +136,7 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
Returns
-------
Function instance
:class:`theano.compile.function_module.Function` instance
A callable object that will compute the outputs (given the inputs) and
update the implicit function arguments according to the `updates`.
......@@ -209,9 +214,9 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
4. Linker
The linker uses a Python loop to execute the code associated
with all the Apply nodes in the graph in the correct order.
The CVM is a linker that replaces this Python loop with a C
loop to avoid continuously changing between Python and C.
The CVM is faster for 2 reasons:
The C Virtual Machine (CVM) is a linker that replaces this
Python loop with a C loop to avoid continuously changing
between Python and C. The CVM is faster for 2 reasons:
1) Its internal logic is in C, so no Python interpreter
overhead.
2) It makes native calls from the VM logic into thunks that
......@@ -219,7 +224,6 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
The VM is a linker that was developed to prototype the CVM. It
was easier to develop the VM in Python and then translate it to C
than to write it in C from scratch.
CVM stands for C Virtual Machine.
"""
if isinstance(outputs, dict):
......@@ -252,7 +256,7 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
func_frame = stack[idx - 1]
while "theano/gof" in func_frame[0] and idx > 0:
idx -= 1
# This can hapen if we call var.eval()
# This can happen if we call var.eval()
func_frame = stack[idx - 1]
name = func_frame[0] + ':' + str(func_frame[1])
......
......@@ -735,9 +735,13 @@ class Function(object):
kwargs : dict
The function inputs can be passed as keyword argument. For this, use
the name of the input or the input instance as the key.
Keyword argument ``output_subset`` is a list of either indices of the
function's outputs or the keys belonging to the `output_keys` dict
and represent outputs that are requested to be calculated.
and represent outputs that are requested to be calculated. Regardless
of the presence of ``output_subset``, the updates are always calculated
and processed. To disable the updates, you should use the ``copy``
method with ``delete_updates=True``.
Returns
-------
......@@ -1496,9 +1500,10 @@ class FunctionMaker(object):
if not spec.borrow]
if no_borrow:
self.linker = linker.accept(
fgraph, no_recycling=infer_reuse_pattern(fgraph, no_borrow))
fgraph, no_recycling=infer_reuse_pattern(fgraph, no_borrow),
profile=profile)
else:
self.linker = linker.accept(fgraph)
self.linker = linker.accept(fgraph, profile=profile)
if hasattr(linker, 'accept_var_updates'):
# hacky thing so VMLinker knows about updates
......@@ -1722,8 +1727,8 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False,
Default of None means to use `config.mode` (see below for descriptive
string list).
name : str
An optional name for this fct. If used, the profile mode will print the
time spent in this fct.
An optional name for this function. If used, the profile mode will print the
time spent in this function.
accept_inplace : bool
True iff the graph can contain inplace operations prior to the
optimization phase (default is False).
......
......@@ -5,8 +5,6 @@ WRITEME
from __future__ import absolute_import, print_function, division
import logging
import numpy
import theano
from theano import gof
import theano.gof.vm
......@@ -18,35 +16,6 @@ from six import string_types
_logger = logging.getLogger('theano.compile.mode')
def check_equal(x, y):
"""
Returns True iff x[0] and y[0] are equal (checks the dtype and shape if x
and y are numpy.ndarray instances). Used internally.
"""
# I put the import here to allow using theano without scipy.
import scipy.sparse as sp
x, y = x[0], y[0]
# TODO: bug in current scipy, two sparse matrices are never equal,
# remove when moving to 0.7
if sp.issparse(x):
x = x.todense()
if sp.issparse(y):
y = y.todense()
if isinstance(x, numpy.ndarray) and isinstance(y, numpy.ndarray):
if (x.dtype != y.dtype or
x.shape != y.shape or
numpy.any(abs(x - y) > 1e-10)):
raise Exception("Output mismatch.",
{'performlinker': x, 'clinker': y})
else:
if x != y:
raise Exception("Output mismatch.",
{'performlinker': x, 'clinker': y})
# If a string is passed as the linker argument in the constructor for
# Mode, it will be used as the key to retrieve the real linker in this
# dictionary
......@@ -384,7 +353,7 @@ predefined_modes = {'FAST_COMPILE': FAST_COMPILE,
'FAST_RUN': FAST_RUN,
}
instanciated_default_mode = None
instantiated_default_mode = None
def get_mode(orig_string):
......@@ -395,17 +364,17 @@ def get_mode(orig_string):
if not isinstance(string, string_types):
return string # it is hopefully already a mode...
global instanciated_default_mode
global instantiated_default_mode
# The default mode is cached. However, config.mode can change
# If instanciated_default_mode has the right class, use it.
if orig_string is None and instanciated_default_mode:
# If instantiated_default_mode has the right class, use it.
if orig_string is None and instantiated_default_mode:
if string in predefined_modes:
default_mode_class = predefined_modes[string].__class__.__name__
else:
default_mode_class = string
if (instanciated_default_mode.__class__.__name__ ==
if (instantiated_default_mode.__class__.__name__ ==
default_mode_class):
return instanciated_default_mode
return instantiated_default_mode
if string in ['Mode', 'ProfileMode', 'DebugMode', 'NanGuardMode']:
if string == 'DebugMode':
......@@ -422,6 +391,7 @@ def get_mode(orig_string):
# This might be required if the string is 'ProfileMode'
from .profilemode import ProfileMode # noqa
from .profilemode import prof_mode_instance_to_print
# TODO: Can't we look up the name and invoke it rather than using eval here?
ret = eval(string +
'(linker=config.linker, optimizer=config.optimizer)')
elif string in predefined_modes:
......@@ -437,7 +407,7 @@ def get_mode(orig_string):
ret = ret.including(*theano.config.optimizer_including.split(':'))
if theano.config.optimizer_requiring:
ret = ret.requiring(*theano.config.optimizer_requiring.split(':'))
instanciated_default_mode = ret
instantiated_default_mode = ret
# must tell python to print the summary at the end.
if string == 'ProfileMode':
......
......@@ -41,7 +41,7 @@ def flatten(l):
return rval
def contains_nan(arr, node=None):
def contains_nan(arr, node=None, var=None):
"""
Test whether a numpy.ndarray contains any `np.nan` values.
......@@ -50,6 +50,7 @@ def contains_nan(arr, node=None):
arr : np.ndarray or output of any Theano op
node : None or an Apply instance.
If arr is the output of a Theano op, the node associated to it.
var : The Theano symbolic variable.
Returns
-------
......@@ -68,6 +69,8 @@ def contains_nan(arr, node=None):
return False
elif isinstance(arr, np.random.mtrand.RandomState):
return False
elif var and getattr(var.tag, 'is_rng', False):
return False
elif isinstance(arr, slice):
return False
elif arr.size == 0:
......@@ -86,7 +89,7 @@ def contains_nan(arr, node=None):
return np.isnan(np.min(arr))
def contains_inf(arr, node=None):
def contains_inf(arr, node=None, var=None):
"""
Test whether a numpy.ndarray contains any `np.inf` values.
......@@ -95,6 +98,7 @@ def contains_inf(arr, node=None):
arr : np.ndarray or output of any Theano op
node : None or an Apply instance.
If the output of a Theano op, the node associated to it.
var : The Theano symbolic variable.
Returns
-------
......@@ -114,6 +118,8 @@ def contains_inf(arr, node=None):
return False
elif isinstance(arr, np.random.mtrand.RandomState):
return False
elif var and getattr(var.tag, 'is_rng', False):
return False
elif isinstance(arr, slice):
return False
elif arr.size == 0:
......@@ -215,44 +221,47 @@ class NanGuardMode(Mode):
assert nan_is_error or inf_is_error or big_is_error
compile_gpu_func(nan_is_error, inf_is_error, big_is_error)
def do_check_on(var, nd):
def do_check_on(value, nd, var=None):
"""
Checks `var` for NaNs / Infs. If detected, raises an exception
Checks `value` for NaNs / Infs. If detected, raises an exception
and / or prints information about `nd`, `f`, and `is_input` to
help the user determine the cause of the invalid values.
Parameters
----------
var : numpy.ndarray
value : numpy.ndarray
The value to be checked.
nd : theano.gof.Apply
The Apply node being executed.
var : theano.gof.Variable
Not used if nd is there. Otherwise, used to print the stack
trace for inputs of the graph.
"""
error = False
sio = StringIO()
if nan_is_error:
if contains_nan(var, nd):
if contains_nan(value, nd, var):
print('NaN detected', file=sio)
error = True
if inf_is_error:
if contains_inf(var, nd):
if contains_inf(value, nd, var):
print('Inf detected', file=sio)
error = True
if big_is_error:
err = False
if isinstance(var, theano.gof.type.CDataType._cdata_type):
if isinstance(value, theano.gof.type.CDataType._cdata_type):
err = False
elif isinstance(var, np.random.mtrand.RandomState):
elif isinstance(value, np.random.mtrand.RandomState):
err = False
elif isinstance(var, slice):
elif isinstance(value, slice):
err = False
elif var.size == 0:
elif value.size == 0:
err = False
elif cuda.cuda_available and isinstance(var, cuda.CudaNdarray):
err = (f_gpuabsmax(var.reshape(var.size)) > 1e10)
elif cuda.cuda_available and isinstance(value, cuda.CudaNdarray):
err = (f_gpuabsmax(value.reshape(value.size)) > 1e10)
else:
err = (np.abs(var).max() > 1e10)
err = (np.abs(value).max() > 1e10)
if err:
print('Big value detected', file=sio)
error = True
......@@ -264,6 +273,11 @@ class NanGuardMode(Mode):
else:
print("NanGuardMode found an error in an input of the "
"graph.", file=sio)
# Add the stack trace
if nd:
var = nd.outputs[0]
print(theano.gof.utils.get_variable_trace_string(var),
file=sio)
msg = sio.getvalue()
if config.NanGuardMode.action == 'raise':
raise AssertionError(msg)
......@@ -281,7 +295,7 @@ class NanGuardMode(Mode):
def nan_check_input(var, value):
if getattr(var.tag, 'nan_guard_mode_check', True):
do_check_on(value, None)
do_check_on(value, None, var=var)
wrap_linker = theano.gof.vm.VM_Linker(callback=nan_check,
callback_input=nan_check_input)
......
......@@ -306,6 +306,10 @@ def pfunc(params, outputs=None, mode=None, updates=None, givens=None,
If False (default), perform them all. Else, perform automatic updates
on all Variables that are neither in "updates" nor in
"no_default_updates".
accept_inplace : bool
True iff the graph can contain inplace operations prior to the
optimization phase (default is False). *Note* this parameter is unsupported,
and its use is not recommended.
name : None or string
Attaches a name to the profiling result of this function.
allow_input_downcast : bool
......
......@@ -72,7 +72,8 @@ def _atexit_print_fn():
for ps in to_sum[1:]:
for attr in ["compile_time", "fct_call_time", "fct_callcount",
"vm_call_time", "optimizer_time", "linker_time",
"validate_time", "import_time"]:
"validate_time", "import_time",
"linker_node_make_thunks"]:
setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr))
# merge dictonary
......@@ -190,6 +191,8 @@ class ProfileStats(object):
import_time = 0.0
# time spent in importing compiled python module.
linker_node_make_thunks = 0.0
line_width = config.profiling.output_line_width
nb_nodes = -1
......@@ -665,6 +668,8 @@ class ProfileStats(object):
print(' Theano Linker time (includes C, CUDA code '
'generation/compiling): %es' % self.linker_time, file=file)
print(' Import time %es' % self.import_time, file=file)
print(' Node make_thunk time %es' % self.linker_node_make_thunks,
file=file)
print('', file=file)
# The validation time is a subset of optimizer_time
......
......@@ -242,6 +242,15 @@ AddConfigVar('gpuarray.preallocate',
FloatParam(0),
in_c_key=False)
AddConfigVar('gpuarray.sched',
"""The sched parameter passed for context creation to pygpu.
With CUDA, using "multi" is equivalent to using the parameter
cudaDeviceScheduleYield. This is useful to lower the
CPU overhead when waiting for GPU. One user found that it
speeds up his other processes that was doing data augmentation.
""",
EnumStr("default", "multi", "single"))
AddConfigVar('gpuarray.single_stream',
"""
If your computations are mostly lots of small elements,
......@@ -1630,6 +1639,8 @@ def short_platform(r=None, p=None):
return p
compiledir_format_dict['short_platform'] = short_platform()
# Allow to have easily one compiledir per device.
compiledir_format_dict['device'] = config.device
compiledir_format_keys = ", ".join(sorted(compiledir_format_dict.keys()))
default_compiledir_format = ("compiledir_%(short_platform)s-%(processor)s-"
"%(python_version)s-%(python_bitwidth)s")
......
......@@ -8,6 +8,7 @@ import os
import shlex
import sys
import warnings
from functools import wraps
from six import StringIO
......@@ -96,6 +97,7 @@ def change_flags(**kwargs):
Useful during tests.
"""
def change_flags_exec(f):
@wraps(f)
def inner(*args, **kwargs_):
old_val = {}
for k in kwargs:
......@@ -117,9 +119,6 @@ def change_flags(**kwargs):
assert len(l) == 1
l[0].__set__(None, old_val[k])
# Make sure that the name of the decorated function remains the same.
inner.__name__ = f.__name__
return inner
return change_flags_exec
......
......@@ -548,7 +548,7 @@ class CLinker(link.Linker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Associate linker with fgraph
......@@ -557,7 +557,8 @@ class CLinker(link.Linker):
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)(self.schedule).accept(fgraph, no_recycling)
return type(self)(self.schedule).accept(
fgraph, no_recycling, profile)
self.fgraph = fgraph
self.fetch_variables()
self.no_recycling = no_recycling
......@@ -717,7 +718,7 @@ class CLinker(link.Linker):
[get_c_declare, get_c_extract_out,
(get_c_sync, get_c_cleanup)]]
else:
raise Exception("what the fuck")
raise Exception("this shouldn't be possible, please report this exception")
builder, block = struct_variable_codeblocks(variable, policy,
id, symbol, sub)
......@@ -1737,7 +1738,7 @@ class OpWiseCLinker(link.LocalLinker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Associate linker with fgraph
"""
......@@ -1750,7 +1751,7 @@ class OpWiseCLinker(link.LocalLinker):
allow_gc=self.allow_gc,
nice_errors=self.nice_errors,
schedule=self.schedule,
).accept(fgraph, no_recycling)
).accept(fgraph, no_recycling, profile)
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......@@ -1897,7 +1898,7 @@ class DualLinker(link.Linker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Update/tie self with fgraph
"""
......@@ -1905,7 +1906,7 @@ class DualLinker(link.Linker):
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
return type(self)(self.checker, self.schedule).accept(
fgraph, no_recycling)
fgraph, no_recycling, profile)
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......
......@@ -789,15 +789,47 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
{
CLazyLinker * self = (CLazyLinker*)_self;
static char *kwlist[] = {
(char*)"time_thunks",
(char *)"time_thunks",
(char *)"n_calls",
(char *)"output_subset",
NULL};
int n_calls=1;
if (! PyArg_ParseTupleAndKeywords(args, kwds, "|ii", kwlist,
PyObject *output_subset_ptr = NULL;
if (! PyArg_ParseTupleAndKeywords(args, kwds, "|iiO", kwlist,
&self->do_timing,
&n_calls))
&n_calls,
&output_subset_ptr))
return NULL;
int err = 0;
// parse an output_subset list
// it is stored as a bool list of length n_output_vars: calculate a var or not
char *output_subset = NULL;
int output_subset_size = -1;
if (output_subset_ptr != NULL)
{
if (! PyList_Check(output_subset_ptr))
{
err = 1;
PyErr_SetString(PyExc_RuntimeError, "Output_subset is not a list");
}
else
{
output_subset_size = PyList_Size(output_subset_ptr);
output_subset = (char*)calloc(self->n_output_vars, sizeof(char));
for (int it = 0; it < output_subset_size; ++it)
{
PyObject *elem = PyList_GetItem(output_subset_ptr, it);
if (! PyInt_Check(elem))
{
err = 1;
PyErr_SetString(PyExc_RuntimeError, "Some elements of output_subset list are not int");
}
output_subset[PyInt_AsLong(elem)] = 1;
}
}
}
self->position_of_error = -1;
// create constants used to fill the var_compute_cells
PyObject * one = PyInt_FromLong(1);
......@@ -833,9 +865,13 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
}
}
int first_updated = self->n_output_vars - self->n_updates;
for (int i = 0; i < self->n_output_vars && (!err); ++i)
{
err = lazy_rec_eval(self, self->output_vars[i], one, zero);
if (i >= first_updated || output_subset == NULL || output_subset[i] == 1)
{
err = lazy_rec_eval(self, self->output_vars[i], one, zero);
}
}
if (!err)
......@@ -848,7 +884,8 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
{
Py_ssize_t src = self->output_vars[i];
PyObject * item = PyList_GetItem(self->var_value_cells[src], 0);
if (self->var_computed[src] != 1)
if ((output_subset == NULL || output_subset[i]) &&
self->var_computed[src] != 1)
{
err = 1;
PyErr_Format(PyExc_AssertionError,
......@@ -876,7 +913,7 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
}
/*
Clear everything that is left and not an output. This is needed
Clear everything that is left and not an output. This is needed
for lazy evaluation since the current GC algo is too conservative
with lazy graphs.
*/
......@@ -901,6 +938,9 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
PyList_SetItem(self->var_value_cells[i], 0, Py_None);
}
}
if (output_subset != NULL)
free(output_subset);
Py_DECREF(one);
Py_DECREF(zero);
if (err)
......@@ -1014,7 +1054,7 @@ static PyTypeObject lazylinker_ext_CLazyLinkerType = {
static PyObject * get_version(PyObject *dummy, PyObject *args)
{
PyObject *result = PyFloat_FromDouble(0.21);
PyObject *result = PyFloat_FromDouble(0.211);
return result;
}
......
......@@ -15,7 +15,7 @@ from theano.gof import cmodule
_logger = logging.getLogger('theano.gof.lazylinker_c')
force_compile = False
version = 0.21 # must match constant returned in function get_version()
version = 0.211 # must match constant returned in function get_version()
lazylinker_ext = None
......@@ -145,4 +145,4 @@ except ImportError:
release_lock()
from lazylinker_ext.lazylinker_ext import * # noqa
assert force_compile or (version == get_version())
assert force_compile or (version == get_version()) # noqa
......@@ -762,7 +762,7 @@ class PerformLinker(LocalLinker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Parameters
......@@ -781,7 +781,8 @@ class PerformLinker(LocalLinker):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
return type(self)(allow_gc=self.allow_gc).accept(fgraph, no_recycling)
return type(self)(allow_gc=self.allow_gc).accept(
fgraph, no_recycling, profile)
# raise Exception("Cannot accept from a Linker that is already tied to another FunctionGraph.")
self.fgraph = fgraph
self.no_recycling = no_recycling
......@@ -944,7 +945,7 @@ class WrapLinker(Linker):
linkers=[l.clone(allow_gc=allow_gc) for l in self.linkers],
wrapper=self.wrapper)
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Parameters
......
......@@ -5,7 +5,8 @@ from theano.gof.type import Type
from theano.gof import graph
from theano.gof.graph import Variable, Apply
from theano.gof.op import Op
from theano.gof.opt import * # noqa
from theano.gof.opt import (OpKeyOptimizer, PatternSub, NavigatorOptimizer,
TopoOptimizer, OpSub)
from theano.gof import destroyhandler
from theano.gof.fg import FunctionGraph, InconsistencyError
......
......@@ -11,7 +11,7 @@ from theano.gof.type import Type
from theano.gof.op import Op
from theano.gof import fg
from theano.gof.link import * # noqa
from theano.gof.link import PerformLinker, WrapLinker, Container
from theano.compat import cmp
......
......@@ -3,9 +3,11 @@ from __future__ import absolute_import, print_function, division
from theano.gof.type import Type
from theano.gof.graph import Variable, Apply, Constant
from theano.gof.op import Op
from theano.gof.opt import * # noqa
from theano.gof.opt import (OpKeyOptimizer, PatternSub, TopoOptimizer, OpSub,
MergeOptimizer, config, theano,
EquilibriumOptimizer, logging, pre_constant_merge,
pre_greedy_local_optimizer)
from theano.gof.fg import FunctionGraph
from theano.gof.toolbox import * # noqa
from theano import tensor as T
......
......@@ -5,7 +5,7 @@ from theano.gof.type import Type
from theano.gof.op import Op
from theano.gof.fg import FunctionGraph
from theano.gof.toolbox import * # noqa
from theano.gof.toolbox import NodeFinder
def as_variable(x):
......
......@@ -197,23 +197,53 @@ def test_speed_lazy():
def test_partial_function():
import numpy as np
from theano.tests import unittest_tools as utt
x = tensor.scalar('input')
y = x ** 2
f = theano.function([x], [y + 7, y - 9, y / 14.], mode=Mode(
optimizer=None, linker=vm.VM_Linker(allow_partial_eval=True)))
assert f(3, output_subset=[0, 1, 2]) == f(3)
assert f(4, output_subset=[0, 2]) == [f(4)[0], f(4)[2]]
utt.assert_allclose(f(5), np.array([32., 16., 1.7857142857142858]))
def check_partial_function(linker_name):
x = tensor.scalar('input')
y = x ** 2
f = theano.function([x], [y + 7, y - 9, y / 14.], mode=Mode(
optimizer=None, linker=linker_name))
assert f(3, output_subset=[0, 1, 2]) == f(3)
assert f(4, output_subset=[0, 2]) == [f(4)[0], f(4)[2]]
utt.assert_allclose(f(5), np.array([32., 16., 1.7857142857142858]))
def test_partial_function_output_keys():
x = tensor.scalar('input')
y = 3 * x
f = theano.function([x], {'a': y * 5, 'b': y - 7}, mode=Mode(
optimizer=None, linker=vm.VM_Linker(allow_partial_eval=True)))
check_partial_function(vm.VM_Linker(allow_partial_eval=True, use_cloop=False))
check_partial_function('cvm')
assert f(5, output_subset=['a'])['a'] == f(5)['a']
def test_partial_function_with_output_keys():
def check_partial_function_output_keys(linker_name):
x = tensor.scalar('input')
y = 3 * x
f = theano.function([x], {'a': y * 5, 'b': y - 7}, mode=Mode(
optimizer=None, linker=linker_name))
assert f(5, output_subset=['a'])['a'] == f(5)['a']
check_partial_function_output_keys(vm.VM_Linker(allow_partial_eval=True, use_cloop=False))
check_partial_function_output_keys('cvm')
def test_partial_function_with_updates():
def check_updates(linker_name):
x = tensor.lscalar('input')
y = theano.shared(numpy.asarray(1, 'int64'), name='global')
f = theano.function([x], [x, x + 34], updates=[(y, x + 1)], mode=Mode(
optimizer=None, linker=linker_name))
g = theano.function([x], [x - 6], updates=[(y, y + 3)], mode=Mode(
optimizer=None, linker=linker_name))
assert f(3, output_subset=[]) == []
assert y.get_value() == 4
assert g(30, output_subset=[0]) == [24]
assert g(40, output_subset=[]) == []
assert y.get_value() == 10
check_updates(vm.VM_Linker(allow_partial_eval=True, use_cloop=False))
check_updates('cvm')
def test_allow_gc_cvm():
......
......@@ -332,7 +332,8 @@ class Stack(VM):
def __init__(self, nodes, thunks, pre_call_clear,
storage_map, compute_map, fgraph, allow_gc,
dependencies=None, callback=None, callback_input=None):
n_updates, dependencies=None, callback=None,
callback_input=None):
super(Stack, self).__init__(nodes, thunks, pre_call_clear)
self.allow_gc = allow_gc
......@@ -346,6 +347,7 @@ class Stack(VM):
self.node_idx = node_idx = {}
self.callback = callback
self.callback_input = callback_input
self.n_updates = n_updates
ords = fgraph.orderings()
......@@ -417,6 +419,9 @@ class Stack(VM):
# apply_stack contains nodes
if output_subset is not None:
first_updated = len(self.outputs) - self.n_updates
output_subset = output_subset + list(range(first_updated,
len(self.outputs)))
apply_stack =\
[self.outputs[i].owner for i in output_subset
if self.outputs[i].owner]
......@@ -425,7 +430,7 @@ class Stack(VM):
last_apply_stack_len = -1
# This record all function inputs/shared varibles and constants
# This record all function inputs/shared variables and constants
for var, data in iteritems(self.storage_map):
if data[0] is None:
continue
......@@ -726,7 +731,7 @@ class VM_Linker(link.LocalLinker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Check if fgraph is the first FunctionGraph that has ever been
associated to self, else, create a new VM_Linker
......@@ -774,9 +779,11 @@ class VM_Linker(link.LocalLinker):
schedule=self.schedule,
c_thunks=self.c_thunks,
allow_partial_eval=self.allow_partial_eval
).accept(fgraph, no_recycling)
).accept(fgraph, no_recycling, profile)
self.fgraph = fgraph
self.no_recycling = no_recycling
self.profile = profile
return self
def accept_var_updates(self, updated_vars):
......@@ -842,7 +849,7 @@ class VM_Linker(link.LocalLinker):
if (self.callback is not None or self.callback_input is not None or
(config.profile and config.profile_memory) or
self.allow_partial_eval):
(self.allow_partial_eval and not self.use_cloop)):
if self.use_cloop and (self.callback is not None or
self.callback_input is not None):
......@@ -850,9 +857,9 @@ class VM_Linker(link.LocalLinker):
if self.use_cloop and config.profile_memory:
warnings.warn(
'CVM does not support memory profile, using Stack VM.')
if self.use_cloop and self.allow_partial_eval:
if not self.use_cloop and self.allow_partial_eval:
warnings.warn(
'CVM does not support partial evaluation yet, '
'LoopGC does not support partial evaluation, '
'using Stack VM.')
# Needed for allow_gc=True, profiling and storage_map reuse
deps = self.compute_gc_dependencies(storage_map)
......@@ -860,6 +867,7 @@ class VM_Linker(link.LocalLinker):
nodes, thunks, pre_call_clear,
storage_map, compute_map,
self.fgraph, self.allow_gc,
len(updated_vars),
dependencies=deps,
callback=self.callback,
callback_input=self.callback_input)
......@@ -1000,7 +1008,8 @@ class VM_Linker(link.LocalLinker):
nodes, thunks, pre_call_clear,
storage_map, compute_map,
self.fgraph, self.allow_gc,
dependencies=deps
len(updated_vars),
dependencies=deps,
)
return vm
......@@ -1031,7 +1040,7 @@ class VM_Linker(link.LocalLinker):
reallocated_info = calculate_reallocate_info(
order, fgraph, storage_map, compute_map_re, dependencies)
t0 = time.time()
for node in order:
try:
if self.c_thunks is False:
......@@ -1049,6 +1058,11 @@ class VM_Linker(link.LocalLinker):
e.args = ("The following error happened while"
" compiling the node", node, "\n") + e.args
raise
t1 = time.time()
if self.profile:
self.profile.linker_node_make_thunks += t1 - t0
for node, thunk in zip(order, thunks):
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
......
......@@ -63,7 +63,8 @@ def init_dev(dev, name=None):
if dev not in init_dev.devmap:
ctx = pygpu.init(dev,
disable_alloc_cache=config.gpuarray.preallocate < 0,
single_stream=config.gpuarray.single_stream)
single_stream=config.gpuarray.single_stream,
sched=config.gpuarray.sched)
init_dev.devmap[dev] = ctx
if config.gpuarray.preallocate > 0:
MB = (1024 * 1024)
......@@ -89,11 +90,11 @@ def init_dev(dev, name=None):
if dev.startswith('cuda'):
try:
cudnn_version = dnn.version()
# 5100 should not print warning with cudnn 5 final.
if cudnn_version > 5100:
# 5200 should not print warning with cudnn 5.1 final.
if cudnn_version >= 5200:
warnings.warn("Your cuDNN version is more recent than Theano."
" If you see problems, try updating Theano or"
" downgrading cuDNN to version 5.")
" downgrading cuDNN to version 5.1.")
if config.print_active_device:
print("Using cuDNN version %d on context %s" %
(cudnn_version, name), file=sys.stderr)
......
......@@ -47,8 +47,8 @@ class GpuGemv(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
alpha = as_tensor_variable(alpha).astype('float64')
beta = as_tensor_variable(beta).astype('float64')
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 2
......@@ -128,8 +128,8 @@ class GpuGemm(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C, ctx_name)
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
alpha = as_tensor_variable(alpha).astype('float64')
beta = as_tensor_variable(beta).astype('float64')
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 2
......@@ -208,7 +208,7 @@ class GpuGer(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
alpha = as_tensor_variable(alpha)
alpha = as_tensor_variable(alpha).astype('float64')
assert alpha.ndim == 0
assert A.ndim == 2
assert x.ndim == 1
......@@ -345,8 +345,8 @@ class GpuGemmBatch(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C, ctx_name)
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
alpha = as_tensor_variable(alpha).astype('float64')
beta = as_tensor_variable(beta).astype('float64')
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 3
......
......@@ -369,7 +369,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return node.inputs[0].type.context
def c_code_cache_version(self):
return (11,)
return (12,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
......@@ -499,7 +499,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
load_sm = load_w(dtype_sm)
write_dx = write_w(dtype_dx)
flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
type_dnll = gpuarray.dtype_to_ctype(work_dnll)
wtype_dnll = gpuarray.dtype_to_ctype(work_dnll)
type_dnll = gpuarray.dtype_to_ctype(dtype_dnll)
type_sm = gpuarray.dtype_to_ctype(dtype_sm)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
type_dx = gpuarray.dtype_to_ctype(dtype_dx)
......@@ -525,7 +526,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
for (int i = blockIdx.x; i < N; i += gridDim.x)
{
%(type_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
%(wtype_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
%(type_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x)
......
......@@ -6,6 +6,7 @@ import pdb
import time
from six import iteritems
from six.moves import xrange
import sys
import theano
from theano import tensor, scalar, gof, config
......@@ -13,7 +14,6 @@ from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import NavigatorOptimizer
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
......@@ -81,8 +81,8 @@ gpu_seqopt = SequenceDB()
gpu_seqopt.register('gpuarray_graph_optimization', GraphToGPUDB(), -0.5,
'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_local_optimizations', gpu_optimizer, 1,
'fast_compile', 'fast_run', 'gpuarray', 'gpuarray_local_optimiziations')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
'fast_compile', 'fast_run', 'gpuarray')
......@@ -262,7 +262,7 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')
class GraphToGPU(NavigatorOptimizer):
class GraphToGPU(Optimizer):
"""
Transfer the graph as a whole to GPU instead of transfering node by node.
......@@ -373,6 +373,14 @@ class GraphToGPU(NavigatorOptimizer):
if new_ops:
node_created[lopt] += len(graph.ops([mapping[i] for i in node.inputs], outputs))
if any([getattr(old_o, 'dtype', None) != getattr(new_o, 'dtype', None)
for old_o, new_o in zip(outputs, node.outputs)]):
_logger.warning(
"The optimization %s returned bad dtype. Skipping it."
" Write to theano-dev mailing list about this." %
str(lopt))
newnode = node.clone_with_new_inputs([mapping.get(i) for i in node.inputs])
outputs = newnode.outputs
for new_o, old_o in zip(outputs, node.outputs):
assert len(outputs) == len(node.outputs)
......@@ -477,6 +485,16 @@ class GraphToGPU(NavigatorOptimizer):
node_created,
process_count)
def print_summary(self, stream=sys.stdout, level=0, depth=-1):
print("%s%s (%i)" % (
(' ' * level), self.__class__.__name__, id(self)), file=stream)
if depth != 0:
map_values = []
for opts in self.local_optimizers_map.values():
map_values += opts
for opt in self.local_optimizers_all + map_values:
opt.print_summary(stream, level=(level + 2), depth=(depth - 1))
@local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu])
def local_cut_gpu_transfers(node):
......@@ -625,10 +643,7 @@ def local_gpua_contiguous(op, context_name, inputs, outputs):
@op_lifter([tensor.Reshape])
@register_opt2([tensor.Reshape], 'fast_compile')
def local_gpua_reshape(op, context_name, inputs, outputs):
name = op.name
if name:
name = 'Gpu' + name
res = GpuReshape(op.ndim, op.name)
res = GpuReshape(op.ndim)
return res
......@@ -647,7 +662,7 @@ def local_gpua_flatten(op, context_name, inputs, outputs):
if op.outdim != 1:
shp = [inputs[0].shape[i] for i in range(op.outdim - 1)]
shp += [-1]
res = GpuReshape(op.outdim, None)
res = GpuReshape(op.outdim)
o = res(inputs[0], theano.tensor.as_tensor_variable(shp))
return o
......@@ -1009,10 +1024,9 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
else:
return False
x, = inputs
greduce = op2(
op.scalar_op, axis=op.axis,
dtype=getattr(op, 'dtype', None),
dtype=getattr(op, 'dtype', outputs[0].dtype),
acc_dtype=getattr(op, 'acc_dtype', None))
gvar = greduce(x)
# We need to have the make node called, otherwise the mask can
......@@ -1051,7 +1065,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
greduce = op2(
op.scalar_op,
axis=new_axis, reduce_mask=new_mask,
dtype=getattr(op, 'dtype', None),
dtype=getattr(op, 'dtype', outputs[0].dtype),
acc_dtype=getattr(op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp))
......
......@@ -336,12 +336,23 @@ class GpuIncSubtensor(IncSubtensor):
C code expression to copy source into view, and 0 on success.
"""
return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()
return """sub_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/error.h>', '<gpuarray/array.h>',
'<gpuarray/elemwise.h>']
def c_support_code(self):
return """
int sub_setarray(GpuArray *dst, GpuArray *src) {
int err;
err = GpuArray_setarray(dst, src);
if (err != GA_NO_ERROR)
PyErr_SetString(PyExc_RuntimeError, "setarray failed");
return err;
}
"""
def c_support_code_struct(self, node, nodename):
return "\nGpuElemwise *iadd;\n"
......@@ -383,7 +394,7 @@ class GpuIncSubtensor(IncSubtensor):
parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
if not parent_version:
return
return parent_version + (7,)
return parent_version + (8,)
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
......
......@@ -21,6 +21,7 @@ from ..basic_ops import (
host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape, GpuToGpu,
GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..elemwise import GpuDimShuffle, GpuElemwise
from ..subtensor import GpuSubtensor
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
......@@ -167,6 +168,8 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
inputs, variables))
Checker.__name__ = name
if hasattr(Checker, '__qualname__'):
Checker.__qualname__ = name
return Checker
......@@ -228,6 +231,7 @@ def gpu_alloc_expected(x, *shp):
g[:] = x
return g
GpuAllocTester = makeTester(
name="GpuAllocTester",
op=alloc,
......@@ -321,7 +325,7 @@ class G_reshape(test_basic.T_reshape):
mode=mode_with_gpu,
ignore_topo=(HostFromGpu, GpuFromHost,
theano.compile.DeepCopyOp,
theano.gpuarray.elemwise.GpuElemwise,
GpuDimShuffle, GpuElemwise,
theano.tensor.opt.Shape_i,
theano.tensor.opt.MakeVector))
assert self.op == GpuReshape
......
......@@ -479,10 +479,9 @@ class TestDnnInferShapes(utt.InferShapeTester):
@parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func)
def test_conv3d_none(self, border_mode, conv_mode):
ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
self._test_conv(ftensor5('img'),
ftensor5('kerns'),
ftensor5('out'),
self._test_conv(T.ftensor5('img'),
T.ftensor5('kerns'),
T.ftensor5('out'),
numpy.random.rand(10, 2, 6, 4, 11),
numpy.random.rand(8, 2, 4, 3, 1),
border_mode,
......
......@@ -28,6 +28,27 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
raise SkipTest("Cuda specific tests")
self.max_threads_dim0 = test_ctx.maxlsize0
self.max_grid_size1 = test_ctx.maxgsize2
self.op_class = GpuCumsum
def test_infer_shape(self):
    """Check shape inference of the cumsum Op on a 3-D input, for every
    valid axis (negative and positive)."""
    # GpuCumSum is only defined for float32 for now, so we skip it
    # in the unsupported cases.
    gpucumsum_supported_dtypes = ('float32',)
    if theano.config.floatX not in gpucumsum_supported_dtypes:
        raise SkipTest('GpuCumSum not implemented for dtype %s'
                       % theano.config.floatX)
    x = T.tensor3('x')
    a = np.random.random((3, 5, 2)).astype(theano.config.floatX)
    # Exercise both negative and non-negative axis values.
    for axis in range(-len(a.shape), len(a.shape)):
        self._compile_and_check([x],
                                [cumsum(x, axis=axis)],
                                [a],
                                self.op_class)
def test_grad(self):
    # Intentionally a no-op: GpuCumsum defines no gradient, so the
    # gradient test inherited from the CPU TestCumsumOp does not apply.
    pass
def test_Strides1D(self):
x = T.fvector('x')
......
......@@ -214,8 +214,7 @@ class TestFFT(unittest.TestCase):
res_irfft = f_irfft()
inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]
irfft_ref = numpy.fft.irfftn(
inputs_ref, s=(M, M), axes=(1, 2), norm='ortho')
irfft_ref = numpy.fft.irfftn(inputs_ref, s=(M, M), axes=(1, 2)) * M
utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
......
......@@ -7,6 +7,7 @@ from theano.compile import DeepCopyOp
from theano.tensor.tests import test_subtensor
from ..basic_ops import HostFromGpu, GpuFromHost
from ..elemwise import GpuDimShuffle
from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1)
......@@ -27,6 +28,7 @@ class G_subtensor(test_subtensor.T_subtensor):
inc_sub=GpuIncSubtensor,
adv_sub1=GpuAdvancedSubtensor1,
adv_incsub1=GpuAdvancedIncSubtensor1,
dimshuffle=GpuDimShuffle,
mode=mode_with_gpu,
# avoid errors with limited devices
dtype='float32',
......
......@@ -390,8 +390,8 @@ def grad(cost, wrt, consider_constant=None,
If True, variables generated by grad will be named
(d<cost.name>/d<wrt.name>) provided that both cost and wrt
have names
known_grads : dict, optional
A dictionary mapping variables to their gradients. This is
known_grads : OrderedDict, optional
A ordered dictionary mapping variables to their gradients. This is
useful in the case where you know the gradient on some
variables but do not know the original cost.
return_disconnected : {'zero', 'None', 'Disconnected'}
......@@ -462,6 +462,9 @@ def grad(cost, wrt, consider_constant=None,
if known_grads is None:
known_grads = OrderedDict()
else:
m = "known_grads must be an OrderedDict. "
assert isinstance(known_grads, OrderedDict) or len(known_grads) <= 1, m
# The gradient of the cost is 1 unless specified otherwise by known_grads.
if cost is not None:
......@@ -1369,8 +1372,10 @@ class numeric_grad(object):
# perfectly accurate.
type_eps = {'float64': 1e-7,
'float32': 3e-4,
'float16': 1e-3,
numpy.dtype('float64'): 1e-7,
numpy.dtype('float32'): 3e-4}
numpy.dtype('float32'): 3e-4,
numpy.dtype('float16'): 1e-3}
def __init__(self, f, pt, eps=None, out_type=None):
"""Return the gradient of f at pt.
......@@ -1570,7 +1575,7 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
and returns a Theano variable. For instance, an Op instance with
a single output.
:param pt: the list of numpy.ndarrays to use as input values.
These arrays must be either float32 or float64 arrays.
These arrays must be either float16, float32, or float64 arrays.
:param n_tests: number of times to run the test
:param rng: random number generator used to sample u, we test gradient
of sum(u * fun) at pt
......@@ -1589,7 +1594,7 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
comparison
:param cast_to_output_type: if the output is float32 and
cast_to_output_type is True, cast the random projection to
float32. Otherwise it is float64.
float32. Otherwise it is float64. float16 is not handled here.
:param no_debug_ref: Don't use DebugMode for the numerical
gradient function.
......@@ -1606,12 +1611,13 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
pt = [numpy.array(p) for p in pt]
for i, p in enumerate(pt):
if p.dtype not in ('float32', 'float64'):
if p.dtype not in ('float16', 'float32', 'float64'):
raise TypeError(
('verify_grad can work only with floating point '
'inputs, but input %i has dtype "%s".') % (i, p.dtype))
_type_tol = dict( # relative error tolerances for different types
float16=5e-2,
float32=1e-2,
float64=1e-4)
......
......@@ -6,6 +6,7 @@ import os
import shutil
import stat
import sys
import textwrap
import warnings
import theano
......@@ -82,10 +83,10 @@ def set_cuda_disabled():
cuda_path = os.path.abspath(os.path.split(__file__)[0])
cuda_ndarray_loc = os.path.join(config.compiledir, 'cuda_ndarray')
cuda_ndarray_so = os.path.join(cuda_ndarray_loc,
'cuda_ndarray.' + get_lib_extension())
libcuda_ndarray_so = os.path.join(cuda_ndarray_loc,
'libcuda_ndarray.' + get_lib_extension())
cuda_ndarray_so = os.path.join(
cuda_ndarray_loc, 'cuda_ndarray.' + get_lib_extension())
libcuda_ndarray_so = os.path.join(
cuda_ndarray_loc, 'libcuda_ndarray.' + get_lib_extension())
def try_import():
......@@ -280,6 +281,7 @@ def dnn_available():
dnn_available.msg = "Device not supported"
dnn_available.avail = False
else:
<<<<<<< HEAD
preambule = """
#include <stdio.h>
#include <cuda.h>
......@@ -300,6 +302,27 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
path_wrapper = "\"" if os.name =='nt' else ""
params = ["-l", "cudnn"]
params.extend(['-I%s%s%s' % (path_wrapper,os.path.dirname(__file__),path_wrapper)])
=======
preambule = textwrap.dedent(
"""
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn_helper.h>
""")
body = textwrap.dedent(
"""
cudnnHandle_t _handle = NULL;
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
fprintf(stderr, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
return 1;
}
""")
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
>>>>>>> refs/remotes/Theano/master
if config.dnn.include_path:
params.extend(['-I%s%s%s' % (path_wrapper, config.dnn.include_path, path_wrapper)])
if config.dnn.library_path:
......@@ -307,8 +330,8 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
if config.nvcc.compiler_bindir:
params.extend(['--compiler-bindir',
config.nvcc.compiler_bindir])
params.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
params.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
......@@ -370,24 +393,26 @@ class DnnVersion(GpuOp):
return ['-Wl,-rpath,' + config.dnn.library_path]
def c_support_code(self):
return """
#if PY_MAJOR_VERSION >= 3
#define PyInt_FromLong PyLong_FromLong
#endif
"""
return textwrap.dedent(
"""
#if PY_MAJOR_VERSION >= 3
#define PyInt_FromLong PyLong_FromLong
#endif
""")
def make_node(self):
    """Build an Apply node with no inputs and one Generic output.

    The output will carry the cuDNN version information produced by
    this Op's C code.
    """
    return theano.gof.Apply(self, [], [theano.gof.Generic()()])
def c_code(self, node, name, inputs, outputs, sub):
o = outputs[0]
return """
#if defined(CUDNN_VERSION)
%(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
#else
%(o)s = PyInt_FromLong(-1);
#endif
""" % locals()
return textwrap.dedent(
"""
#if defined(CUDNN_VERSION)
%(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
#else
%(o)s = PyInt_FromLong(-1);
#endif
""") % locals()
def do_constant_folding(self, node):
# Needed as we do not want to cache this information.
......@@ -426,12 +451,13 @@ if cuda_available:
import cuda_ndarray.cuda_ndarray
if cuda_ndarray_so != cuda_ndarray.cuda_ndarray.__file__:
_logger.warning("cuda_ndarray was loaded from %s, but Theano expected "
"to load it from %s. This is not expected as theano should "
"compile it automatically for you. Do you have a directory "
"called cuda_ndarray in your LD_LIBRARY_PATH environment "
"variable? If so, please remove it as it is outdated.",
cuda_ndarray.cuda_ndarray.__file__,
cuda_ndarray_so)
"to load it from %s. This is not expected as theano "
"should compile it automatically for you. Do you have "
"a directory called cuda_ndarray in your "
"LD_LIBRARY_PATH environment variable? If so, please "
"remove it as it is outdated.",
cuda_ndarray.cuda_ndarray.__file__,
cuda_ndarray_so)
shared_constructor = float32_shared_constructor
......@@ -446,8 +472,8 @@ if cuda_available:
ftensor3, ftensor4,
scalar, vector, matrix, row, col,
tensor3, tensor4)
from .basic_ops import (host_from_gpu, gpu_from_host,
as_cuda_array, as_cuda_ndarray_variable)
from .basic_ops import (host_from_gpu, gpu_from_host, as_cuda_array,
as_cuda_ndarray_variable)
import cuda_ndarray
from . import opt, dnn
from .rng_curand import CURAND_RandomStreams
......@@ -497,10 +523,11 @@ def use(device,
raise EnvironmentError("You forced the use of gpu device %s, "
"but CUDA initialization failed "
"with error:\n%s" % (
device, cuda_initialization_error_message))
device,
cuda_initialization_error_message))
elif not nvcc_compiler.is_nvcc_available():
_logger.error('nvcc compiler not found on $PATH.'
' Check your nvcc installation and try again.')
_logger.error("nvcc compiler not found on $PATH. "
"Check your nvcc installation and try again.")
return
elif not cuda_available:
error_addendum = ""
......@@ -509,10 +536,10 @@ def use(device,
error_addendum = (" (error: %s)" %
cuda_initialization_error_message)
except NameError:
# cuda_initialization_error_message is not available b/c compilation failed
# cuda_initialization_error_message is not available b/c compilation failed
pass
_logger.warning('CUDA is installed, but device %s is not available %s',
device, error_addendum)
_logger.warning("CUDA is installed, but device %s is not available %s",
device, error_addendum)
return
if device == 'gpu':
......@@ -586,12 +613,12 @@ def use(device,
if dnn_available():
(hdr_v, runtime_v) = dnn_version()
cudnn_version = runtime_v
# 5100 should not print warning with cudnn 5 final.
if cudnn_version > 5100:
# 5200 should not print warning with cudnn 5 final.
if cudnn_version >= 5200:
warn = ("Your cuDNN version is more recent than the one"
" Theano officially supports."
" If you see any problems, try updating Theano or"
" downgrading cuDNN to version 5.")
" downgrading cuDNN to version 5.1.")
except Exception:
cudnn_version = dnn_available.msg
print("Using gpu device %d: %s (CNMeM is %s, cuDNN %s)" % (
......@@ -625,8 +652,8 @@ def use(device,
elif use.device_number != device and device != 'gpu':
_logger.warning(("Ignoring call to use(%s), GPU number %i "
"is already in use."),
str(device), use.device_number)
"is already in use."),
str(device), use.device_number)
if move_shared_float32_to_gpu:
handle_shared_float32(True)
......@@ -704,11 +731,10 @@ elif config.init_gpu_device.startswith('gpu'):
"We can use the Theano flag init_gpu_device"
" only when the Theano flag device=='cpu'")
_logger.warning(("GPU device %s will be initialized, and used if a GPU is "
"needed. "
"However, no computation, nor shared variables, will be implicitly "
"moved to that device. If you want that behavior, use the 'device' "
"flag instead."),
config.init_gpu_device)
"needed. However, no computation, nor shared variables, "
"will be implicitly moved to that device. If you want "
"that behavior, use the 'device' flag instead."),
config.init_gpu_device)
use(device=config.init_gpu_device,
force=config.force_device,
default_to_move_computation_to_gpu=False,
......
......@@ -700,6 +700,8 @@ def local_gpu_solve(node):
CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)
"""
if node.outputs[0].dtype != 'float32':
return
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if (host_input.owner and
......@@ -1352,8 +1354,9 @@ def cast(x, dtype):
@register_opt()
@local_optimizer([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
@local_optimizer([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias],
'local_gpu_crossentorpy_softmax_argmax_1hot_with_bias')
def local_gpu_crossentropy_softmax_argmax_1hot_with_bias(node):
if isinstance(node.op, tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias):
x, b, y = node.inputs
if x.owner and isinstance(x.owner.op, HostFromGpu):
......@@ -1381,8 +1384,9 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
@register_opt()
@local_optimizer([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
@local_optimizer([tensor.nnet.CrossentropySoftmax1HotWithBiasDx],
'local_gpu_crossentorpy_softmax_1hot_with_bias_dx')
def local_gpu_crossentropy_softmax_1hot_with_bias_dx(node):
if isinstance(node.op, tensor.nnet.CrossentropySoftmax1HotWithBiasDx):
dnll, sm, yidx = node.inputs
if sm.owner and isinstance(sm.owner.op, HostFromGpu):
......
......@@ -1014,6 +1014,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
inc_sub = cuda.GpuIncSubtensor
adv_sub1 = cuda.GpuAdvancedSubtensor1
adv_incsub1 = cuda.GpuAdvancedIncSubtensor1
dimshuffle = cuda.GpuDimShuffle
mode = mode_with_gpu
dtype = 'float32'
type = tcn.CudaNdarrayType
......@@ -1075,7 +1076,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
# Test with c_contiguous input
t = self.adv_sub1()(n, idx)
t.owner.op.perform_using_take = True # input c_contiguous, so we reshape
val = self.eval_output_and_check(t, list=True)
val = self.eval_output_and_check(t, op_type=self.adv_sub1)
val = numpy.asarray(val)
good = data[idx]
......
......@@ -823,7 +823,7 @@ def test_batchnorm_inference():
utt.assert_allclose(outputs[3], outputs[3 + 5]) # dscale
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5]) # dvar
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=5e-5) # dvar
def test_dnn_tag():
......@@ -938,10 +938,9 @@ class TestDnnInferShapes(utt.InferShapeTester):
def test_conv3d(self):
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('"cuDNN 3D convolution requires cuDNN v2')
ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
img = ftensor5('img')
kerns = ftensor5('kerns')
out = ftensor5('out')
img = T.ftensor5('img')
kerns = T.ftensor5('kerns')
out = T.ftensor5('out')
img_val = numpy.asarray(
numpy.random.rand(10, 2, 6, 4, 11),
dtype='float32'
......@@ -1026,10 +1025,9 @@ class TestDnnInferShapes(utt.InferShapeTester):
def test_conv3d_gradw(self):
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('"cuDNN 3D convolution requires cuDNN v2')
ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
img = ftensor5('img')
kerns = ftensor5('kerns')
out = ftensor5('out')
img = T.ftensor5('img')
kerns = T.ftensor5('kerns')
out = T.ftensor5('out')
img_val = numpy.asarray(
numpy.random.rand(9, 2, 4, 8, 13),
dtype='float32'
......@@ -1116,10 +1114,9 @@ class TestDnnInferShapes(utt.InferShapeTester):
def test_conv3d_gradi(self):
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('"cuDNN 3D convolution requires cuDNN v2')
ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
img = ftensor5('img')
kerns = ftensor5('kerns')
out = ftensor5('out')
img = T.ftensor5('img')
kerns = T.ftensor5('kerns')
out = T.ftensor5('out')
img_val = numpy.asarray(
numpy.random.rand(8, 4, 6, 7, 11),
dtype='float32'
......
......@@ -3,12 +3,14 @@ This file test tensor op that should also operate on CudaNdaray.
"""
from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
from nose_parameterized import parameterized
import numpy
import theano
from theano import tensor
import theano.tensor as T
import theano.tests.unittest_tools as utt
# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda
......@@ -139,6 +141,8 @@ def test_get_diagonal_subtensor_view():
test_conv3d2d.test_get_diagonal_subtensor_view(wrap=cuda.CudaNdarray)
def test_conv3d():
test_conv3d2d.test_conv3d(mode=mode_with_gpu,
shared=cuda.shared_constructor)
@parameterized.expand(('valid', 'full'), utt.custom_name_func)
def test_conv3d(border_mode):
    """Run the shared conv3d2d checker on the GPU for each border mode."""
    # Delegates to the generic checker with GPU mode and CUDA shared
    # storage so the computation actually runs on the device.
    test_conv3d2d.check_conv3d(border_mode=border_mode,
                               mode=mode_with_gpu,
                               shared=cuda.shared_constructor)
......@@ -2024,7 +2024,7 @@ class Scan(PureOp):
# it will be the sum of the external gradient signal and the
# gradient obtained by propagating Y's external gradient signal
# to X.
known_grads = dict([(k.copy(), v) for (k, v) in known_grads.items()])
known_grads = OrderedDict([(k.copy(), v) for (k, v) in known_grads.items()])
grads = gradient.grad(
cost=None,
......@@ -2094,7 +2094,7 @@ class Scan(PureOp):
dC_dXts.append(dC_dXt)
known_grads = {}
known_grads = OrderedDict()
dc_dxts_idx = 0
for i in range(len(diff_outputs)):
if i < idx_nitsot_start or i >= idx_nitsot_end:
......
......@@ -2472,6 +2472,8 @@ def _hv_switch(op, expected_function):
def expected_f(self, a, format=None, dtype=None):
return expected_function(a, format, dtype)
XStackTester.__name__ = op.__name__ + "Tester"
if hasattr(XStackTester, '__qualname__'):
XStackTester.__qualname__ = XStackTester.__name__
return XStackTester
HStackTester = _hv_switch(HStack, sp.hstack)
......@@ -2687,6 +2689,8 @@ def elemwise_checker(op, expected_f, gap=None, test_dtypes=None,
if name is None:
name = op.__name__.capitalize() + 'Tester'
Tester.__name__ = name
if hasattr(Tester, '__qualname__'):
Tester.__qualname__ = name
assert 'Roundhalftoeven' not in Tester.__name__
return Tester
......
......@@ -463,8 +463,8 @@ if int(config.tensor.cmp_sloppy) > 1:
# When config.tensor.cmp_sloppy>1 we are even more sloppy. This is
# useful to test the GPU as they don't use extended precision and
# this cause some difference bigger then the normal sloppy.
float16_atol = 5e-3
float16_rtol = 1e-2
float16_atol = 1e-2
float16_rtol = 5e-2
float32_atol = 5e-4
float32_rtol = 1e-3
......@@ -472,8 +472,8 @@ if int(config.tensor.cmp_sloppy) > 1:
float64_rtol = 1e-4
float64_atol = 1e-3
elif int(config.tensor.cmp_sloppy):
float16_atol = 1e-3
float16_rtol = 5e-3
float16_atol = 5e-3
float16_rtol = 1e-2
float32_atol = 1e-4
float32_rtol = 1e-3
......@@ -483,8 +483,8 @@ elif int(config.tensor.cmp_sloppy):
else:
# If you change those value in test don't forget to put them back
# when the test end. Don't forget the case when the test fail.
float16_atol = 5e-4
float16_rtol = 5e-4
float16_atol = 1e-3
float16_rtol = 1e-3
float32_atol = 1e-5
float32_rtol = 1e-5
......@@ -1030,6 +1030,34 @@ def tensor4(name=None, dtype=None):
tensor4s, ftensor4s, dtensor4s, itensor4s, ltensor4s = _multi(
tensor4, ftensor4, dtensor4, itensor4, ltensor4)
ctensor5 = TensorType('complex64', ((False,) * 5))
ztensor5 = TensorType('complex128', ((False,) * 5))
ftensor5 = TensorType('float32', ((False,) * 5))
dtensor5 = TensorType('float64', ((False,) * 5))
btensor5 = TensorType('int8', ((False,) * 5))
wtensor5 = TensorType('int16', ((False,) * 5))
itensor5 = TensorType('int32', ((False,) * 5))
ltensor5 = TensorType('int64', ((False,) * 5))
def tensor5(name=None, dtype=None):
    """Return a symbolic 5-D variable.

    Parameters
    ----------
    dtype: numeric type
        None means to use theano.config.floatX.
    name
        A name to attach to this variable.

    Returns
    -------
    TensorVariable
        A new 5-D variable; all five dimensions are non-broadcastable.
    """
    if dtype is None:
        dtype = config.floatX
    # Use a non-shadowing name (the original bound the builtin `type`)
    # and the same (False,) * 5 idiom as the module-level tensor5 types.
    tensor5_type = TensorType(dtype, (False,) * 5)
    return tensor5_type(name)
tensor5s, ftensor5s, dtensor5s, itensor5s, ltensor5s = _multi(
tensor5, ftensor5, dtensor5, itensor5, ltensor5)
Tensor = TensorType
......@@ -2270,12 +2298,15 @@ pprint.assign(fill, printing.FunctionPrinter('fill'))
@constructor
def ones_like(model, dtype=None):
def ones_like(model, dtype=None, opt=False):
"""equivalent of numpy.ones_like
Parameters
----------
model : tensor
dtype : data-type, optional
opt : If True, we will return a constant instead of a graph when possible.
Useful for Theano optimization, not for user building a graph as this
have the consequence that model isn't always in the graph.
Returns
-------
......@@ -2284,17 +2315,22 @@ def ones_like(model, dtype=None):
"""
if dtype is None:
dtype = model.type.dtype
ret = fill(model, constant(1.0, dtype=dtype))
return ret
ret = constant(1.0, dtype=dtype)
if opt and ret.type == model.type:
return ret
return fill(model, ret)
@constructor
def zeros_like(model, dtype=None):
def zeros_like(model, dtype=None, opt=False):
"""equivalent of numpy.zeros_like
Parameters
----------
model : tensor
dtype : data-type, optional
opt : If True, we will return a constant instead of a graph when possible.
Useful for Theano optimization, not for user building a graph as this
have the consequence that model isn't always in the graph.
Returns
-------
......@@ -2304,7 +2340,10 @@ def zeros_like(model, dtype=None):
if dtype is None:
dtype = model.type.dtype
return fill(model, constant(0.0, dtype=dtype))
ret = constant(0.0, dtype=dtype)
if opt and ret.type == model.type:
return ret
return fill(model, ret)
def zeros(shape, dtype=None):
......@@ -2780,13 +2819,14 @@ class Alloc(gof.Op):
}
// This function takes care of broadcasting
PyArray_CopyInto(%(zz)s, %(vv)s);
if (PyArray_CopyInto(%(zz)s, %(vv)s) == -1)
%(fail)s
""" % dict(vv=vv, ndim=ndim, zz=zz, fail=fail)
return code
def c_code_cache_version(self):
return (1,)
return (2,)
def infer_shape(self, node, input_shapes):
return [node.inputs[1:]]
......@@ -3135,7 +3175,7 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
@constructor
def var(input, axis=None, keepdims=False):
def var(input, axis=None, ddof=0, keepdims=False):
"""
Computes the variance along the given axis(es) of a tensor `input`.
......@@ -3144,6 +3184,8 @@ def var(input, axis=None, keepdims=False):
axis: None or int or (list of int) (see `Sum`)
Compute the variance along this axis of the tensor.
None means all axes (like numpy).
ddof: Degrees of freedom; 0 would compute the ML estimate, 1 would compute
the unbiased estimate.
keepdims : bool
If this is set to True, the axes which are reduced are
left in the result as dimensions with size one. With this option,
......@@ -3158,6 +3200,9 @@ def var(input, axis=None, keepdims=False):
"""
if isinstance(ddof, (bool)):
raise ValueError('Parameter keepdims is now at index 3: (input, axis=None, ddof=0, keepdims=False)')
input_ndim = input.type.ndim
if axis is None:
axis = list(range(input_ndim))
......@@ -3175,13 +3220,19 @@ def var(input, axis=None, keepdims=False):
centered_input = input - mean_input
# return the mean sqr
v = mean((centered_input ** 2), axis, keepdims=keepdims)
if ddof == 0:
v = mean((centered_input ** 2), axis, keepdims=keepdims)
else:
shp = shape(input) - ddof
v = sum((centered_input ** 2), axis=axis, keepdims=keepdims)
for i in axis:
v = true_div(v, shp[i])
v.name = 'var'
return v
@constructor
def std(input, axis=None, keepdims=False):
def std(input, axis=None, ddof=0, keepdims=False):
"""
Computes the standard deviation along the given axis(es) of a tensor `input`.
......@@ -3205,7 +3256,10 @@ def std(input, axis=None, keepdims=False):
"""
ret = sqrt(var(input=input, axis=axis, keepdims=keepdims))
if isinstance(ddof, (bool)):
raise ValueError('Parameter keepdims is now at index 3: (input, axis=None, ddof=0, keepdims=False)')
ret = sqrt(var(input=input, axis=axis, ddof=ddof, keepdims=keepdims))
ret.name = 'std'
return ret
......@@ -4047,6 +4101,11 @@ def roll(x, shift, axis=None):
else:
axis = 0
# Shift may be larger than the size of the axis. If so, since the
# roll operation is cyclic, we can take the shift modulo the size
# of the axis
shift = shift % x.shape[axis]
# A slice of all elements in a dimension ':'
allslice = slice(None)
# List of slices describing the front half [:, :, shift:, :]
......@@ -4381,7 +4440,7 @@ class Reshape(Op):
def __init__(self, ndim, name=None):
self.ndim = ndim
self.name = name
assert name is None, 'name attribute for Reshape has been deprecated'
def __str__(self):
return '%s{%s}' % (self.__class__.__name__, self.ndim)
......@@ -4557,7 +4616,7 @@ class Reshape(Op):
return Op.c_code(self, node, name, inputs, outputs, sub)
def reshape(x, newshape, ndim=None, name=None):
def reshape(x, newshape, ndim=None):
if ndim is None:
newshape = as_tensor_variable(newshape)
if newshape.ndim != 1:
......@@ -4573,7 +4632,7 @@ def reshape(x, newshape, ndim=None, name=None):
"to know what the number of dimensions of the reshaped "
"variable will be. You can provide the 'ndim' keyword "
"argument to 'reshape' to avoid this problem." % newshape)
op = Reshape(ndim, name)
op = Reshape(ndim)
rval = op(x, newshape)
return rval
......
Diff collapsed.
Diff collapsed.
Diff collapsed.
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this comment first!
Register or sign in to comment