Updated implementation, merge conflicts, added test

a77a0ef4 · Kumar Krishna Agrawal · bebe79eb · dd9adf80 · a77a0ef4 · a77a0ef4
--- a/.jenkins/jenkins_pretest.sh
+++ b/.jenkins/jenkins_pretest.sh
+#!/bin/bash
+
+# Script for Jenkins continuous integration pre-testing
+
+# Print commands as they are executed
+set -x
+
+# Anaconda python
+export PATH=/usr/local/miniconda2/bin:$PATH
+
+# Test flake8
+echo "===== Testing flake8"
+bin/theano-nose theano/tests/test_flake8.py || exit 1
+
+# Test documentation
+echo "===== Testing documentation build"
+python doc/scripts/docgen.py --nopdf --check || exit 1
+echo "===== Testing documentation code snippets"
+python doc/scripts/docgen.py --test --check || exit 1
--- a/.jenkins/jenkins_test1.sh
+++ b/.jenkins/jenkins_test1.sh
+#!/bin/bash
+
+# Script for Jenkins continuous integration testing of theano base
+
+# Print commands as they are executed
+set -x
+
+# Anaconda python
+export PATH=/usr/local/miniconda2/bin:$PATH
+
+echo "===== Testing theano core"
+
+# Test theano core
+PARTS="theano -e cuda -e gpuarray"
+THEANO_PARAM="${PARTS} --with-timer --timer-top-n 10"
+FLAGS="mode=FAST_RUN,floatX=float32"
+THEANO_FLAGS=${FLAGS} bin/theano-nose ${THEANO_PARAM}
--- a/.jenkins/jenkins_test2.sh
+++ b/.jenkins/jenkins_test2.sh
+#!/bin/bash
+
+# Script for Jenkins continuous integration testing of gpu backends.
+#
+# Runs two test suites in sequence: the old theano.sandbox.cuda backend,
+# then the gpuarray backend against a freshly cloned and built libgpuarray.
+# The first suite never stops the script, so the second one still runs.
+# The script exits non-zero if either suite failed, or as soon as cloning,
+# building or installing libgpuarray fails.
+
+# Print commands as they are executed
+set -x
+
+# Anaconda python
+export PATH="/usr/local/miniconda2/bin:${PATH}"
+
+# CUDA
+export PATH="/usr/local/cuda/bin:${PATH}"
+export LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
+export LIBRARY_PATH="/usr/local/cuda/lib64:${LIBRARY_PATH}"
+
+echo "===== Testing old theano.sandbox.cuda backend"
+
+THEANO_CUDA_TESTS="theano/sandbox/cuda/tests \
+            theano/misc/tests/test_pycuda_example.py \
+            theano/misc/tests/test_pycuda_theano_simple.py \
+            theano/misc/tests/test_pycuda_utils.py \
+            theano/tensor/tests/test_opt.py:TestCompositeCodegen \
+            theano/tensor/tests/test_opt.py:test_shapeoptimizer \
+            theano/tensor/tests/test_opt.py:test_fusion \
+            theano/compile/tests/test_debugmode.py:Test_preallocated_output \
+            theano/sparse/tests/test_basic.py:DotTests \
+            theano/sandbox/tests/test_multinomial.py:test_gpu_opt \
+            theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPU_serial \
+            theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPU_parallel \
+            theano/sandbox/tests/test_rng_mrg.py:test_GPU_nstreams_limit \
+            theano/sandbox/tests/test_rng_mrg.py:test_overflow_gpu_old_backend \
+            theano/scan_module/tests/test_scan.py:T_Scan_Cuda"
+THEANO_PARAM="${THEANO_CUDA_TESTS} --with-timer --timer-top-n 10"
+FLAGS="mode=FAST_RUN,init_gpu_device=gpu,floatX=float32"
+# THEANO_PARAM is deliberately unquoted: it has to word-split into one
+# argument per test path / option.
+THEANO_FLAGS=${FLAGS} bin/theano-nose ${THEANO_PARAM}
+# Remember the result instead of exiting, so the gpuarray suite still runs
+# and a failure here is still reported at the end of the script.
+CUDA_STATUS=$?
+
+echo "===== Testing gpuarray backend"
+
+GPUARRAY_CONFIG="Release"
+DEVICE=cuda0
+LIBDIR=~/tmp/local
+
+# Make fresh clones of libgpuarray (with no history since we don't need it)
+rm -rf libgpuarray
+git clone --depth 1 "https://github.com/Theano/libgpuarray.git" || exit 1
+
+# Clean up previous installs (to make sure no old files are left).
+# ${LIBDIR:?} aborts instead of expanding to nothing if LIBDIR is ever empty.
+rm -rf "${LIBDIR:?}"
+# -p also creates missing parent directories (e.g. ~/tmp on a new build node)
+mkdir -p "${LIBDIR}" || exit 1
+
+# Build libgpuarray; stop here on failure rather than testing without it
+mkdir libgpuarray/build
+(cd libgpuarray/build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX="${LIBDIR}" && make) || exit 1
+
+# Finally install
+(cd libgpuarray/build && make install) || exit 1
+
+# Export paths (both lib64 and lib are added to the search paths)
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${LIBDIR}/lib64/"
+export LIBRARY_PATH="${LIBRARY_PATH}:${LIBDIR}/lib64/"
+export CPATH="${CPATH}:${LIBDIR}/include"
+export LIBRARY_PATH="${LIBRARY_PATH}:${LIBDIR}/lib"
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${LIBDIR}/lib"
+
+# Build the pygpu modules
+(cd libgpuarray && python setup.py build_ext --inplace -I"${LIBDIR}/include" -L"${LIBDIR}/lib") || exit 1
+ls "${LIBDIR}"
+# -p: ${LIBDIR}/lib itself may be missing if the install only filled lib64
+mkdir -p "${LIBDIR}/lib/python" || exit 1
+export PYTHONPATH="${PYTHONPATH}:${LIBDIR}/lib/python"
+# Then install
+(cd libgpuarray && python setup.py install --home="${LIBDIR}") || exit 1
+
+# Testing theano (the gpuarray parts)
+THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
+                       theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_serial \
+                       theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_parallel \
+                       theano/scan_module/tests/test_scan.py:T_Scan_Gpuarray"
+FLAGS="init_gpu_device=$DEVICE,gpuarray.preallocate=1000,mode=FAST_RUN"
+# THEANO_GPUARRAY_TESTS is deliberately unquoted so it word-splits.
+THEANO_FLAGS=${FLAGS} time nosetests -v ${THEANO_GPUARRAY_TESTS}
+GPUARRAY_STATUS=$?
+
+# Fail the job if either test suite failed
+if [ "${CUDA_STATUS}" -ne 0 ] || [ "${GPUARRAY_STATUS}" -ne 0 ]; then
+    exit 1
+fi
+exit 0
--- a/.travis.yml
+++ b/.travis.yml
@@ -33,7 +33,8 @@ install:
  - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then conda create --yes -q -n pyenv mkl python=2.6 numpy=1.7.1 scipy=0.11 nose=1.3.0 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx; fi
  - if [[ $TRAVIS_PYTHON_VERSION == '3.3' ]]; then conda create --yes -q -n pyenv mkl python=3.3 numpy=1.9.1 scipy=0.14.0 nose=1.3.4 pip flake8=2.3 six=1.9.0 pep8=1.6.2 pyflakes=0.8.1 sphinx; fi
  - source activate pyenv
-  - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install pydot; else pip install pydot-ng; fi
+# pydot 1.2 broke support of python 2.6. They won't try to maintain it.
+  - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install pydot==1.1.0; else pip install pydot-ng; fi
  - pip install . --no-deps
  - pip install flake8-future-import nose-parameterized==0.5.0


--- a/doc/.static/version_switch.js
+++ b/doc/.static/version_switch.js
@@ -30,10 +30,14 @@
  function build_vswitch() {
  // Build HTML string for version selector, based on ReadTheDocs theme's versions.html

+    var vlabel = current_version.replace("theano_versions/", "");
+    if (vlabel == 'theano') {
+      vlabel = 'release';
+    }
    var vswitch = ['<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions" align=left>'];
    vswitch.push('<span class="rst-current-version" data-toggle="rst-current-version">');
    vswitch.push('<span class="fa fa-book"></span>');
-    vswitch.push('v: ', current_version.replace("theano_versions/", ""), ' ');
+    vswitch.push('v: ', vlabel, ' ');
    vswitch.push('<span class="fa fa-caret-down"></span>');   
    vswitch.push('</span>');
    

--- a/doc/css.inc
+++ b/doc/css.inc
+.. _css:
+
+.. raw:: html
+
+    <style> .black {color:black} </style>
+    <style> .blue {color:blue} </style>
+    <style> .red {color:red} </style>
+    <style> .green {color:green} </style>
+    <style> .pink {color:pink} </style>
+.. role:: blue
+.. role:: red
+.. role:: green
+.. role:: pink
+.. role:: black
--- a/doc/dev_start_guide.txt
+++ b/doc/dev_start_guide.txt
@@ -10,12 +10,15 @@ Contributing
 You want to contribute to Theano? That is great! This page explain our
 workflow and some resource for doing so.

-Looking for an idea for a first contribution? Check `github issue
-<https://github.com/Theano/Theano/issues?q=is%3Aopen+is%3Aissue+label%3A%22Easy+fix%22>`
+Looking for an idea for a first contribution? Check the `github issues
+<https://github.com/Theano/Theano/issues?q=is%3Aopen+is%3Aissue+label%3A%22Easy+fix%22>`_
 with a label ``easy fix``. They are good starter. It is recommanded
 that you write on the issue you want to work on it. This help make
 sure it is up to date and see if nobody else is working on it. Also,
-we can sometimes provides more information about it.
+we can sometimes provides more information about it.  There is also
+the label `NeedSomeoneToFinish
+<https://github.com/Theano/Theano/labels/NeedSomeoneToFinish>`_ that is
+interesting to check. The difficulty level is variable.

 Resources
 =========
@@ -82,8 +85,8 @@ make sure there are no global impacts.
 Also, if you are changing GPU code, travis doesn't test that, because
 there are no GPUs on the test nodes.

-To run the test suite with the default options, you can follow the
-instructions of :ref:`testing_installation`.
+To run the test suite with the default options, see 
+:ref:`test_theano`.

 Each night we execute all the unit tests automatically, with several
 sets of options. The result is sent by email to the `theano-buildbot`_
@@ -123,7 +126,11 @@ To setup VIM:

 #. Install flake8 (if not already installed) with::

-    pip install flake8
+    pip install "flake8<3"
+
+   .. warning:: Starting with version 3.0.0, flake8 changed its dependencies and
+      moved its Python API to a legacy module, breaking Theano's flake8 tests.
+      We recommend using a version prior to 3.

   .. note:: You can use ``easy_install`` instead of ``pip``, and ``pep8``
      instead of ``flake8`` if you prefer. The important thing is that the
@@ -357,7 +364,7 @@ You can choose another name than "central" to reference Theano/Theano
 to "central."

 You can then test your installation of Theano by following the steps of
-:ref:`testing_installation`.
+:ref:`test_theano`.


 Using your local copy

--- a/doc/faq.txt
+++ b/doc/faq.txt
@@ -10,21 +10,6 @@ Does Theano support Python 3?
 ------------------------------
 We support both Python 2 >= 2.6 and Python 3 >= 3.3.

-TypeError: object of type 'TensorVariable' has no len()
-------------------------------------------------------
-
-If you receive the following error, it is because the Python function *__len__* cannot
-be implemented on Theano variables:
-
-.. code-block:: python
-
-   TypeError: object of type 'TensorVariable' has no len()
-
-Python requires that *__len__* returns an integer, yet it cannot be done as Theano's variables are symbolic. However, `var.shape[0]` can be used as a workaround.
-
-This error message cannot be made more explicit because the relevant aspects of Python's
-internals cannot be modified.
-
 Output slight numerical difference
 ----------------------------------

@@ -39,7 +24,6 @@ Every Computer Scientist Should Know About Floating-Point Arithmetic
 <https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html>`_.


-
 Faster gcc optimization
 -----------------------

@@ -179,33 +163,6 @@ but requires that all nodes in the graph have a C implementation:
    f(10.)


-Out of memory... but not really
-------------------------------
-
-Occasionally Theano may fail to allocate memory when there appears to be more
-than enough reporting:
-
-    Error allocating X bytes of device memory (out of memory). Driver report Y
-    bytes free and Z total.
-
-where X is far less than Y and Z (i.e. X << Y < Z).
-
-This scenario arises when an operation requires allocation of a large contiguous
-block of memory but no blocks of sufficient size are available.
-
-GPUs do not have virtual memory and as such all allocations must be assigned to
-a continuous memory region. CPUs do not have this limitation because or their
-support for virtual memory. Multiple allocations on a GPU can result in memory
-fragmentation which can makes it more difficult to find contiguous regions
-of memory of sufficient size during subsequent memory allocations.
-
-A known example is related to writing data to shared variables. When updating a
-shared variable Theano will allocate new space if the size of the data does not
-match the size of the space already assigned to the variable. This can lead to
-memory fragmentation which means that a continugous block of memory of
-sufficient capacity may not be available even if the free memory overall is
-large enough.
-
 Related Projects
 ----------------

@@ -226,55 +183,3 @@ Here is a list of some of the known limitations:
  interact with the rest of the graph).

 - Neither *goto* nor *recursion* is supported or planned within expression graphs.
-
-
-"float32 / int{32, 64} gives float64"
-------------------------------------
-
-It should be noted that using float32 and int{32, 64} together
-inside a function would provide float64 as output.
-
-Since the GPU can't compute this kind of output, it would be
-preferable not to use those dtypes together.
-
-To help you find where float64 are created, see the
-:attr:`warn_float64` Theano flag.
-
-Theano memory/speed trade-off
-----------------------------
-
-There is a few things you can easily do to change the trade-off
-between speed and memory usage. It nothing is said, this affect the
-CPU and GPU memory usage.
-
-Could speed up and lower memory usage:
-
- :ref:`cuDNN <libdoc_cuda_dnn>` default cuDNN convolution use less
-   memory then Theano version. But some flags allow it to use more
-   memory. GPU only.
- Shortly avail, multi-GPU.
-
-Could raise memory usage but speed up computation:
-
- :attr:`config.lib.cnmem` =1  # Do not raise much memory usage, but if you are at the limit of GPU memory available. GPU only.
- :attr:`config.allow_gc` =False
- :attr:`config.optimizer_excluding` =low_memory , GPU only for now.
-
-Could lower the memory usage, but raise computation time:
-
- :attr:`config.scan.allow_gc` =True # Probably not significant slowdown if config.lib.cnmem is used.
- :attr:`config.scan.allow_output_prealloc` =False
- Use :func:`batch_normalization()
-  <theano.tensor.nnet.bn.batch_normalization>`. It use less memory
-  then building a corresponding Theano graph.
- Disable one or scan more optimizations:
-    - ``optimizer_excluding=scanOp_pushout_seqs_ops``
-    - ``optimizer_excluding=scan_pushout_dot1``
-    - ``optimizer_excluding=scanOp_pushout_output``
- Disable all optimization tagged as raising memory usage:
-  ``optimizer_excluding=more_mem`` (currently only the 3 scan optimizations above)
- `float16 <https://github.com/Theano/Theano/issues/2908>`_.
-
-If you want to analyze the memory usage during computation, the
-simplest is to let the memory error happen during Theano execution and
-use the Theano flags :attr:`exception_verbosity=high`.
--- a/doc/generate_dtype_tensor_table.py
+++ b/doc/generate_dtype_tensor_table.py
@@ -17,9 +17,10 @@ shapes = [
        ('col', (False, True)),
        ('matrix', (False,False)),
        ('tensor3', (False,False,False)),
-        ('tensor4', (False,False,False,False)),]
+        ('tensor4', (False,False,False,False)),
+        ('tensor5', (False,False,False,False,False)),]

-hdr = '============ =========== ==== =========== ================================='
+hdr = '============ =========== ==== ============ ==================================='
 print(hdr)
 print('Constructor  dtype       ndim shape        broadcastable')
 print(hdr)
@@ -27,7 +28,7 @@ for letter in letters:
    for shape in shapes:
        suff = ',)' if len(shape[1])==1 else ')'
        s = '(' + ','.join('1' if b else '?' for b in shape[1]) + suff
-        print('%s%-10s  %-10s  %-4s %-10s  %-20s' %(
+        print('%s%-10s  %-10s  %-4s %-11s  %-20s' %(
                letter[0], shape[0], letter[1], len(shape[1]), s, shape[1]
                ))
 print(hdr)
--- a/doc/index.txt
+++ b/doc/index.txt
@@ -125,6 +125,7 @@ Roughly in order of what you'll want to check out:
 * :ref:`install` -- How to install Theano.
 * :ref:`introduction` -- What is Theano?
 * :ref:`tutorial` -- Learn the basics.
+* :ref:`troubleshooting` -- Tips and tricks for common debugging.
 * :ref:`libdoc` -- Theano's functionality, module by module.
 * :ref:`faq` -- A set of commonly asked questions.
 * :ref:`optimizations` -- Guide to Theano's graph optimizations.
@@ -237,12 +238,15 @@ StackOverflow, follow their guidance for `answering questions <http://stackoverf

   NEWS
   introduction
+   requirements
   install
+   updating
   tutorial/index
   extending/index
   dev_start_guide
   optimizations
   library/index
+   troubleshooting
   glossary
   links
   internal/index

--- a/doc/install.txt
+++ b/doc/install.txt
--- a/doc/install_centos6.txt
+++ b/doc/install_centos6.txt
-:orphan:
+.. include:: css.inc

 .. _install_centos6:
 

-Easy Installation of an optimized Theano on CentOS 6
-====================================================
+CentOS 6 Installation Instructions
+##################################

-.. note::
+.. warning::
+    If you want to install the bleeding-edge or development version of Theano
+    from GitHub, please make sure you are reading `the latest version of this
+    page <http://deeplearning.net/software/theano_versions/dev/install_centos6.html>`_.

-    It is possible to have a faster installation of Theano than the one these
-    instructions will provide, but this will make the installation more
-    complicated and/or may require that you buy software. This is a simple set
-    of installation instructions that will leave you with a relatively
-    well-optimized version that uses only free software. With more work or by
-    investing money (i.e. buying a license to a proprietary BLAS
-    implementation), it is possible to gain further performance.
+.. include:: requirements.txt

-.. note::
+.. include:: install_generic.inc
+    :start-line: 5

-   If you are behind a proxy, you must do some extra configuration steps
-   before starting the installation. You must set the environment
-   variable ``http_proxy`` to the proxy address. Using bash this is
-   accomplished with the command
-   ``export http_proxy="http://user:pass@my.site:port/"``
-   You can also provide the ``--proxy=[user:pass@]url:port`` parameter
-   to pip. The ``[user:pass@]`` portion is optional.
-
-.. note::
-
-   We use ``pip`` for 2 reasons. First, it allows "``import module;
-   module.test()``" to work correctly. Second, the installation of NumPy
-   1.6 or 1.6.1 with ``easy_install`` raises an ImportError at the end of
-   the installation. To my knowledge we can ignore this error, but
-   this is not completely safe. ``easy_install`` with NumPy 1.5.1 does not
-   raise this error.
-
-
-
-Installation steps
-~~~~~~~~~~~~~~~~~~
-
- 1) ``sudo yum install python-devel python-nose python-setuptools gcc
-    gcc-gfortran gcc-c++ blas-devel lapack-devel atlas-devel``
- 2) ``sudo easy_install pip``
- 3) ``sudo pip install numpy==1.6.1``
- 4) ``sudo pip install scipy==0.10.1``
- 5) ``sudo pip install Theano``
-
-
-Test the newly installed packages
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- 1) NumPy (~30s): ``python -c "import numpy; numpy.test()"``
- 2) SciPy (~1m): ``python -c "import scipy; scipy.test()"``
- 3) Theano (~30m): ``python -c "import theano; theano.test()"``
-
-
-Speed test Theano/BLAS
-~~~~~~~~~~~~~~~~~~~~~~
-
-It is recommended to test your Theano/BLAS integration. There are many versions
-of BLAS that exist and there can be up to 10x speed difference between them.
-Also, having Theano link directly against BLAS instead of using NumPy/SciPy as
-an intermediate layer reduces the computational overhead. This is
-important for BLAS calls to ``ger``, ``gemv`` and small ``gemm`` operations
-(automatically called when needed when you use ``dot()``). To run the
-Theano/BLAS speed test:
-
-.. code-block:: bash
-
-    python /usr/lib/python2.*/site-packages/theano/misc/check_blas.py
-
-This will print a table with different versions of BLAS/numbers of
-threads on multiple CPUs and GPUs. It will also print some Theano/NumPy
-configuration information. Then, it will print the running time of the same
-benchmarks for your installation. Try to find a CPU similar to yours in
-the table, and check that the single-threaded timings are roughly the same.
-
-
-Updating Theano
-~~~~~~~~~~~~~~~
-
-If you followed these installation instructions, you can execute this command
-to update only Theano:
+Requirements through System Packages (not recommended)
+------------------------------------------------------

 .. code-block:: bash

-    sudo pip install --upgrade --no-deps theano
-
-
-If you want to also update NumPy/SciPy, you can run this:
-
-.. code-block:: bash
-
-    sudo pip install --upgrade theano
-
-Bleeding edge
-~~~~~~~~~~~~~
-
-Do like in the section "Updating Theano", but use
-``git+git://github.com/Theano/Theano.git`` instead of ``theano``.
+    sudo yum install python-devel python-nose python-setuptools gcc gcc-gfortran gcc-c++ blas-devel lapack-devel atlas-devel
+    sudo easy_install pip
--- a/doc/install_generic.inc
+++ b/doc/install_generic.inc
+.. include:: css.inc
+
+.. _install_generic:
+
+
+Installation
+============
+
+Stable Installation
+-------------------
+
+Install the latest stable version of Theano with:
+
+.. raw:: html
+
+    <div class="highlight"><pre><span class="red">&lt;sudo&gt;</span> pip install <span class="blue">&lt;--user&gt;</span> Theano[test, doc]</pre></div>
+
+- Any argument between <...> is optional.
+
+- Use :red:`sudo` for a root installation.
+
+- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
+
+- [test] will install the requirements for testing.
+
+- [doc] will install the requirements in order to generate the documentation.
+
+If you encounter any trouble, head to the :ref:`troubleshooting` page.
+
+libgpuarray
+^^^^^^^^^^^
+
+For the stable version of Theano you need a specific version of libgpuarray, 
+that has been tagged ``v-9998``.
+Download it with:
+
+.. raw:: html
+
+    <div class='highlight'><pre>
+    git clone https://github.com/Theano/libgpuarray.git --tags
+    cd libgpuarray
+    git checkout tags/v-9998
+    </pre></div>
+
+and then follow the `Step-by-step instructions <http://deeplearning.net/software/libgpuarray/installation.html#step-by-step-install>`__.
+
+
+Bleeding-Edge Installation (recommended)
+----------------------------------------
+
+Install the latest, bleeding-edge, development version of Theano with:
+
+.. raw:: html
+
+    <div class='highlight'><pre><span class="red">&lt;sudo&gt;</span> pip install <span class="blue">&lt;--user&gt;</span> <span class="pink">&lt;--no-deps&gt;</span> git+https://github.com/Theano/Theano.git#egg=Theano</pre></div>
+
+- Any argument between <...> is optional.
+
+- Use :red:`sudo` for a root installation.
+
+- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
+
+- Use :pink:`no-deps` when you don't want the dependencies of Theano to be installed through pip. This is important when they have already been installed as system packages.
+
+If you encounter any trouble, head to the :ref:`troubleshooting` page.
+
+libgpuarray
+^^^^^^^^^^^
+
+Install the latest, development version of libgpuarray following the
+`Step-by-step instructions <http://deeplearning.net/software/libgpuarray/installation.html#step-by-step-install>`__.
+
+
+Developer Installation
+----------------------
+
+Install the developer version of Theano with:
+
+.. raw:: html
+
+    <div class="highlight"><pre>git clone git://github.com/Theano/Theano.git
+    cd Theano
+    <span class="red">&lt;sudo&gt;</span> pip install <span class="blue">&lt;--user&gt;</span> <span class="pink">&lt;--no-deps&gt;</span> <span class="green">-e .</span></pre></div>
+
+- Any argument between <...> is optional.
+
+- Use :red:`sudo` for a root installation.
+
+- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
+
+- Use :pink:`no-deps` when you don't want the dependencies of Theano to be installed through pip. This is important when they have already been installed as system packages.
+
+- :green:`-e` makes your installation *editable*, i.e., it links it to your
+  source directory.
+
+If you encounter any trouble, head to the :ref:`troubleshooting` page.
+
+libgpuarray
+^^^^^^^^^^^
+
+Install the latest, development version of libgpuarray following the
+`Step-by-step instructions <http://deeplearning.net/software/libgpuarray/installation.html#step-by-step-install>`__.
--- a/doc/install_macos.txt
+++ b/doc/install_macos.txt
+.. include:: css.inc
+
+.. _install_macos:
+
+
+Mac OS Installation Instructions
+################################
+
+.. warning::
+    If you want to install the bleeding-edge or development version of Theano
+    from GitHub, please make sure you are reading `the latest version of this
+    page <http://deeplearning.net/software/theano_versions/dev/install_macos.html>`_.
+
+There are various ways to install Theano dependencies on a Mac. Here
+we describe the process in detail with Anaconda, Homebrew or MacPorts 
+but if you did it differently and it worked, please let us know the 
+details on the `theano-users`_ mailing-list, so that we can add 
+alternative instructions here.
+
+.. _theano-users: http://groups.google.com/group/theano-users?pli=1
+
+.. include:: requirements.txt
+
+.. _gpu_macos:
+
+.. attention::
+
+    For MacOS you should be able to follow the above instructions to
+    set up CUDA, but be aware of the following caveats:
+
+       * If you want to compile the CUDA SDK code, you may need to temporarily
+         revert back to Apple's gcc (``sudo port select gcc``) as their Makefiles
+         are not compatible with MacPorts' gcc.
+
+       * If CUDA seems unable to find a CUDA-capable GPU, you may need to manually
+         toggle your GPU on, which can be done with
+         `gfxCardStatus <http://codykrieger.com/gfxCardStatus>`__.
+
+.. attention::
+
+    Theano officially supports only clang on OS X.  This can be installed
+    by getting XCode from the App Store and running it once to install the
+    command-line tools.
+
+.. include:: install_generic.inc
+    :start-line: 5
+
+Requirements through Homebrew (not recommended)
+-----------------------------------------------
+
+Install python with homebrew:
+
+.. code-block:: bash
+
+    $ brew install python # or python3 if you prefer
+
+This will install pip.  Then use pip to install numpy, scipy:
+
+.. code-block:: bash
+
+    $ pip install numpy scipy
+
+If you want to use openblas instead of Accelerate, you have to install
+numpy and scipy with Homebrew:
+
+.. code-block:: bash
+
+    $ brew tap homebrew/python
+    $ brew install numpy --with-openblas
+    $ brew install scipy --with-openblas
+
+
+Requirements through MacPorts (not recommended)
+-----------------------------------------------
+
+Using `MacPorts <http://www.macports.org/>`__ to install all required
+Theano dependencies is easy, but be aware that it will take a long time
+(a few hours) to build and install everything.
+
+- MacPorts requires installing XCode first (which can be found in the
+  Mac App Store), if you do not have it already.
+  If you can't install it from the App Store, look in your MacOS X installation
+  DVD for an old version. Then update your Mac to update XCode.
+
+- Download and install `MacPorts <http://www.macports.org/>`__, then
+  ensure its package list is up-to-date with ``sudo port selfupdate``.
+
+- Then, in order to install one or more of the required libraries, use
+  ``port install``, e.g. as follows:
+
+    .. code-block:: bash
+
+        $ sudo port install py27-numpy +atlas py27-scipy +atlas py27-pip
+
+  This will install all the required Theano dependencies. gcc will
+  be automatically installed (since it is a SciPy dependency), but be
+  aware that it takes a long time to compile (hours)!
+  Having NumPy and SciPy linked with ATLAS (an optimized BLAS
+  implementation) is not mandatory, but recommended if you care about
+  performance.
+
+- You might have some different versions of gcc, SciPy, NumPy, Python installed
+  on your system, perhaps via Xcode. It is a good idea to use **either** the
+  MacPorts version of everything **or** some other set of compatible versions
+  (e.g. provided by Xcode or Fink). The advantages of MacPorts are the
+  transparency with which everything can be installed and the fact that
+  packages are updated quite frequently. The following steps describe how to
+  make sure you are using the MacPorts version of these packages.
+
+- In order to use the MacPorts version of Python, you will probably
+  need to explicitly select it with ``sudo port select python python27``. The
+  reason this is necessary is because you may have an Apple-provided Python
+  (via, for example, an Xcode installation). After performing this step, you
+  should check that the symbolic link provided by ``which python`` points to
+  the MacPorts python. For instance, on MacOS X Lion with MacPorts 2.0.3,
+  the output of ``which python`` is ``/opt/local/bin/python`` and this symbolic
+  link points to ``/opt/local/bin/python2.7``. When executing ``sudo
+  port select python python27-apple`` (which you should **not** do), the link
+  points to ``/usr/bin/python2.7``.
+
+- Similarly, make sure that you are using the MacPorts-provided gcc:
+  use ``sudo port select gcc`` to see which gcc installs you have on the
+  system. Then execute for instance ``sudo port select gcc mp-gcc44``
+  to create a symlink that points to the correct (MacPorts) gcc (version 4.4
+  in this case).
+
+- At this point, if you have not done so already, it may be a good idea to
+  close and restart your terminal, to make sure all configuration changes
+  are properly taken into account.
+
+- Afterwards, please check that the ``scipy`` module that is imported in
+  Python is the right one (and is a recent one). For instance, ``import
+  scipy`` followed by ``print scipy.__version__`` and ``print scipy.__path__``
+  should result in a version number of at least 0.7.0 and a path that starts
+  with ``/opt/local`` (the path where MacPorts installs its packages). If this
+  is not the case, then you might have some old installation of ``scipy`` in your
+  ``PYTHONPATH`` so you should edit ``PYTHONPATH`` accordingly.
+
+- Please follow the same procedure with ``numpy``.
+
+- This is covered in the MacPorts installation process, but make sure that
+  your ``PATH`` environment variable contains ``/opt/local/bin`` and
+  ``/opt/local/sbin`` before any other paths (to ensure that the Python and
+  gcc binaries that you installed with MacPorts are visible first).
+
+- MacPorts does not automatically create ``nosetests`` and ``pip`` symlinks
+  pointing to the MacPorts version, so you can add them yourself with
+
+    .. code-block:: bash
+
+        $ sudo ln -s /opt/local/bin/nosetests-2.7 /opt/local/bin/nosetests
+        $ sudo ln -s /opt/local/bin/pip-2.7 /opt/local/bin/pip
--- a/doc/install_others.txt
+++ b/doc/install_others.txt
+.. _install_others:
+
+
+Other Platform-specific Installations
+=====================================
+
+.. warning::
+
+    These instructions are not kept up to date.
+
+NVIDIA Jetson TX1 embedded platform
+-----------------------------------
+
+.. code-block:: bash
+
+    sudo apt-get install python-numpy python-scipy python-dev python-pip python-nose g++ libblas-dev git
+    pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git --user  # Need Theano 0.8 or more recent
+
+Gentoo
+------
+
+Brian Vandenberg emailed `installation instructions on Gentoo
+<http://groups.google.com/d/msg/theano-dev/-8WCMn2FMR0/bJPasoZXaqoJ>`_,
+focusing on how to install the appropriate dependencies.
+
+Nicolas Pinto provides `ebuild scripts <https://github.com/npinto/sekyfsr-gentoo-overlay/tree/master/sci-libs/Theano>`_.
+
+Docker images
+-------------
+
+Builds of Theano are available as `Docker <https://www.docker.com/whatisdocker>`_ images:
+`Theano Docker (CPU) <https://hub.docker.com/r/kaixhin/theano/>`_ or `Theano Docker (CUDA) <https://hub.docker.com/r/kaixhin/cuda-theano/>`_.
+These are updated on a weekly basis with bleeding-edge builds of Theano. Examples of running bash in a Docker container
+are as follows:
+
+.. code-block:: bash
+
+    sudo docker run -it kaixhin/theano
+    sudo docker run -it --device /dev/nvidiactl --device /dev/nvidia-uvm --device /dev/nvidia0 kaixhin/cuda-theano:7.0
+
+For a guide to Docker, see the `official docs <https://docs.docker.com/userguide/>`_. For more details on how to use the
+Theano Docker images, including requirements for CUDA support, consult the `source project <https://github.com/Kaixhin/dockerfiles>`_.
--- a/doc/install_ubuntu.txt
+++ b/doc/install_ubuntu.txt
--- a/doc/install_windows.txt
+++ b/doc/install_windows.txt
-:orphan:
-
 .. _install_windows:

+Windows Installation Instructions
+=================================
+
+.. warning::
+    If you want to install the bleeding-edge or development version of Theano
+    from GitHub, please make sure you are reading `the latest version of this
+    page <http://deeplearning.net/software/theano_versions/dev/install_windows.html>`_.
+
+.. warning::

-Installation of Theano on Windows
-==================================
+    Theano is mainly developed and tested on Linux Machines.

 These instructions show step-by-step how to install Theano and
 required dependencies on a 32- or 64-bit system using freely available
@@ -26,6 +32,8 @@ C/C++ (for Python 2.7 family this has to be Microsoft Visual Studio
 version supporting Visual Studio 2008), and GCC (for non-CUDA C code
 generated by Theano).

+.. _gpu_windows:
+
 Visual Studio and CUDA
 ######################

@@ -37,7 +45,6 @@ Studio installation to proceed). Afterwards, the Visual Studio 2010
 can be safely removed. If someone knows how to install CUDA 5.5
 without a proper Visual Studio installation, please let us know.

-
 First we need to install Microsoft Visual Studio 2010 Express, which
 is required to install CUDA. You can download it from
 `Visual Studio Express
@@ -79,7 +86,7 @@ The package will be installed to ``C:\Program Files
 (x86)\Common Files\Microsoft\Visual C++ for Python\9.0``.

 Finally download the ``stdint.h`` header from
-`here <http://msinttypes.googlecode.com/svn/trunk/stdint.h>`_ and save it as
+`here <https://sourceforge.net/p/mspgcc/msp430-libc/ci/master/tree/include/stdint.h>`_ and save it as
 ``C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for
 Python\9.0\VC\include\stdint.h``.

@@ -619,6 +626,3 @@ follows:
   dependencies. In the case where it is a dependency, you can use the
   `Dependency Walker <http://www.dependencywalker.com/>`__ utility to figure out
   which one.
-
-
-.. _gpu_windows:
--- a/doc/library/compile/debugmode.txt
+++ b/doc/library/compile/debugmode.txt
@@ -5,7 +5,7 @@
 :mod:`debugmode`
 =================

-.. module:: debugmode
+.. module:: theano.compile.debugmode
   :platform: Unix, Windows
   :synopsis: defines DebugMode
 .. moduleauthor:: LISA

--- a/doc/library/compile/function.txt
+++ b/doc/library/compile/function.txt
@@ -5,7 +5,7 @@
 :mod:`function` - defines theano.function
 ===========================================

-.. module:: function
+.. module:: theano.compile.function
   :platform: Unix, Windows
   :synopsis: defines theano.function and related classes
 .. moduleauthor:: LISA

--- a/doc/library/compile/io.txt
+++ b/doc/library/compile/io.txt
@@ -10,7 +10,7 @@
 :mod:`io` - defines theano.function [TODO]
 ===========================================

-.. module:: io
+.. module:: theano.compile.io
   :platform: Unix, Windows
   :synopsis: defines In and Out
 .. moduleauthor:: LISA

--- a/doc/library/compile/mode.txt
+++ b/doc/library/compile/mode.txt
@@ -5,7 +5,7 @@
 :mod:`mode` -- controlling compilation
 ======================================

-.. module:: mode
+.. module:: theano.compile.mode
   :platform: Unix, Windows
   :synopsis: controlling compilation
 .. moduleauthor:: LISA

--- a/doc/library/compile/nanguardmode.txt
+++ b/doc/library/compile/nanguardmode.txt
@@ -5,7 +5,7 @@
 :mod:`nanguardmode`
 ===================

-.. module:: nanguardmode
+.. module:: theano.compile.nanguardmode
   :platform: Unix, Windows
   :synopsis: defines NanGuardMode
 .. moduleauthor:: LISA

--- a/doc/library/compile/profilemode.txt
+++ b/doc/library/compile/profilemode.txt
@@ -6,7 +6,7 @@
 ================================================


-.. module:: profilemode
+.. module:: theano.compile.profilemode
   :platform: Unix, Windows
   :synopsis: profiling Theano functions with ProfileMode
 .. moduleauthor:: LISA

--- a/doc/library/config.txt
+++ b/doc/library/config.txt
@@ -487,6 +487,21 @@ import theano and print the config variable, as in:
        automatically to get more memory. But this can cause
        fragmentation, see note above.

+.. attribute:: config.gpuarray.sched
+
+    String value: ``'default'``, ``'multi'``, ``'single'``
+
+    Default: ``'default'``
+
+    Control the stream mode of contexts.
+
+    This is the sched parameter passed to pygpu for context creation.  With
+    CUDA, using "multi" means using the parameter
+    cudaDeviceScheduleYield. This is useful to lower the CPU overhead
+    when waiting for the GPU. One user found that it sped up their other
+    processes, which were doing data augmentation.
+
+
 .. attribute:: config.gpuarray.single_stream

    Boolean value

--- a/doc/library/gof/fgraph.txt
+++ b/doc/library/gof/fgraph.txt
@@ -2,26 +2,40 @@
 .. _libdoc_gof_fgraph:

 ================================================
-:mod:`fgraph` -- Graph Container [doc TODO]
+:mod:`fg` -- Graph Container [doc TODO]
 ================================================

-.. module:: fgraph
+.. module:: theano.gof.fg
   :platform: Unix, Windows
   :synopsis: Theano Internals
 .. moduleauthor:: LISA


-Guide
-=====
+.. _fgraph:

 FunctionGraph
 -------------

+.. autoclass:: theano.gof.FunctionGraph
+    :members: 
+
+    ***TODO***
+
+    .. note:: FunctionGraph(inputs, outputs) clones the inputs by
+        default. To avoid this behavior, add the parameter
+        clone=False. This is needed as we do not want cached constants
+        in fgraph.
+
 .. _libdoc_gof_fgraphfeature:

+.. _fgraphfeature:
+
 FunctionGraph Features
 ----------------------

+.. autoclass:: theano.gof.toolbox.Feature 
+    :members: 
+
 .. _libdoc_gof_fgraphfeaturelist:

 FunctionGraph Feature List
@@ -29,14 +43,3 @@ FunctionGraph Feature List
 * ReplaceValidate
 * DestroyHandler

-Reference
-=========
-
-.. class:: FunctionGraph
-
-    ***TODO***
-
-    .. note:: FunctionGraph(inputs, outputs) clones the inputs by
-        default. To avoid this behavior, add the parameter
-        clone=False. This is needed as we do not want cached constants
-        in fgraph.
--- a/doc/library/gof/graph.txt
+++ b/doc/library/gof/graph.txt
@@ -4,14 +4,12 @@
 :mod:`graph` -- Interface for the Theano graph
 ==============================================

-.. module:: graph
-   :platform: Unix, Windows
-   :synopsis: Interface for types of symbolic variables
-.. moduleauthor:: LISA
-
 ---------
 Reference
 ---------

 .. automodule:: theano.gof.graph
+   :platform: Unix, Windows
+   :synopsis: Interface for types of symbolic variables
   :members:
+.. moduleauthor:: LISA
--- a/doc/library/gof/type.txt
+++ b/doc/library/gof/type.txt
@@ -4,14 +4,12 @@
 :mod:`type` -- Interface for types of variables
 ================================================

-.. module:: type
-   :platform: Unix, Windows
-   :synopsis: Interface for types of symbolic variables
-.. moduleauthor:: LISA
-
 ---------
 Reference
 ---------

 .. automodule:: theano.gof.type
+   :platform: Unix, Windows
+   :synopsis: Interface for types of symbolic variables
   :members:
+.. moduleauthor:: LISA
--- a/doc/library/gof/utils.txt
+++ b/doc/library/gof/utils.txt
@@ -8,14 +8,12 @@

   from theano.gof.utils import *

-.. module:: utils
-   :platform: Unix, Windows
-   :synopsis: Utilities functions operating on the graph
-.. moduleauthor:: LISA
-
 ---------
 Reference
 ---------

 .. automodule:: theano.gof.utils
+   :platform: Unix, Windows
+   :synopsis: Utilities functions operating on the graph
   :members:
+.. moduleauthor:: LISA
--- a/doc/library/gpuarray/dnn.txt
+++ b/doc/library/gpuarray/dnn.txt
@@ -61,7 +61,7 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
     usage
   * ``none`` : use a slower implementation with minimal memory usage
   * ``large`` : use a sometimes faster implementation with large memory usage
-   * ``fft`` : use the Fast Fourrier Transform implementation of convolution
+   * ``fft`` : use the Fast Fourier Transform implementation of convolution
     (very high memory usage)
   * ``guess_once`` : the first time a convolution is executed, the
     implementation to use is chosen according to cuDNN's heuristics and reused
@@ -83,7 +83,7 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
   * ``none`` (default) : use the default non-deterministic convolution
     implementation
   * ``deterministic`` : use a slower but deterministic implementation
-   * ``fft`` : use the Fast Fourrier Transform implementation of convolution
+   * ``fft`` : use the Fast Fourier Transform implementation of convolution
     (very high memory usage)
   * ``guess_once`` : the first time a convolution is executed, the
     implementation to use is chosen according to cuDNN's heuristics and reused

--- a/doc/library/printing.txt
+++ b/doc/library/printing.txt
@@ -4,7 +4,7 @@
 :mod:`printing` -- Graph Printing and Symbolic Print Statement
 ===============================================================

-.. module:: printing
+.. module:: theano.printing
   :platform: Unix, Windows
   :synopsis: Provides the Print Op and graph-printing routines.
 .. moduleauthor:: LISA

--- a/doc/library/sandbox/cuda/dnn.txt
+++ b/doc/library/sandbox/cuda/dnn.txt
@@ -64,9 +64,9 @@ get an error when cuDNN can not be used with them, use this flag:
     usage
   * ``none`` : use a slower implementation with minimal memory usage
   * ``large`` : use a sometimes faster implementation with large memory usage
-   * ``fft`` : use the Fast Fourrier Transform implementation of convolution
+   * ``fft`` : use the Fast Fourier Transform implementation of convolution
     (very high memory usage)
-   * ``fft_tiling`` : use the Fast Fourrier Transform implementation of convolution
+   * ``fft_tiling`` : use the Fast Fourier Transform implementation of convolution
     with tiling (high memory usage, but less then fft)
   * ``guess_once`` : the first time a convolution is executed, the
     implementation to use is chosen according to cuDNN's heuristics and reused
@@ -89,7 +89,7 @@ get an error when cuDNN can not be used with them, use this flag:
   * ``none`` (default) : use the default non-deterministic convolution
     implementation
   * ``deterministic`` : use a slower but deterministic implementation
-   * ``fft`` : use the Fast Fourrier Transform implementation of convolution
+   * ``fft`` : use the Fast Fourier Transform implementation of convolution
     (very high memory usage)
   * ``guess_once`` : the first time a convolution is executed, the
     implementation to use is chosen according to cuDNN's heuristics and reused
@@ -104,7 +104,7 @@ get an error when cuDNN can not be used with them, use this flag:
     implementation selected every time the shapes of the inputs and kernels
     don't match the shapes from the last execution.

-   * (algo_bwd_data only) ``fft_tiling`` : use the Fast Fourrier
+   * (algo_bwd_data only) ``fft_tiling`` : use the Fast Fourier
     Transform implementation of convolution with tiling (high memory
     usage, but less then fft)

@@ -173,3 +173,14 @@ Softmax Ops
 .. automodule:: theano.sandbox.cuda.dnn
   :noindex:
   :members: GpuDnnSoftmax, GpuDnnSoftmaxGrad
+
+
+
+.. _libdoc_cuda_dnn_bn:
+
+Batch Normalization
+===================
+
+.. automodule:: theano.sandbox.cuda.dnn
+   :noindex:
+   :members: dnn_batch_normalization_train, dnn_batch_normalization_test
--- a/doc/library/tensor/basic.txt
+++ b/doc/library/tensor/basic.txt
@@ -85,6 +85,10 @@ floating-point precision.

    Return a Variable for a 4-dimensional ndarray

+.. function:: tensor5(name=None, dtype=config.floatX)
+
+    Return a Variable for a 5-dimensional ndarray
+
 .. #COMMENT
    Each of the types described above can be constructed by two methods:
    a singular version (e.g., :ref:`dmatrix <libdoc_tensor_creation>`)
@@ -112,9 +116,9 @@ They are all callable, and accept an optional ``name`` argument.  So for example
    table generated by
    $ python Theano/doc/generate_dtype_tensor_table.py

-============ =========== ==== =========== =================================
+============ =========== ==== ============ ===================================
 Constructor  dtype       ndim shape        broadcastable
-============ =========== ==== =========== =================================
+============ =========== ==== ============ ===================================
 bscalar      int8        0    ()           ()
 bvector      int8        1    (?,)         (False,)
 brow         int8        2    (1,?)        (True, False)
@@ -122,6 +126,7 @@ bcol         int8        2    (?,1)       (False, True)
 bmatrix      int8        2    (?,?)        (False, False)
 btensor3     int8        3    (?,?,?)      (False, False, False)
 btensor4     int8        4    (?,?,?,?)    (False, False, False, False)
+btensor5     int8        5    (?,?,?,?,?)  (False, False, False, False, False)
 wscalar      int16       0    ()           ()
 wvector      int16       1    (?,)         (False,)
 wrow         int16       2    (1,?)        (True, False)
@@ -129,6 +134,7 @@ wcol         int16       2    (?,1)       (False, True)
 wmatrix      int16       2    (?,?)        (False, False)
 wtensor3     int16       3    (?,?,?)      (False, False, False)
 wtensor4     int16       4    (?,?,?,?)    (False, False, False, False)
+wtensor5     int16       5    (?,?,?,?,?)  (False, False, False, False, False)
 iscalar      int32       0    ()           ()
 ivector      int32       1    (?,)         (False,)
 irow         int32       2    (1,?)        (True, False)
@@ -136,6 +142,7 @@ icol         int32       2    (?,1)       (False, True)
 imatrix      int32       2    (?,?)        (False, False)
 itensor3     int32       3    (?,?,?)      (False, False, False)
 itensor4     int32       4    (?,?,?,?)    (False, False, False, False)
+itensor5     int32       5    (?,?,?,?,?)  (False, False, False, False, False)
 lscalar      int64       0    ()           ()
 lvector      int64       1    (?,)         (False,)
 lrow         int64       2    (1,?)        (True, False)
@@ -143,6 +150,7 @@ lcol         int64       2    (?,1)       (False, True)
 lmatrix      int64       2    (?,?)        (False, False)
 ltensor3     int64       3    (?,?,?)      (False, False, False)
 ltensor4     int64       4    (?,?,?,?)    (False, False, False, False)
+ltensor5     int64       5    (?,?,?,?,?)  (False, False, False, False, False)
 dscalar      float64     0    ()           ()
 dvector      float64     1    (?,)         (False,)
 drow         float64     2    (1,?)        (True, False)
@@ -150,6 +158,7 @@ dcol         float64     2    (?,1)       (False, True)
 dmatrix      float64     2    (?,?)        (False, False)
 dtensor3     float64     3    (?,?,?)      (False, False, False)
 dtensor4     float64     4    (?,?,?,?)    (False, False, False, False)
+dtensor5     float64     5    (?,?,?,?,?)  (False, False, False, False, False)
 fscalar      float32     0    ()           ()
 fvector      float32     1    (?,)         (False,)
 frow         float32     2    (1,?)        (True, False)
@@ -157,6 +166,7 @@ fcol         float32     2    (?,1)       (False, True)
 fmatrix      float32     2    (?,?)        (False, False)
 ftensor3     float32     3    (?,?,?)      (False, False, False)
 ftensor4     float32     4    (?,?,?,?)    (False, False, False, False)
+ftensor5     float32     5    (?,?,?,?,?)  (False, False, False, False, False)
 cscalar      complex64   0    ()           ()
 cvector      complex64   1    (?,)         (False,)
 crow         complex64   2    (1,?)        (True, False)
@@ -164,6 +174,7 @@ ccol         complex64   2    (?,1)       (False, True)
 cmatrix      complex64   2    (?,?)        (False, False)
 ctensor3     complex64   3    (?,?,?)      (False, False, False)
 ctensor4     complex64   4    (?,?,?,?)    (False, False, False, False)
+ctensor5     complex64   5    (?,?,?,?,?)  (False, False, False, False, False)
 zscalar      complex128  0    ()           ()
 zvector      complex128  1    (?,)         (False,)
 zrow         complex128  2    (1,?)        (True, False)
@@ -171,7 +182,8 @@ zcol         complex128  2    (?,1)       (False, True)
 zmatrix      complex128  2    (?,?)        (False, False)
 ztensor3     complex128  3    (?,?,?)      (False, False, False)
 ztensor4     complex128  4    (?,?,?,?)    (False, False, False, False)
-============ =========== ==== =========== =================================
+ztensor5     complex128  5    (?,?,?,?,?)  (False, False, False, False, False)
+============ =========== ==== ============ ===================================

 Plural Constructors
 --------------------------
@@ -220,11 +232,11 @@ If you would like to construct a tensor variable with a non-standard
 broadcasting pattern, or a larger number of dimensions you'll need to create
 your own :class:`TensorType` instance.  You create such an instance by passing
 the dtype and broadcasting pattern to the constructor.  For example, you
-can create your own 5-dimensional tensor type
+can create your own 6-dimensional tensor type

->>> dtensor5 = TensorType('float64', (False,)*5)
->>> x = dtensor5()
->>> z = dtensor5('z')
+>>> dtensor6 = TensorType('float64', (False,)*6)
+>>> x = dtensor6()
+>>> z = dtensor6('z')

 You can also redefine some of the provided types and they will interact
 correctly:
@@ -1095,13 +1107,11 @@ Indexing

 Like NumPy, Theano distinguishes between *basic* and *advanced* indexing.
 Theano fully supports basic indexing
-(see `NumPy's indexing  <http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html>`_).
-
-`Integer advanced indexing
-<http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#integer>`_
-will be supported in 0.6rc4 (or the development version). We do not
-support boolean masks, as Theano does not have a boolean type (we use
-int8 for the output of logic operators).
+(see `NumPy's indexing  <http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html>`_) 
+and `integer advanced indexing
+<http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#integer>`_. We do not
+support boolean masks, as Theano does not have a boolean type (we use int8 for the output of 
+logic operators).

 .. testsetup:: indexing


--- a/doc/library/tensor/nnet/bn.txt
+++ b/doc/library/tensor/nnet/bn.txt
@@ -10,4 +10,7 @@
 .. moduleauthor:: LISA


+.. seealso:: :ref:`cuDNN batch normalization <libdoc_cuda_dnn_bn>`
+    must be added manually.
+
 .. autofunction:: theano.tensor.nnet.bn.batch_normalization
--- a/doc/library/tensor/nnet/index.txt
+++ b/doc/library/tensor/nnet/index.txt
@@ -4,7 +4,7 @@
 :mod:`nnet`  -- Ops related to neural networks
 ==================================================

-.. module:: nnet
+.. module:: theano.tensor.nnet
   :platform: Unix, Windows
   :synopsis: various ops relating to neural networks
 .. moduleauthor:: LISA

--- a/doc/library/tensor/nnet/nnet.txt
+++ b/doc/library/tensor/nnet/nnet.txt
@@ -4,7 +4,7 @@
 :mod:`nnet` -- Ops for neural networks
 ======================================================

-.. module:: tensor.nnet
+.. module:: theano.tensor.nnet.nnet
   :platform: Unix, Windows
   :synopsis: Ops for neural networks
 .. moduleauthor:: LISA

--- a/doc/library/tensor/raw_random.txt
+++ b/doc/library/tensor/raw_random.txt
@@ -5,7 +5,7 @@
 :mod:`raw_random` -- Low-level random numbers
 =============================================

-.. module:: raw_random
+.. module:: theano.tensor.raw_random
   :synopsis: symbolic random variables
 .. moduleauthor:: LISA


--- a/doc/library/tensor/shared_randomstreams.txt
+++ b/doc/library/tensor/shared_randomstreams.txt
@@ -4,7 +4,7 @@
 :mod:`shared_randomstreams` -- Friendly random numbers
 ======================================================

-.. module:: shared_randomstreams
+.. module:: theano.tensor.shared_randomstreams
   :platform: Unix, Windows
   :synopsis: symbolic random variables
 .. moduleauthor:: LISA

--- a/doc/requirements.txt
+++ b/doc/requirements.txt
+Requirements
+============
+
+.. note::
+
+    We only support the installation of the requirements through conda.
+
+.. _BLAS: http://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms
+.. _Python: http://www.python.org/
+
+    Python_ >= 2.6 or >= 3.3
+        The development package (python-dev or python-devel on most Linux distributions) is recommended (see just below). Python 2.4 was supported up to and including the release 0.6. Python 3 is supported past the 3.3 release.
+
+    `NumPy <http://numpy.scipy.org/>`_ >= 1.7.1 < 1.11.1
+        Earlier versions could work, but we don't test them.
+
+    `SciPy <http://scipy.org>`_ >= 0.11 < 0.17.1
+        Only currently required for sparse matrix and special functions support, but highly recommended. SciPy >=0.8 could work, but earlier versions have known bugs with sparse matrices.
+
+    `BLAS`_ installation (with Level 3 functionality)
+        * **Recommended**: MKL, which is free through Conda. 
+        * Alternatively, we suggest installing OpenBLAS, with the development headers (``-dev``, ``-devel``, depending on your Linux distribution).
+
+**Optional requirements**
+
+    ``python-dev``, ``g++`` >= 4.2
+        **Highly recommended.** Theano can fall back on a NumPy-based Python execution model, but a C compiler allows for vastly faster execution.
+
+    `nose <http://nose.readthedocs.io/en/latest/>`_ >= 1.3.0
+        Recommended, to run Theano's test-suite.
+
+    `Sphinx <http://sphinx.pocoo.org/>`_ >= 0.5.1, `pygments <http://pygments.org/>`_
+        For building the documentation. LaTeX_ and dvipng_ are also necessary for math to show up as images.
+
+    `pydot-ng <https://github.com/pydot/pydot-ng>`_
+        To handle large pictures for gif/images.
+
+    `NVIDIA CUDA drivers and SDK`_
+        **Highly recommended.** Required for GPU code generation/execution on NVIDIA GPUs. See the instructions below.
+
+    `libgpuarray`_
+        Required for GPU/CPU code generation on CUDA and OpenCL devices (see :ref:`gpuarray`).
+
+Requirements installation through Conda (recommended)
+-----------------------------------------------------
+
+Install Miniconda
+^^^^^^^^^^^^^^^^^
+
+Follow this `link <http://conda.pydata.org/miniconda.html>`__ to install Miniconda.
+
+.. note::
+
+    If you want fast compiled code (recommended), make sure you have g++ (Windows/Linux) or Clang (OS X) installed.
+
+Install requirements and optional packages
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+    conda install numpy scipy mkl <nose> <sphinx> <pydot-ng>
+
+* Arguments between <...> are optional.
+
+
+Install and configure the GPU drivers (recommended)
+---------------------------------------------------
+
+.. warning::
+
+    OpenCL support is still minimal for now.
+
+1. Install CUDA drivers
+
+    * Follow `this link <https://developer.nvidia.com/cuda-downloads>`__
+      to install the CUDA driver and the CUDA Toolkit.
+    * You must reboot the computer after the driver installation.
+    * Test that it was loaded correctly after the reboot by executing the
+      command ``nvidia-smi`` from the command line.
+
+    .. note::
+
+        Sanity check: The *bin* subfolder should contain an *nvcc*
+        program. This folder is called the *cuda root* directory.
+
+2. Fix 'lib' path
+
+    * Add the 'lib' subdirectory (and/or 'lib64' subdirectory if you have a
+      64-bit OS) to your ``$LD_LIBRARY_PATH`` environment 
+      variable.
+
+3. Set Theano's config flags
+
+    To use the GPU you need to define the *cuda root*. You can do it in one 
+    of the following ways:
+
+    * Define a $CUDA_ROOT environment variable to equal the cuda root directory, as in ``CUDA_ROOT=/path/to/cuda/root``, or
+    * add a ``cuda.root`` flag to :envvar:`THEANO_FLAGS`, as in ``THEANO_FLAGS='cuda.root=/path/to/cuda/root'``, or
+    * add a [cuda] section to your .theanorc file containing the option ``root = /path/to/cuda/root``.
+
+.. _LaTeX: http://www.latex-project.org/
+.. _dvipng: http://savannah.nongnu.org/projects/dvipng/
+.. _NVIDIA CUDA drivers and SDK: http://developer.nvidia.com/object/gpucomputing.html
+.. _libgpuarray: http://deeplearning.net/software/libgpuarray/installation.html
--- a/doc/sandbox/fg.txt
+++ b/doc/sandbox/fg.txt
-
-.. _fgraph:
-
-=============
-FunctionGraph
-=============
-
-TODO: clean up/update the doc/check if complete
-WRITEME
-
-.. autoclass:: theano.gof.fg.FunctionGraph
-
-.. _fgraphfeature:
-
-Feature
-=======
-
-.. autoclass:: theano.gof.toolbox.Feature
-    :members:
--- a/doc/sandbox/index2.txt
+++ b/doc/sandbox/index2.txt
@@ -8,7 +8,6 @@ Advanced Topics (under construction)
 .. toctree::
    :maxdepth: 2

-    fg
    compilation
    ccodegen
    function

--- a/doc/scripts/docgen.py
+++ b/doc/scripts/docgen.py
@@ -54,6 +54,10 @@ if __name__ == '__main__':
    pythonpath = os.pathsep.join([throot, pythonpath])
    sys.path[0:0] = [throot]  # We must not use os.environ.

+    # Make sure we don't use gpu to compile documentation
+    env_th_flags = os.environ.get('THEANO_FLAGS', '')
+    os.environ['THEANO_FLAGS'] = 'device=cpu,force_device=True'
+
    def call_sphinx(builder, workdir):
        import sphinx
        if options['--check']:
@@ -99,3 +103,6 @@ if __name__ == '__main__':

    # To go back to the original current directory.
    os.chdir(currentdir)
+
+    # Reset THEANO_FLAGS
+    os.environ['THEANO_FLAGS'] = env_th_flags
--- a/doc/troubleshooting.txt
+++ b/doc/troubleshooting.txt
--- a/doc/tutorial/adding.txt
+++ b/doc/tutorial/adding.txt
@@ -175,13 +175,13 @@ by :ref:`broadcasting <libdoc_tensor_broadcastable>`.

 The following types are available:

-* **byte**: ``bscalar, bvector, bmatrix, brow, bcol, btensor3, btensor4``
-* **16-bit integers**: ``wscalar, wvector, wmatrix, wrow, wcol, wtensor3, wtensor4``
-* **32-bit integers**: ``iscalar, ivector, imatrix, irow, icol, itensor3, itensor4``
-* **64-bit integers**: ``lscalar, lvector, lmatrix, lrow, lcol, ltensor3, ltensor4``
-* **float**: ``fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4``
-* **double**: ``dscalar, dvector, dmatrix, drow, dcol, dtensor3, dtensor4``
-* **complex**: ``cscalar, cvector, cmatrix, crow, ccol, ctensor3, ctensor4``
+* **byte**: ``bscalar, bvector, bmatrix, brow, bcol, btensor3, btensor4, btensor5``
+* **16-bit integers**: ``wscalar, wvector, wmatrix, wrow, wcol, wtensor3, wtensor4, wtensor5``
+* **32-bit integers**: ``iscalar, ivector, imatrix, irow, icol, itensor3, itensor4, itensor5``
+* **64-bit integers**: ``lscalar, lvector, lmatrix, lrow, lcol, ltensor3, ltensor4, ltensor5``
+* **float**: ``fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4, ftensor5``
+* **double**: ``dscalar, dvector, dmatrix, drow, dcol, dtensor3, dtensor4, dtensor5``
+* **complex**: ``cscalar, cvector, cmatrix, crow, ccol, ctensor3, ctensor4, ctensor5``

 The previous list is not exhaustive and a guide to all types compatible
 with NumPy arrays may be found here: :ref:`tensor creation<libdoc_tensor_creation>`.

--- a/doc/tutorial/conv_arithmetic.txt
+++ b/doc/tutorial/conv_arithmetic.txt
--- a/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides.gif
+++ b/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides.gif
--- a/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides_transposed.gif
+++ b/doc/tutorial/conv_arithmetic_figures/arbitrary_padding_no_strides_transposed.gif
--- a/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides.gif
+++ b/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides.gif
--- a/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides_transposed.gif
+++ b/doc/tutorial/conv_arithmetic_figures/full_padding_no_strides_transposed.gif
--- a/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides.gif
+++ b/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides.gif
--- a/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides_transposed.gif
+++ b/doc/tutorial/conv_arithmetic_figures/no_padding_no_strides_transposed.gif
--- a/doc/tutorial/conv_arithmetic_figures/no_padding_strides.gif
+++ b/doc/tutorial/conv_arithmetic_figures/no_padding_strides.gif
--- a/doc/tutorial/conv_arithmetic_figures/no_padding_strides_transposed.gif
+++ b/doc/tutorial/conv_arithmetic_figures/no_padding_strides_transposed.gif
--- a/doc/tutorial/conv_arithmetic_figures/numerical_no_padding_no_strides.gif
+++ b/doc/tutorial/conv_arithmetic_figures/numerical_no_padding_no_strides.gif
--- a/doc/tutorial/conv_arithmetic_figures/numerical_padding_strides.gif
+++ b/doc/tutorial/conv_arithmetic_figures/numerical_padding_strides.gif
--- a/doc/tutorial/conv_arithmetic_figures/padding_strides.gif
+++ b/doc/tutorial/conv_arithmetic_figures/padding_strides.gif
--- a/doc/tutorial/conv_arithmetic_figures/padding_strides_odd.gif
+++ b/doc/tutorial/conv_arithmetic_figures/padding_strides_odd.gif
--- a/doc/tutorial/conv_arithmetic_figures/padding_strides_odd_transposed.gif
+++ b/doc/tutorial/conv_arithmetic_figures/padding_strides_odd_transposed.gif
--- a/doc/tutorial/conv_arithmetic_figures/padding_strides_transposed.gif
+++ b/doc/tutorial/conv_arithmetic_figures/padding_strides_transposed.gif
--- a/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides.gif
+++ b/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides.gif
--- a/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides_transposed.gif
+++ b/doc/tutorial/conv_arithmetic_figures/same_padding_no_strides_transposed.gif
--- a/doc/tutorial/index.txt
+++ b/doc/tutorial/index.txt
@@ -49,6 +49,7 @@ Advanced
    sparse
    using_gpu
    using_multi_gpu
+    conv_arithmetic

 Advanced configuration and debugging
 ------------------------------------

--- a/doc/tutorial/multi_cores.txt
+++ b/doc/tutorial/multi_cores.txt
@@ -4,6 +4,13 @@
 Multi cores support in Theano
 =============================

+Convolution and Pooling
+=======================
+
+Since Theano 0.9dev2, convolution and pooling are parallelized on the
+CPU.
+
+
 BLAS operation
 ==============


--- a/doc/tutorial/using_gpu.txt
+++ b/doc/tutorial/using_gpu.txt
-
 .. _using_gpu:

 =============
@@ -19,11 +18,33 @@ There are two ways currently to use a gpu, one that should support any OpenCL
 device as well as NVIDIA cards (:ref:`gpuarray`), and the old backend that
 only supports NVIDIA cards (:ref:`cuda`).

+Using the GPU in Theano is as simple as setting the ``device`` configuration
+flag to ``device=cuda`` (or ``device=gpu`` for the old backend). You can optionally target a specific gpu by specifying
+the number of the gpu, e.g. ``device=cuda2``. You also need to set the
+default floating point precision.
+For example: ``THEANO_FLAGS='cuda.root=/path/to/cuda/root,device=cuda,floatX=float32'``.
+You can also set these options in the .theanorc file's ``[global]`` section:
+
+     .. code-block:: cfg
+
+        [global]
+        device = cuda
+        floatX = float32
+
 .. warning::

-  If you want to use the new GpuArray backend, make sure to have the 
-  development version of Theano installed. The 0.8.X releases have not
-  been optimized to work correctly with the new backend.
+  The old CUDA backend will be deprecated soon, in favor of the new libgpuarray
+  backend.
+
+.. note::
+
+    * If your computer has multiple GPUs and you use ``device=cuda``, the driver
+      selects the one to use (usually gpu0).
+    * You can use the program ``nvidia-smi`` to change this policy.
+    * By default, when ``device`` indicates preference for GPU computations,
+      Theano will fall back to the CPU if there is a problem with the GPU.
+      You can use the flag ``force_device=True`` to instead raise an error when
+      Theano cannot use the GPU.

 .. _gpuarray:

@@ -31,19 +52,32 @@ GpuArray Backend
 ----------------

 If you have not done so already, you will need to install libgpuarray
-as well as at least one computing toolkit.  Instructions for doing so
-are provided at `libgpuarray <http://deeplearning.net/software/libgpuarray/installation.html>`_.
+as well as at least one computing toolkit (CUDA or OpenCL). Detailed
+instructions to accomplish that are provided at 
+`libgpuarray <http://deeplearning.net/software/libgpuarray/installation.html>`_.
+
+To install Nvidia's GPU-programming toolchain (CUDA) and configure
+Theano to use it, see the installation instructions for
+:ref:`Linux <gpu_linux>`, :ref:`MacOS <gpu_macos>` and :ref:`Windows <gpu_windows>`.

 While all types of devices are supported if using OpenCL, for the
 remainder of this section, whatever compute device you are using will
 be referred to as GPU.

+.. warning::
+
+  If you want to use the new GpuArray backend, make sure to have the
+  development version of Theano installed. The 0.8.X releases have not
+  been optimized to work correctly with the new backend.
+
 .. warning::

  The backend was designed to support OpenCL, however current support is
  incomplete. A lot of very useful ops still do not support it because they
  were ported from the old backend with minimal change.

+.. _testing_the_gpu:
+
 Testing Theano with GPU
 ~~~~~~~~~~~~~~~~~~~~~~~


--- a/doc/updating.txt
+++ b/doc/updating.txt
+.. include:: css.inc
+
+.. _updating:
+
+Updating Theano
+===============
+
+Follow one of these three sections depending on how you installed Theano.
+
+You should update frequently: bugs are fixed on a very regular basis, and features are
+added even more frequently!
+
+Stable Installation
+-------------------
+
+The following command will update only Theano:
+
+.. raw:: html
+
+    <pre><span class="red">&#60;sudo&#62;</span> pip install <span class="blue">&#60;--user&#62;</span> <span class="pink">&#60;--no-deps&#62;</span> theano</pre>
+
+- Use :red:`sudo` for a root installation.
+
+- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
+
+- Use :pink:`no-deps` when you don't want the dependencies of Theano to be installed through pip. This is important when they have already been installed as system packages.
+
+.. warning::
+
+    If you installed NumPy/SciPy with yum/apt-get, updating NumPy/SciPy
+    with pip/easy_install is not always a good idea. This can make Theano
+    crash due to problems with BLAS. The versions of
+    NumPy/SciPy in the distribution are sometimes linked against faster
+    versions of BLAS. Installing NumPy/SciPy with
+    yum/apt-get/pip/easy_install won't install the development package
+    needed to recompile it with the fast version.
+    To fix a possible crash, you can clear
+    the Theano cache like this:
+
+    .. code-block:: bash
+
+       theano-cache clear
+
+Bleeding-Edge Installation
+--------------------------
+
+The following command will update your bleeding-edge version of Theano:
+
+.. raw:: html
+
+    <div style="width:100%"><pre><span class="red">&#60;sudo&#62;</span> pip install <span class="blue">&#60;--user&#62;</span> <span class="pink">&#60;--no-deps&#62;</span> git+https://github.com/Theano/Theano.git#egg=Theano</pre></div>
+
+- Use :red:`sudo` for a root installation.
+
+- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
+
+- Use :pink:`no-deps` when you don't want the dependencies of Theano to be installed through pip. This is important when they have already been installed as system packages.
+
+.. warning::
+
+    If you installed NumPy/SciPy with yum/apt-get, updating NumPy/SciPy
+    with pip/easy_install is not always a good idea. This can make Theano
+    crash due to problems with BLAS. The versions of
+    NumPy/SciPy in the distribution are sometimes linked against faster
+    versions of BLAS. Installing NumPy/SciPy with
+    yum/apt-get/pip/easy_install won't install the development package
+    needed to recompile it with the fast version.
+    To fix a possible crash, you can clear
+    the Theano cache like this:
+
+    .. code-block:: bash
+
+       theano-cache clear
+
+Developer Installation
+----------------------
+
+To update your library to the latest revision, change directory (``cd``)
+to your ``Theano`` folder and execute the following command:
+
+.. warning::
+
+    The following assumes you have knowledge of git and know how to do a rebase.
+
+.. code-block:: bash
+
+    git pull --rebase
--- a/setup.py
+++ b/setup.py
@@ -54,7 +54,7 @@ PLATFORMS           = ["Windows", "Linux", "Solaris", "Mac OS-X", "Unix"]
 MAJOR               = 0
 MINOR               = 9
 MICRO               = 0
-SUFFIX              = "dev1"  # Should be blank except for rc's, betas, etc.
+SUFFIX              = "dev2"  # Should be blank except for rc's, betas, etc.
 ISRELEASED          = False

 VERSION             = '%d.%d.%d%s' % (MAJOR, MINOR, MICRO, SUFFIX)
@@ -166,7 +166,7 @@ def do_setup():
          install_requires=['numpy>=1.7.1', 'scipy>=0.11', 'six>=1.9.0'],
          # pygments is a dependency for Sphinx code highlight
          extras_require={
-              'test': ['nose>=1.3.0', 'nose-parameterized>=0.5.0'],
+              'test': ['nose>=1.3.0', 'nose-parameterized>=0.5.0', 'flake8<3'],
              'doc': ['Sphinx>=0.5.1', 'pygments']
          },
          package_data={

--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -147,7 +147,7 @@ class BadThunkOutput(DebugModeError):
        print("  thunk2  :", self.thunk2, file=sio)

        # Don't import it at the top of the file to prevent circular import.
-        utt = theano.tests.unittest_tools
+        import theano.tests.unittest_tools as utt
        print(utt.str_diagnostic(self.val1, self.val2, None, None), file=sio)
        ret = sio.getvalue()
        return ret
@@ -1769,12 +1769,13 @@ class _Linker(gof.link.LocalLinker):
        if schedule:
            self.schedule = schedule

-    def accept(self, fgraph, no_recycling=None):
+    def accept(self, fgraph, no_recycling=None, profile=None):
        if no_recycling is None:
            no_recycling = []
        if self.fgraph is not None and self.fgraph is not fgraph:
            assert type(self) is _Linker
-            return type(self)(maker=self.maker).accept(fgraph, no_recycling)
+            return type(self)(maker=self.maker).accept(
+                fgraph, no_recycling, profile)
        self.fgraph = fgraph
        self.no_recycling = no_recycling
        return self

--- a/theano/compile/function_module.py
+++ b/theano/compile/function_module.py
@@ -735,9 +735,13 @@ class Function(object):
        kwargs : dict
            The function inputs can be passed as keyword argument. For this, use
            the name of the input or the input instance as the key.
+
            Keyword argument ``output_subset`` is a list of either indices of the
            function's outputs or the keys belonging to the `output_keys` dict
-            and represent outputs that are requested to be calculated.
+            and represent outputs that are requested to be calculated. Regardless
+            of the presence of ``output_subset``, the updates are always calculated
+            and processed. To disable the updates, you should use the ``copy``
+            method with ``delete_updates=True``.

        Returns
        -------
@@ -1496,9 +1500,10 @@ class FunctionMaker(object):
                     if not spec.borrow]
        if no_borrow:
            self.linker = linker.accept(
-                fgraph, no_recycling=infer_reuse_pattern(fgraph, no_borrow))
+                fgraph, no_recycling=infer_reuse_pattern(fgraph, no_borrow),
+                profile=profile)
        else:
-            self.linker = linker.accept(fgraph)
+            self.linker = linker.accept(fgraph, profile=profile)

        if hasattr(linker, 'accept_var_updates'):
            # hacky thing so VMLinker knows about updates

--- a/theano/compile/nanguardmode.py
+++ b/theano/compile/nanguardmode.py
@@ -41,7 +41,7 @@ def flatten(l):
    return rval


-def contains_nan(arr, node=None):
+def contains_nan(arr, node=None, var=None):
    """
    Test whether a numpy.ndarray contains any `np.nan` values.

@@ -50,6 +50,7 @@ def contains_nan(arr, node=None):
    arr : np.ndarray or output of any Theano op
    node : None or an Apply instance.
        If arr is the output of a Theano op, the node associated to it.
+    var : The Theano symbolic variable.

    Returns
    -------
@@ -68,6 +69,8 @@ def contains_nan(arr, node=None):
        return False
    elif isinstance(arr, np.random.mtrand.RandomState):
        return False
+    elif var and getattr(var.tag, 'is_rng', False):
+        return False
    elif isinstance(arr, slice):
        return False
    elif arr.size == 0:
@@ -86,7 +89,7 @@ def contains_nan(arr, node=None):
    return np.isnan(np.min(arr))


-def contains_inf(arr, node=None):
+def contains_inf(arr, node=None, var=None):
    """
    Test whether a numpy.ndarray contains any `np.inf` values.

@@ -95,6 +98,7 @@ def contains_inf(arr, node=None):
    arr : np.ndarray or output of any Theano op
    node : None or an Apply instance.
        If the output of a Theano op, the node associated to it.
+    var : The Theano symbolic variable.

    Returns
    -------
@@ -114,6 +118,8 @@ def contains_inf(arr, node=None):
        return False
    elif isinstance(arr, np.random.mtrand.RandomState):
        return False
+    elif var and getattr(var.tag, 'is_rng', False):
+        return False
    elif isinstance(arr, slice):
        return False
    elif arr.size == 0:
@@ -215,44 +221,47 @@ class NanGuardMode(Mode):
        assert nan_is_error or inf_is_error or big_is_error
        compile_gpu_func(nan_is_error, inf_is_error, big_is_error)

-        def do_check_on(var, nd):
+        def do_check_on(value, nd, var=None):
            """
-            Checks `var` for NaNs / Infs. If detected, raises an exception
+            Checks `value` for NaNs / Infs. If detected, raises an exception
            and / or prints information about `nd`, `f`, and `is_input` to
            help the user determine the cause of the invalid values.

            Parameters
            ----------
-            var : numpy.ndarray
+            value : numpy.ndarray
                The value to be checked.
            nd : theano.gof.Apply
                The Apply node being executed.
+            var : theano.gof.Variable
+                Not used if nd is there. Otherwise, used to print the stack
+                trace for inputs of the graph.

            """
            error = False
            sio = StringIO()
            if nan_is_error:
-                if contains_nan(var, nd):
+                if contains_nan(value, nd, var):
                    print('NaN detected', file=sio)
                    error = True
            if inf_is_error:
-                if contains_inf(var, nd):
+                if contains_inf(value, nd, var):
                    print('Inf detected', file=sio)
                    error = True
            if big_is_error:
                err = False
-                if isinstance(var, theano.gof.type.CDataType._cdata_type):
+                if isinstance(value, theano.gof.type.CDataType._cdata_type):
                    err = False
-                elif isinstance(var, np.random.mtrand.RandomState):
+                elif isinstance(value, np.random.mtrand.RandomState):
                    err = False
-                elif isinstance(var, slice):
+                elif isinstance(value, slice):
                    err = False
-                elif var.size == 0:
+                elif value.size == 0:
                    err = False
-                elif cuda.cuda_available and isinstance(var, cuda.CudaNdarray):
-                    err = (f_gpuabsmax(var.reshape(var.size)) > 1e10)
+                elif cuda.cuda_available and isinstance(value, cuda.CudaNdarray):
+                    err = (f_gpuabsmax(value.reshape(value.size)) > 1e10)
                else:
-                    err = (np.abs(var).max() > 1e10)
+                    err = (np.abs(value).max() > 1e10)
                if err:
                    print('Big value detected', file=sio)
                    error = True
@@ -264,6 +273,11 @@ class NanGuardMode(Mode):
                else:
                    print("NanGuardMode found an error in an input of the "
                          "graph.", file=sio)
+                # Add the stack trace
+                if nd:
+                    var = nd.outputs[0]
+                print(theano.gof.utils.get_variable_trace_string(var),
+                      file=sio)
                msg = sio.getvalue()
                if config.NanGuardMode.action == 'raise':
                    raise AssertionError(msg)
@@ -281,7 +295,7 @@ class NanGuardMode(Mode):

        def nan_check_input(var, value):
            if getattr(var.tag, 'nan_guard_mode_check', True):
-                do_check_on(value, None)
+                do_check_on(value, None, var=var)

        wrap_linker = theano.gof.vm.VM_Linker(callback=nan_check,
                                              callback_input=nan_check_input)

--- a/theano/compile/ops.py
+++ b/theano/compile/ops.py
@@ -402,6 +402,14 @@ class Shape_i(gof.Op):
    def infer_shape(self, node, input_shapes):
        return [()]

+    def connection_pattern(self, node):
+        # the grad returns the gradient with respect to the
+        # elements of a tensor variable
+        # the elements of the tensor variable do not participate
+        # in the computation of the shape, so they are not really
+        # part of the graph
+        return [[False]]
+
    def grad(self, inp, grads):
        return [theano.gradient.grad_not_implemented(
                op=self, x_pos=0, x=inp[0],
@@ -455,6 +463,14 @@ def shape_i(var, i, fgraph=None):
    return var.shape[i]


+def shape_i_op(i):
+    key = i
+    if key not in shape_i_op.cache:
+        shape_i_op.cache[key] = Shape_i(i)
+    return shape_i_op.cache[key]
+shape_i_op.cache = {}
+
+
 def register_shape_i_c_code(typ, code, check_input, version=()):
    """
    Tell Shape_i how to generate C code for a Theano Type.

--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -54,7 +54,7 @@ def _atexit_print_fn():
        destination_file = open(config.profiling.destination, 'w')

    for ps in _atexit_print_list:
-        if ps.fct_callcount or ps.compile_time > 0:
+        if ps.fct_callcount >= 1 or ps.compile_time > 1:
            ps.summary(file=destination_file,
                       n_ops_to_print=config.profiling.n_ops,
                       n_apply_to_print=config.profiling.n_apply)
@@ -72,7 +72,8 @@ def _atexit_print_fn():
        for ps in to_sum[1:]:
            for attr in ["compile_time", "fct_call_time", "fct_callcount",
                         "vm_call_time", "optimizer_time", "linker_time",
-                         "validate_time", "import_time"]:
+                         "validate_time", "import_time",
+                         "linker_node_make_thunks"]:
                setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr))

            # merge dictonary
@@ -190,6 +191,8 @@ class ProfileStats(object):
    import_time = 0.0
    # time spent in importing compiled python module.

+    linker_node_make_thunks = 0.0
+
    line_width = config.profiling.output_line_width

    nb_nodes = -1
@@ -665,6 +668,8 @@ class ProfileStats(object):
        print('    Theano Linker time (includes C, CUDA code '
              'generation/compiling): %es' % self.linker_time, file=file)
        print('       Import time %es' % self.import_time, file=file)
+        print('       Node make_thunk time %es' % self.linker_node_make_thunks,
+              file=file)
        print('', file=file)

        # The validation time is a subset of optimizer_time

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -242,6 +242,15 @@ AddConfigVar('gpuarray.preallocate',
             FloatParam(0),
             in_c_key=False)

+AddConfigVar('gpuarray.sched',
+             """The sched parameter passed for context creation to pygpu.
+                With CUDA, using "multi" is equivalent to using the parameter
+                cudaDeviceScheduleYield. This is useful to lower the
+                CPU overhead when waiting for GPU. One user found that it
+                speeds up his other processes that was doing data augmentation.
+             """,
+             EnumStr("default", "multi", "single"))
+
 AddConfigVar('gpuarray.single_stream',
             """
             If your computations are mostly lots of small elements,
@@ -345,8 +354,9 @@ AddConfigVar('dnn.conv.algo_bwd_filter',
 AddConfigVar('dnn.conv.precision',
             "Default data precision to use for the computation in cuDNN "
             "convolutions (defaults to the same dtype as the inputs of the "
-             "convolutions).",
-             EnumStr('as_input', 'float16', 'float32', 'float64'),
+             "convolutions, or float32 if inputs are float16).",
+             EnumStr('as_input_f32', 'as_input', 'float16', 'float32',
+                     'float64'),
             in_c_key=False)


@@ -374,7 +384,7 @@ AddConfigVar('dnn.enabled',
             " to not using it if not present."
             " If True and cuDNN can not be used, raise an error."
             " If False, disable cudnn",
-             StrParam("auto", "True", "False"),
+             EnumStr("auto", "True", "False"),
             in_c_key=False)

 # This flag determines whether or not to raise error/warning message if
@@ -1620,6 +1630,8 @@ def short_platform(r=None, p=None):

    return p
 compiledir_format_dict['short_platform'] = short_platform()
+# Make it easy to have one compiledir per device.
+compiledir_format_dict['device'] = config.device
 compiledir_format_keys = ", ".join(sorted(compiledir_format_dict.keys()))
 default_compiledir_format = ("compiledir_%(short_platform)s-%(processor)s-"
                             "%(python_version)s-%(python_bitwidth)s")

--- a/theano/configparser.py
+++ b/theano/configparser.py
@@ -8,6 +8,7 @@ import os
 import shlex
 import sys
 import warnings
+from functools import wraps

 from six import StringIO

@@ -96,6 +97,7 @@ def change_flags(**kwargs):
    Useful during tests.
    """
    def change_flags_exec(f):
+        @wraps(f)
        def inner(*args, **kwargs_):
            old_val = {}
            for k in kwargs:
@@ -117,9 +119,6 @@ def change_flags(**kwargs):
                    assert len(l) == 1
                    l[0].__set__(None, old_val[k])

-        # Make sure that the name of the decorated function remains the same.
-        inner.__name__ = f.__name__
-
        return inner
    return change_flags_exec


--- a/theano/d3viz/formatting.py
+++ b/theano/d3viz/formatting.py
@@ -25,8 +25,12 @@ except ImportError:
    try:
        # fall back on pydot if necessary
        import pydot as pd
+        if hasattr(pd, 'find_graphviz'):
            if pd.find_graphviz():
                pydot_imported = True
+        else:
+            pd.Dot.create(pd.Dot())
+            pydot_imported = True
    except ImportError:
        pass  # tests should not fail on optional dependency


--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -548,7 +548,7 @@ class CLinker(link.Linker):
        if schedule:
            self.schedule = schedule

-    def accept(self, fgraph, no_recycling=None):
+    def accept(self, fgraph, no_recycling=None, profile=None):
        """
        Associate linker with fgraph

@@ -557,7 +557,8 @@ class CLinker(link.Linker):
            no_recycling = []
        if self.fgraph is not None and self.fgraph is not fgraph:
            # A linker can be tied to only one FunctionGraph.
-            return type(self)(self.schedule).accept(fgraph, no_recycling)
+            return type(self)(self.schedule).accept(
+                fgraph, no_recycling, profile)
        self.fgraph = fgraph
        self.fetch_variables()
        self.no_recycling = no_recycling
@@ -1737,7 +1738,7 @@ class OpWiseCLinker(link.LocalLinker):
        if schedule:
            self.schedule = schedule

-    def accept(self, fgraph, no_recycling=None):
+    def accept(self, fgraph, no_recycling=None, profile=None):
        """
        Associate linker with fgraph
        """
@@ -1750,7 +1751,7 @@ class OpWiseCLinker(link.LocalLinker):
                allow_gc=self.allow_gc,
                nice_errors=self.nice_errors,
                schedule=self.schedule,
-            ).accept(fgraph, no_recycling)
+            ).accept(fgraph, no_recycling, profile)
        self.fgraph = fgraph
        self.no_recycling = no_recycling
        return self
@@ -1897,7 +1898,7 @@ class DualLinker(link.Linker):
        if schedule:
            self.schedule = schedule

-    def accept(self, fgraph, no_recycling=None):
+    def accept(self, fgraph, no_recycling=None, profile=None):
        """
        Update/tie self with fgraph
        """
@@ -1905,7 +1906,7 @@ class DualLinker(link.Linker):
            no_recycling = []
        if self.fgraph is not None and self.fgraph is not fgraph:
            return type(self)(self.checker, self.schedule).accept(
-                fgraph, no_recycling)
+                fgraph, no_recycling, profile)
        self.fgraph = fgraph
        self.no_recycling = no_recycling
        return self

--- a/theano/gof/cmodule.py
+++ b/theano/gof/cmodule.py
@@ -1873,7 +1873,8 @@ class GCC_compiler(Compiler):

        if ('g++' not in theano.config.cxx and
                'clang++' not in theano.config.cxx and
-                'clang-omp++' not in theano.config.cxx):
+                'clang-omp++' not in theano.config.cxx and
+                'icpc' not in theano.config.cxx):
            _logger.warn(
                "OPTIMIZATION WARNING: your Theano flag `cxx` seems not to be"
                " the g++ compiler. So we disable the compiler optimization"

--- a/theano/gof/fg.py
+++ b/theano/gof/fg.py
@@ -593,10 +593,10 @@ class FunctionGraph(utils.object2):

    # callback utils #
    def execute_callbacks(self, name, *args, **kwargs):
-        """
-        Calls
-          getattr(feature, name)(*args)
-        for each feature which has a method called after name.
+        """Execute callbacks
+
+        Calls `getattr(feature, name)(*args)` for each feature which has
+        a method called after name.

        """
        t0 = time.time()
@@ -614,11 +614,11 @@ class FunctionGraph(utils.object2):
        self.execute_callbacks_time += time.time() - t0

    def collect_callbacks(self, name, *args):
-        """
-        Returns a dictionary d such that:
-          d[feature] == getattr(feature, name)(*args)
-        For each feature which has a method called after name.
+        """Collects callbacks

+        Returns a dictionary d such that
+        `d[feature] == getattr(feature, name)(*args)`
+        For each feature which has a method called after name.
        """
        d = {}
        for feature in self._features:
@@ -631,17 +631,18 @@ class FunctionGraph(utils.object2):

    # misc #
    def toposort(self):
-        """
-        Return an ordering of the graph's Apply nodes such that:
-        - All the nodes of the inputs of a node are before that node.
-        - Satisfies the orderings provided by each feature that has
+        """Toposort
+
+        Return an ordering of the graph's Apply nodes such that
+
+        * All the nodes of the inputs of a node are before that node.
+        * Satisfies the orderings provided by each feature that has
          an 'orderings' method.

        If a feature has an 'orderings' method, it will be called with
        this FunctionGraph as sole argument. It should return a dictionary of
-        {node: predecessors} where predecessors is a list of nodes
-        that should be computed before the key node.
-
+        `{node: predecessors}` where predecessors is a list of nodes that
+        should be computed before the key node.
        """
        if len(self.apply_nodes) < 2:
            # optimization
@@ -760,17 +761,20 @@ class FunctionGraph(utils.object2):
        return self.clone_get_equiv(check_integrity)[0]

    def clone_get_equiv(self, check_integrity=True, attach_feature=True):
-        """Clone the graph and get a memo( a dict )that map old node to new node
-        ----------------------------
+        """Clone the graph and get a dict that maps old nodes to new ones
+
        Parameters:
-            check_integrity - { bool } Whether to check integrity.
+            check_integrity: bool
+                Whether to check integrity. Default is True.
+            attach_feature: bool
+                Whether to attach feature of origin graph to cloned graph.
                Default is True.
-            attach_feature - { bool } Whether to attach feature of origin graph to
-                                cloned graph. Default is True.
-        ----------------------------
+
        Returns:
-            e - { FunctionGraph } Cloned fgraph. Every node in cloned graph is cloned.
-            equiv - { dict } A dict that map old node to new node.
+            e: FunctionGraph
+                Cloned fgraph. Every node in cloned graph is cloned.
+            equiv: dict
+                A dict that maps old nodes to new nodes.
        """
        equiv = graph.clone_get_equiv(self.inputs, self.outputs)


--- a/theano/gof/graph.py
+++ b/theano/gof/graph.py
 """
 Node classes (`Apply`, `Variable`) and expression graph algorithms.
-
-To read about what theano graphs are from a user perspective, have a look at
-`graph.html <../doc/graph.html>`__.
-
 """
 from __future__ import absolute_import, print_function, division

@@ -1246,9 +1242,9 @@ def as_string(i, o,
        Input `Variable` s.
    o : list
        Output `Variable` s.
-    leaf_formatter : function
+    leaf_formatter : callable
        Takes a `Variable`  and returns a string to describe it.
-    node_formatter : function
+    node_formatter : callable
        Takes an `Op`  and the list of strings corresponding to its arguments
        and returns a string to describe it.


--- a/theano/gof/lazylinker_c.c
+++ b/theano/gof/lazylinker_c.c
@@ -789,15 +789,47 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
 {
  CLazyLinker * self = (CLazyLinker*)_self;
  static char *kwlist[] = {
-    (char*)"time_thunks",
+    (char *)"time_thunks",
    (char *)"n_calls",
+    (char *)"output_subset",
    NULL};
  int n_calls=1;
-  if (! PyArg_ParseTupleAndKeywords(args, kwds, "|ii", kwlist,
+  PyObject *output_subset_ptr = NULL;
+  if (! PyArg_ParseTupleAndKeywords(args, kwds, "|iiO", kwlist,
                                    &self->do_timing,
-                                    &n_calls))
+                                    &n_calls,
+                                    &output_subset_ptr))
    return NULL;
+
  int err = 0;
+  // parse an output_subset list
+  // it is stored as a bool list of length n_output_vars: calculate a var or not
+  char *output_subset = NULL;
+  int output_subset_size = -1;
+  if (output_subset_ptr != NULL)
+    {
+      if (! PyList_Check(output_subset_ptr))
+        {
+          err = 1;
+          PyErr_SetString(PyExc_RuntimeError, "Output_subset is not a list");
+        }
+      else
+        {
+          output_subset_size = PyList_Size(output_subset_ptr);
+          output_subset = (char*)calloc(self->n_output_vars, sizeof(char));
+          for (int it = 0; it < output_subset_size; ++it)
+            {
+              PyObject *elem = PyList_GetItem(output_subset_ptr, it);
+              if (! PyInt_Check(elem))
+                {
+                  err = 1;
+                  PyErr_SetString(PyExc_RuntimeError, "Some elements of output_subset list are not int");
+                }
+              output_subset[PyInt_AsLong(elem)] = 1;
+            }
+        }
+    }
+
  self->position_of_error = -1;
  // create constants used to fill the var_compute_cells
  PyObject * one = PyInt_FromLong(1);
@@ -833,10 +865,14 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
            }
        }

+      int first_updated = self->n_output_vars - self->n_updates;
      for (int i = 0; i < self->n_output_vars && (!err); ++i)
+        {
+          if (i >= first_updated || output_subset == NULL || output_subset[i] == 1)
            {
              err = lazy_rec_eval(self, self->output_vars[i], one, zero);
            }
+        }

      if (!err)
        {
@@ -848,7 +884,8 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
            {
              Py_ssize_t src = self->output_vars[i];
              PyObject * item = PyList_GetItem(self->var_value_cells[src], 0);
-              if (self->var_computed[src] != 1)
+              if ((output_subset == NULL || output_subset[i]) &&
+                  self->var_computed[src] != 1)
                {
                  err = 1;
                  PyErr_Format(PyExc_AssertionError,
@@ -901,6 +938,9 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
          PyList_SetItem(self->var_value_cells[i], 0, Py_None);
        }
    }
+  if (output_subset != NULL)
+    free(output_subset);
+
  Py_DECREF(one);
  Py_DECREF(zero);
  if (err)
@@ -1014,7 +1054,7 @@ static PyTypeObject lazylinker_ext_CLazyLinkerType = {

 static PyObject * get_version(PyObject *dummy, PyObject *args)
 {
-  PyObject *result = PyFloat_FromDouble(0.21);
+  PyObject *result = PyFloat_FromDouble(0.211);
  return result;
 }


--- a/theano/gof/lazylinker_c.py
+++ b/theano/gof/lazylinker_c.py
@@ -15,7 +15,7 @@ from theano.gof import cmodule
 _logger = logging.getLogger('theano.gof.lazylinker_c')

 force_compile = False
-version = 0.21  # must match constant returned in function get_version()
+version = 0.211  # must match constant returned in function get_version()
 lazylinker_ext = None


@@ -145,4 +145,4 @@ except ImportError:
        release_lock()

 from lazylinker_ext.lazylinker_ext import *  # noqa
-assert force_compile or (version == get_version())
+assert force_compile or (version == get_version())  # noqa
--- a/theano/gof/link.py
+++ b/theano/gof/link.py
@@ -762,7 +762,7 @@ class PerformLinker(LocalLinker):
        if schedule:
            self.schedule = schedule

-    def accept(self, fgraph, no_recycling=None):
+    def accept(self, fgraph, no_recycling=None, profile=None):
        """

        Parameters
@@ -781,7 +781,8 @@ class PerformLinker(LocalLinker):
        if no_recycling is None:
            no_recycling = []
        if self.fgraph is not None and self.fgraph is not fgraph:
-            return type(self)(allow_gc=self.allow_gc).accept(fgraph, no_recycling)
+            return type(self)(allow_gc=self.allow_gc).accept(
+                fgraph, no_recycling, profile)
            # raise Exception("Cannot accept from a Linker that is already tied to another FunctionGraph.")
        self.fgraph = fgraph
        self.no_recycling = no_recycling
@@ -944,7 +945,7 @@ class WrapLinker(Linker):
            linkers=[l.clone(allow_gc=allow_gc) for l in self.linkers],
            wrapper=self.wrapper)

-    def accept(self, fgraph, no_recycling=None):
+    def accept(self, fgraph, no_recycling=None, profile=None):
        """

        Parameters

--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -2413,7 +2413,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
            for (t, count, n_created, o) in count_opt[::-1]:
                print(blanc, '  %.3fs - %d - %d - %s' % (
                    t, count, n_created, o), file=stream)
-            print(blanc, '  %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
+            print(blanc, '  %.3fs - in %d optimization that were not used (display only those with a runtime > 0)' % (
                not_used_time, len(not_used)), file=stream)
            not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
            for (t, o) in not_used[::-1]:

--- a/theano/gof/tests/test_destroyhandler.py
+++ b/theano/gof/tests/test_destroyhandler.py
@@ -5,7 +5,8 @@ from theano.gof.type import Type
 from theano.gof import graph
 from theano.gof.graph import Variable, Apply
 from theano.gof.op import Op
-from theano.gof.opt import *  # noqa
+from theano.gof.opt import (OpKeyOptimizer, PatternSub, NavigatorOptimizer,
+                            TopoOptimizer, OpSub)

 from theano.gof import destroyhandler
 from theano.gof.fg import FunctionGraph, InconsistencyError

--- a/theano/gof/tests/test_link.py
+++ b/theano/gof/tests/test_link.py
@@ -11,7 +11,7 @@ from theano.gof.type import Type
 from theano.gof.op import Op
 from theano.gof import fg

-from theano.gof.link import *  # noqa
+from theano.gof.link import PerformLinker, WrapLinker, Container
 from theano.compat import cmp



--- a/theano/gof/tests/test_opt.py
+++ b/theano/gof/tests/test_opt.py
@@ -3,9 +3,11 @@ from __future__ import absolute_import, print_function, division
 from theano.gof.type import Type
 from theano.gof.graph import Variable, Apply, Constant
 from theano.gof.op import Op
-from theano.gof.opt import *  # noqa
+from theano.gof.opt import (OpKeyOptimizer, PatternSub, TopoOptimizer, OpSub,
+                            MergeOptimizer, config, theano,
+                            EquilibriumOptimizer, logging, pre_constant_merge,
+                            pre_greedy_local_optimizer)
 from theano.gof.fg import FunctionGraph
-from theano.gof.toolbox import *  # noqa

 from theano import tensor as T


--- a/theano/gof/tests/test_toolbox.py
+++ b/theano/gof/tests/test_toolbox.py
@@ -5,7 +5,7 @@ from theano.gof.type import Type
 from theano.gof.op import Op

 from theano.gof.fg import FunctionGraph
-from theano.gof.toolbox import *  # noqa
+from theano.gof.toolbox import NodeFinder


 def as_variable(x):

--- a/theano/gof/tests/test_vm.py
+++ b/theano/gof/tests/test_vm.py
--- a/theano/gof/vm.py
+++ b/theano/gof/vm.py
--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
--- a/theano/gpuarray/dnn_batchnorm.c
+++ b/theano/gpuarray/dnn_batchnorm.c
--- a/theano/gpuarray/dnn_batchnorm_base.c
+++ b/theano/gpuarray/dnn_batchnorm_base.c
--- a/theano/gpuarray/dnn_batchnorm_grad.c
+++ b/theano/gpuarray/dnn_batchnorm_grad.c
--- a/theano/gpuarray/dnn_batchnorm_inf.c
+++ b/theano/gpuarray/dnn_batchnorm_inf.c
--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
--- a/theano/gpuarray/extra_ops.py
+++ b/theano/gpuarray/extra_ops.py
--- a/theano/gpuarray/fft.py
+++ b/theano/gpuarray/fft.py
--- a/theano/gpuarray/multinomial.py
+++ b/theano/gpuarray/multinomial.py
--- a/theano/gpuarray/neighbours.py
+++ b/theano/gpuarray/neighbours.py
--- a/theano/gpuarray/nerv.py
+++ b/theano/gpuarray/nerv.py
--- a/theano/gpuarray/nnet.py
+++ b/theano/gpuarray/nnet.py
--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
--- a/theano/gpuarray/opt_util.py
+++ b/theano/gpuarray/opt_util.py
--- a/theano/gpuarray/subtensor.py
+++ b/theano/gpuarray/subtensor.py
--- a/theano/gpuarray/tests/test_abstractconv.py
+++ b/theano/gpuarray/tests/test_abstractconv.py
--- a/theano/gpuarray/tests/test_basic_ops.py
+++ b/theano/gpuarray/tests/test_basic_ops.py
--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
--- a/theano/gpuarray/tests/test_extra_ops.py
+++ b/theano/gpuarray/tests/test_extra_ops.py
--- a/theano/gpuarray/tests/test_fft.py
+++ b/theano/gpuarray/tests/test_fft.py
--- a/theano/gpuarray/tests/test_multinomial.py
+++ b/theano/gpuarray/tests/test_multinomial.py
--- a/theano/gpuarray/tests/test_opt.py
+++ b/theano/gpuarray/tests/test_opt.py
--- a/theano/gpuarray/tests/test_pickle.py
+++ b/theano/gpuarray/tests/test_pickle.py
--- a/theano/gpuarray/tests/test_subtensor.py
+++ b/theano/gpuarray/tests/test_subtensor.py
--- a/theano/gpuarray/type.py
+++ b/theano/gpuarray/type.py
--- a/theano/gradient.py
+++ b/theano/gradient.py
--- a/theano/misc/elemwise_openmp_speedup.py
+++ b/theano/misc/elemwise_openmp_speedup.py
--- a/theano/printing.py
+++ b/theano/printing.py
--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
--- a/theano/sandbox/cuda/tests/test_abstractconv.py
+++ b/theano/sandbox/cuda/tests/test_abstractconv.py
--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
--- a/theano/sandbox/cuda/tests/test_dnn.py
+++ b/theano/sandbox/cuda/tests/test_dnn.py
--- a/theano/sandbox/cuda/tests/test_tensor_op.py
+++ b/theano/sandbox/cuda/tests/test_tensor_op.py
--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
--- a/theano/scan_module/scan_utils.py
+++ b/theano/scan_module/scan_utils.py
--- a/theano/sparse/tests/test_basic.py
+++ b/theano/sparse/tests/test_basic.py
--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
--- a/theano/tensor/blas_headers.py
+++ b/theano/tensor/blas_headers.py
--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
--- a/theano/tensor/nlinalg.py
+++ b/theano/tensor/nlinalg.py
--- a/theano/tensor/nnet/abstract_conv.py
+++ b/theano/tensor/nnet/abstract_conv.py
--- a/theano/tensor/nnet/conv3d2d.py
+++ b/theano/tensor/nnet/conv3d2d.py
--- a/theano/tensor/nnet/corr.py
+++ b/theano/tensor/nnet/corr.py
--- a/theano/tensor/nnet/corr_gemm.c
+++ b/theano/tensor/nnet/corr_gemm.c
--- a/theano/tensor/nnet/tests/__init__.py
+++ b/theano/tensor/nnet/tests/__init__.py
--- a/theano/tensor/nnet/tests/speed_test_conv.py
+++ b/theano/tensor/nnet/tests/speed_test_conv.py
--- a/theano/tensor/nnet/tests/test_abstract_conv.py
+++ b/theano/tensor/nnet/tests/test_abstract_conv.py
--- a/theano/tensor/nnet/tests/test_conv.py
+++ b/theano/tensor/nnet/tests/test_conv.py
--- a/theano/tensor/nnet/tests/test_conv3d.py
+++ b/theano/tensor/nnet/tests/test_conv3d.py
--- a/theano/tensor/nnet/tests/test_conv3d2d.py
+++ b/theano/tensor/nnet/tests/test_conv3d2d.py
--- a/theano/tensor/nnet/tests/test_corr.py
+++ b/theano/tensor/nnet/tests/test_corr.py
--- a/theano/tensor/nnet/tests/test_neighbours.py
+++ b/theano/tensor/nnet/tests/test_neighbours.py
--- a/theano/tensor/nnet/tests/test_nnet.py
+++ b/theano/tensor/nnet/tests/test_nnet.py
--- a/theano/tensor/nnet/tests/test_sigm.py
+++ b/theano/tensor/nnet/tests/test_sigm.py
--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
--- a/theano/tensor/signal/pool.py
+++ b/theano/tensor/signal/pool.py
--- a/theano/tensor/signal/tests/test_pool.py
+++ b/theano/tensor/signal/tests/test_pool.py
--- a/theano/tensor/subtensor.py
+++ b/theano/tensor/subtensor.py
--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
--- a/theano/tensor/tests/test_sharedvar.py
+++ b/theano/tensor/tests/test_sharedvar.py
--- a/theano/tensor/tests/test_subtensor.py
+++ b/theano/tensor/tests/test_subtensor.py
--- a/theano/tensor/var.py
+++ b/theano/tensor/var.py
--- a/theano/tests/test_flake8.py
+++ b/theano/tests/test_flake8.py
--- a/theano/tests/test_gradient.py
+++ b/theano/tests/test_gradient.py