提交 27146e2e authored 作者: ruslanagit's avatar ruslanagit

Merge remote-tracking branch 'refs/remotes/Theano/master'

......@@ -37,4 +37,5 @@ Theano.suo
.ipynb_checkpoints
.pydevproject
.ropeproject
core
\ No newline at end of file
core
.idea
#!/bin/bash
# Jenkins continuous-integration pre-testing script: run the quick
# checks (flake8 style and documentation) before the heavy test jobs.
# Any failing check aborts the job with a non-zero exit status.

# Trace every command for a readable CI log.
set -x

# Prefer the Anaconda (miniconda2) python.
export PATH="/usr/local/miniconda2/bin:${PATH}"

# Style check via the flake8 test wrapper.
echo "===== Testing flake8"
if ! bin/theano-nose theano/tests/test_flake8.py; then
    exit 1
fi

# Documentation: first the HTML build, then the embedded code snippets.
echo "===== Testing documentation build"
if ! python doc/scripts/docgen.py --nopdf --check; then
    exit 1
fi
echo "===== Testing documentation code snippets"
if ! python doc/scripts/docgen.py --test --check; then
    exit 1
fi
#!/bin/bash
# Jenkins continuous-integration script: run the Theano core test suite
# (everything except the cuda and gpuarray GPU backends).

# Trace every command for a readable CI log.
set -x

# Prefer the Anaconda (miniconda2) python.
export PATH="/usr/local/miniconda2/bin:${PATH}"

echo "===== Testing theano core"
# Select the whole 'theano' package, excluding the GPU backends, and
# have the timer plugin report the 10 slowest tests.
TEST_SELECTION="theano -e cuda -e gpuarray"
NOSE_ARGS="${TEST_SELECTION} --with-timer --timer-top-n 10"
THEANO_FLAGS="mode=FAST_RUN,floatX=float32" bin/theano-nose ${NOSE_ARGS}
#!/bin/bash
# Script for Jenkins continuous integration testing of gpu backends.
# Part 1 tests the legacy theano.sandbox.cuda backend; part 2 builds
# libgpuarray/pygpu from source and tests the newer gpuarray backend.

# Print commands as they are executed (easier to follow in the CI log)
set -x

# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH

# CUDA toolchain: nvcc on PATH, CUDA libraries visible at both
# link time (LIBRARY_PATH) and run time (LD_LIBRARY_PATH)
export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH

echo "===== Testing old theano.sandbox.cuda backend"
# Test selection for the old backend: the cuda test package plus the
# GPU-dependent tests scattered through the rest of the code base.
THEANO_CUDA_TESTS="theano/sandbox/cuda/tests \
theano/misc/tests/test_pycuda_example.py \
theano/misc/tests/test_pycuda_theano_simple.py \
theano/misc/tests/test_pycuda_utils.py \
theano/tensor/tests/test_opt.py:TestCompositeCodegen \
theano/tensor/tests/test_opt.py:test_shapeoptimizer \
theano/tensor/tests/test_opt.py:test_fusion \
theano/compile/tests/test_debugmode.py:Test_preallocated_output \
theano/sparse/tests/test_basic.py:DotTests \
theano/sandbox/tests/test_multinomial.py:test_gpu_opt \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPU_serial \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPU_parallel \
theano/sandbox/tests/test_rng_mrg.py:test_GPU_nstreams_limit \
theano/sandbox/tests/test_rng_mrg.py:test_overflow_gpu_old_backend \
theano/scan_module/tests/test_scan.py:T_Scan_Cuda"
THEANO_PARAM="${THEANO_CUDA_TESTS} --with-timer --timer-top-n 10"
# init_gpu_device=gpu activates the old backend for every test
FLAGS="mode=FAST_RUN,init_gpu_device=gpu,floatX=float32"
THEANO_FLAGS=${FLAGS} bin/theano-nose ${THEANO_PARAM}

echo "===== Testing gpuarray backend"
GPUARRAY_CONFIG="Release"
DEVICE=cuda0
# Scratch prefix where libgpuarray/pygpu get installed for this run
LIBDIR=~/tmp/local

# Make fresh clones of libgpuarray (with no history since we don't need it)
rm -rf libgpuarray
git clone --depth 1 "https://github.com/Theano/libgpuarray.git"

# Clean up previous installs (to make sure no old files are left)
rm -rf $LIBDIR
mkdir $LIBDIR

# Build libgpuarray
mkdir libgpuarray/build
(cd libgpuarray/build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX=$LIBDIR && make)

# Finally install
(cd libgpuarray/build && make install)

# Export paths so the freshly built library is found at compile time
# (CPATH/LIBRARY_PATH) and at run time (LD_LIBRARY_PATH); both lib and
# lib64 are listed since the install layout varies by platform
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib64/
export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib64/
export CPATH=$CPATH:$LIBDIR/include
export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib

# Build the pygpu modules
(cd libgpuarray && python setup.py build_ext --inplace -I$LIBDIR/include -L$LIBDIR/lib)
ls $LIBDIR
# --home installs put python modules under $LIBDIR/lib/python; make the
# directory and put it on PYTHONPATH so the tests can import pygpu
mkdir $LIBDIR/lib/python
export PYTHONPATH=${PYTHONPATH}:$LIBDIR/lib/python
# Then install
(cd libgpuarray && python setup.py install --home=$LIBDIR)

# Testing theano (the gpuarray parts)
THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_serial \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_parallel \
theano/scan_module/tests/test_scan.py:T_Scan_Gpuarray"
FLAGS="init_gpu_device=$DEVICE,gpuarray.preallocate=1000,mode=FAST_RUN"
THEANO_FLAGS=${FLAGS} time nosetests -v ${THEANO_GPUARRAY_TESTS}
......@@ -30,10 +30,14 @@
function build_vswitch() {
// Build HTML string for version selector, based on ReadTheDocs theme's versions.html
var vlabel = current_version.replace("theano_versions/", "");
if (vlabel == 'theano') {
vlabel = 'release';
}
var vswitch = ['<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions" align=left>'];
vswitch.push('<span class="rst-current-version" data-toggle="rst-current-version">');
vswitch.push('<span class="fa fa-book"></span>');
vswitch.push('v: ', current_version.replace("theano_versions/", ""), ' ');
vswitch.push('v: ', vlabel, ' ');
vswitch.push('<span class="fa fa-caret-down"></span>');
vswitch.push('</span>');
......
.. _css:
.. raw:: html
<style> .black {color:black} </style>
<style> .blue {color:blue} </style>
<style> .red {color:red} </style>
<style> .green {color:green} </style>
<style> .pink {color:pink} </style>
.. role:: blue
.. role:: red
.. role:: green
.. role:: pink
.. role:: black
......@@ -10,14 +10,14 @@ Contributing
You want to contribute to Theano? That is great! This page explains our
workflow and some resources for doing so.
Looking for an idea for a first contribution? Check `github issue
<https://github.com/Theano/Theano/issues?q=is%3Aopen+is%3Aissue+label%3A%22Easy+fix%22>`
Looking for an idea for a first contribution? Check the `github issues
<https://github.com/Theano/Theano/issues?q=is%3Aopen+is%3Aissue+label%3A%22Easy+fix%22>`_
with a label ``easy fix``. They are good starters. It is recommended
that you comment on the issue you want to work on. This helps make
sure it is up to date and lets us check that nobody else is working on it. Also,
we can sometimes provide more information about it. There is also
the label `NeedSomeoneToFinish
<https://github.com/Theano/Theano/labels/NeedSomeoneToFinish>` that is
<https://github.com/Theano/Theano/labels/NeedSomeoneToFinish>`_ that is
interesting to check. The difficulty level is variable.
Resources
......@@ -85,8 +85,8 @@ make sure there are no global impacts.
Also, if you are changing GPU code, travis doesn't test that, because
there are no GPUs on the test nodes.
To run the test suite with the default options, you can follow the
instructions of :ref:`testing_installation`.
To run the test suite with the default options, see
:ref:`test_theano`.
Each night we execute all the unit tests automatically, with several
sets of options. The result is sent by email to the `theano-buildbot`_
......@@ -126,7 +126,11 @@ To setup VIM:
#. Install flake8 (if not already installed) with::
pip install flake8
pip install "flake8<3"
.. warning:: Starting with version 3.0.0, flake8 changed its dependencies and
moved its Python API to a legacy module, breaking Theano's flake8 tests.
We recommend using a version prior to 3.
.. note:: You can use ``easy_install`` instead of ``pip``, and ``pep8``
instead of ``flake8`` if you prefer. The important thing is that the
......@@ -150,6 +154,8 @@ To setup VIM:
Plugin 'scrooloose/syntastic'
Plugin 'jimf/vim-pep8-text-width'
call vundle#end()
" Syntastic settings
" You can run checkers explicitly by calling :SyntasticCheck <checker
let g:syntastic_python_checkers = ['flake8'] "use one of the following checkers:
......@@ -360,7 +366,7 @@ You can choose another name than "central" to reference Theano/Theano
to "central."
You can then test your installation of Theano by following the steps of
:ref:`testing_installation`.
:ref:`test_theano`.
Using your local copy
......
......@@ -872,9 +872,9 @@ To understand this profile here is some explanation of how optimizations work:
0.131s for callback
time - (name, class, index) - validate time
Then it will print, with some additional indentation, each sub-optimizer's profile
information. These sub-profiles are ordered by the time they took to execute,
not by their execution order.
Then it will print, with some additional indentation, each sub-optimizer's profile
information. These sub-profiles are ordered by the time they took to execute,
not by their execution order.
* ``OPT_FAST_RUN`` is the name of the optimizer
* 1.152s is the total time spent in that optimizer
......
......@@ -10,21 +10,6 @@ Does Theano support Python 3?
------------------------------
We support both Python 2 >= 2.6 and Python 3 >= 3.3.
TypeError: object of type 'TensorVariable' has no len()
-------------------------------------------------------
If you receive the following error, it is because the Python function *__len__* cannot
be implemented on Theano variables:
.. code-block:: python
TypeError: object of type 'TensorVariable' has no len()
Python requires that *__len__* returns an integer, yet it cannot be done as Theano's variables are symbolic. However, `var.shape[0]` can be used as a workaround.
This error message cannot be made more explicit because the relevant aspects of Python's
internals cannot be modified.
Output slight numerical difference
----------------------------------
......@@ -39,7 +24,6 @@ Every Computer Scientist Should Know About Floating-Point Arithmetic
<https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html>`_.
Faster gcc optimization
-----------------------
......@@ -179,33 +163,6 @@ but requires that all nodes in the graph have a C implementation:
f(10.)
Out of memory... but not really
-------------------------------
Occasionally Theano may fail to allocate memory when there appears to be more
than enough reporting:
Error allocating X bytes of device memory (out of memory). Driver report Y
bytes free and Z total.
where X is far less than Y and Z (i.e. X << Y < Z).
This scenario arises when an operation requires allocation of a large contiguous
block of memory but no blocks of sufficient size are available.
GPUs do not have virtual memory and as such all allocations must be assigned to
a contiguous memory region. CPUs do not have this limitation because of their
support for virtual memory. Multiple allocations on a GPU can result in memory
fragmentation, which can make it more difficult to find contiguous regions
of memory of sufficient size during subsequent memory allocations.
A known example is related to writing data to shared variables. When updating a
shared variable Theano will allocate new space if the size of the data does not
match the size of the space already assigned to the variable. This can lead to
memory fragmentation, which means that a contiguous block of memory of
sufficient capacity may not be available even if the free memory overall is
large enough.
Related Projects
----------------
......@@ -226,55 +183,3 @@ Here is a list of some of the known limitations:
interact with the rest of the graph).
- Neither *goto* nor *recursion* is supported or planned within expression graphs.
"float32 / int{32, 64} gives float64"
-------------------------------------
It should be noted that using float32 and int{32, 64} together
inside a function would provide float64 as output.
Since the GPU can't compute this kind of output, it would be
preferable not to use those dtypes together.
To help you find where float64 are created, see the
:attr:`warn_float64` Theano flag.
Theano memory/speed trade-off
-----------------------------
There are a few things you can easily do to change the trade-off
between speed and memory usage. If nothing is said, this affects
both CPU and GPU memory usage.
Could speed up and lower memory usage:
- :ref:`cuDNN <libdoc_cuda_dnn>`: the default cuDNN convolution uses less
memory than the Theano version. But some flags allow it to use more
memory. GPU only.
- Shortly avail, multi-GPU.
Could raise memory usage but speed up computation:
- :attr:`config.lib.cnmem` =1 # Do not raise much memory usage, but if you are at the limit of GPU memory available. GPU only.
- :attr:`config.allow_gc` =False
- :attr:`config.optimizer_excluding` =low_memory , GPU only for now.
Could lower the memory usage, but raise computation time:
- :attr:`config.scan.allow_gc` =True # Probably not significant slowdown if config.lib.cnmem is used.
- :attr:`config.scan.allow_output_prealloc` =False
- Use :func:`batch_normalization()
<theano.tensor.nnet.bn.batch_normalization>`. It uses less memory
than building a corresponding Theano graph.
- Disable one or more scan optimizations:
- ``optimizer_excluding=scanOp_pushout_seqs_ops``
- ``optimizer_excluding=scan_pushout_dot1``
- ``optimizer_excluding=scanOp_pushout_output``
- Disable all optimization tagged as raising memory usage:
``optimizer_excluding=more_mem`` (currently only the 3 scan optimizations above)
- `float16 <https://github.com/Theano/Theano/issues/2908>`_.
If you want to analyze the memory usage during computation, the
simplest is to let the memory error happen during Theano execution and
use the Theano flags :attr:`exception_verbosity=high`.
......@@ -17,17 +17,18 @@ shapes = [
('col', (False, True)),
('matrix', (False,False)),
('tensor3', (False,False,False)),
('tensor4', (False,False,False,False)),]
('tensor4', (False,False,False,False)),
('tensor5', (False,False,False,False,False)),]
hdr = '============ =========== ==== =========== ================================='
hdr = '============ =========== ==== ============ ==================================='
print(hdr)
print('Constructor dtype ndim shape broadcastable')
print('Constructor dtype ndim shape broadcastable')
print(hdr)
for letter in letters:
for shape in shapes:
suff = ',)' if len(shape[1])==1 else ')'
s = '(' + ','.join('1' if b else '?' for b in shape[1]) + suff
print('%s%-10s %-10s %-4s %-10s %-20s' %(
print('%s%-10s %-10s %-4s %-11s %-20s' %(
letter[0], shape[0], letter[1], len(shape[1]), s, shape[1]
))
print(hdr)
......@@ -125,6 +125,7 @@ Roughly in order of what you'll want to check out:
* :ref:`install` -- How to install Theano.
* :ref:`introduction` -- What is Theano?
* :ref:`tutorial` -- Learn the basics.
* :ref:`troubleshooting` -- Tips and tricks for common debugging.
* :ref:`libdoc` -- Theano's functionality, module by module.
* :ref:`faq` -- A set of commonly asked questions.
* :ref:`optimizations` -- Guide to Theano's graph optimizations.
......@@ -237,12 +238,15 @@ StackOverflow, follow their guidance for `answering questions <http://stackoverf
NEWS
introduction
requirements
install
updating
tutorial/index
extending/index
dev_start_guide
optimizations
library/index
troubleshooting
glossary
links
internal/index
......
差异被折叠。
:orphan:
.. include:: css.inc
.. _install_centos6:
CentOS 6 Installation Instructions
##################################
Easy Installation of an optimized Theano on CentOS 6
====================================================
.. warning::
If you want to install the bleeding-edge or development version of Theano
from GitHub, please make sure you are reading `the latest version of this
page <http://deeplearning.net/software/theano_versions/dev/install_centos6.html>`_.
.. note::
.. include:: requirements.txt
It is possible to have a faster installation of Theano than the one these
instructions will provide, but this will make the installation more
complicated and/or may require that you buy software. This is a simple set
of installation instructions that will leave you with a relatively
well-optimized version that uses only free software. With more work or by
investing money (i.e. buying a license to a proprietary BLAS
implementation), it is possible to gain further performance.
.. include:: install_generic.inc
:start-line: 5
.. note::
If you are behind a proxy, you must do some extra configuration steps
before starting the installation. You must set the environment
variable ``http_proxy`` to the proxy address. Using bash this is
accomplished with the command
``export http_proxy="http://user:pass@my.site:port/"``
You can also provide the ``--proxy=[user:pass@]url:port`` parameter
to pip. The ``[user:pass@]`` portion is optional.
.. note::
We use ``pip`` for 2 reasons. First, it allows "``import module;
module.test()``" to work correctly. Second, the installation of NumPy
1.6 or 1.6.1 with ``easy_install`` raises an ImportError at the end of
the installation. To my knowledge we can ignore this error, but
this is not completely safe. ``easy_install`` with NumPy 1.5.1 does not
raise this error.
Installation steps
~~~~~~~~~~~~~~~~~~
1) ``sudo yum install python-devel python-nose python-setuptools gcc
gcc-gfortran gcc-c++ blas-devel lapack-devel atlas-devel``
2) ``sudo easy_install pip``
3) ``sudo pip install numpy==1.6.1``
4) ``sudo pip install scipy==0.10.1``
5) ``sudo pip install Theano``
Test the newly installed packages
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1) NumPy (~30s): ``python -c "import numpy; numpy.test()"``
2) SciPy (~1m): ``python -c "import scipy; scipy.test()"``
3) Theano (~30m): ``python -c "import theano; theano.test()"``
Speed test Theano/BLAS
~~~~~~~~~~~~~~~~~~~~~~
It is recommended to test your Theano/BLAS integration. There are many versions
of BLAS that exist and there can be up to 10x speed difference between them.
Also, having Theano link directly against BLAS instead of using NumPy/SciPy as
an intermediate layer reduces the computational overhead. This is
important for BLAS calls to ``ger``, ``gemv`` and small ``gemm`` operations
(automatically called when needed when you use ``dot()``). To run the
Theano/BLAS speed test:
Requirements through System Packages (not recommended)
------------------------------------------------------
.. code-block:: bash
python /usr/lib/python2.*/site-packages/theano/misc/check_blas.py
This will print a table with different versions of BLAS/numbers of
threads on multiple CPUs and GPUs. It will also print some Theano/NumPy
configuration information. Then, it will print the running time of the same
benchmarks for your installation. Try to find a CPU similar to yours in
the table, and check that the single-threaded timings are roughly the same.
Updating Theano
~~~~~~~~~~~~~~~
If you followed these installation instructions, you can execute this command
to update only Theano:
.. code-block:: bash
sudo pip install --upgrade --no-deps theano
If you want to also update NumPy/SciPy, you can run this:
.. code-block:: bash
sudo pip install --upgrade theano
Bleeding edge
~~~~~~~~~~~~~
Do like in the section "Updating Theano", but use
``git+git://github.com/Theano/Theano.git`` instead of ``theano``.
sudo yum install python-devel python-nose python-setuptools gcc gcc-gfortran gcc-c++ blas-devel lapack-devel atlas-devel
sudo easy_install pip
.. include:: css.inc
.. _install_generic:
Installation
============
Stable Installation
-------------------
Install the latest stable version of Theano with:
.. raw:: html
<div class="highlight"><pre><span class="red">&lt;sudo&gt;</span> pip install <span class="blue">&lt;--user&gt;</span> Theano[test, doc]</pre></div>
- Any argument between <...> is optional.
- Use :red:`sudo` for a root installation.
- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
- [test] will install the requirements for testing.
- [doc] will install the requirements in order to generate the documentation.
If you encountered any trouble, head to the :ref:`troubleshooting` page.
libgpuarray
^^^^^^^^^^^
For the stable version of Theano you need a specific version of libgpuarray,
that has been tagged ``v-9998``.
Download it with:
.. raw:: html
<div class='highlight'><pre>
git clone https://github.com/Theano/libgpuarray.git --tags
git checkout origin/v-9998
cd libgpuarray
</pre></div>
and then follow the `Step-by-step instructions <http://deeplearning.net/software/libgpuarray/installation.html#step-by-step-install>`__.
Bleeding-Edge Installation (recommended)
----------------------------------------
Install the latest, bleeding-edge, development version of Theano with:
.. raw:: html
<div class='highlight'><pre><span class="red">&lt;sudo&gt;</span> pip install <span class="blue">&lt;--user&gt;</span> <span class="pink">&lt;--no-deps&gt;</span> git+https://github.com/Theano/Theano.git#egg=Theano</pre></div>
- Any argument between <...> is optional.
- Use :red:`sudo` for a root installation.
- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
- Use :pink:`no-deps` when you don't want the dependencies of Theano to be installed through pip. This is important when they have already been installed as system packages.
If you encountered any trouble, head to the :ref:`troubleshooting` page.
libgpuarray
^^^^^^^^^^^
Install the latest, development version of libgpuarray following the
`Step-by-step instructions <http://deeplearning.net/software/libgpuarray/installation.html#step-by-step-install>`__.
Developer Installation
----------------------
Install the developer version of Theano with:
.. raw:: html
<div class="highlight"><pre>git clone git://github.com/Theano/Theano.git
cd Theano
<span class="red">&lt;sudo&gt;</span> pip install <span class="blue">&lt;--user&gt;</span> <span class="pink">&lt;--no-deps&gt;</span> <span class="green">-e .</span></pre></div>
- Any argument between <...> is optional.
- Use :red:`sudo` for a root installation.
- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
- Use :pink:`no-deps` when you don't want the dependencies of Theano to be installed through pip. This is important when they have already been installed as system packages.
- :green:`-e` makes your installation *editable*, i.e., it links it to your
source directory.
If you encountered any trouble, head to the :ref:`troubleshooting` page.
libgpuarray
^^^^^^^^^^^
Install the latest, development version of libgpuarray following the
`Step-by-step instructions <http://deeplearning.net/software/libgpuarray/installation.html#step-by-step-install>`__.
.. include:: css.inc
.. _install_macos:
Mac OS Installation Instructions
################################
.. warning::
If you want to install the bleeding-edge or development version of Theano
from GitHub, please make sure you are reading `the latest version of this
page <http://deeplearning.net/software/theano_versions/dev/install_macos.html>`_.
There are various ways to install Theano dependencies on a Mac. Here
we describe the process in detail with Anaconda, Homebrew or MacPorts
but if you did it differently and it worked, please let us know the
details on the `theano-users`_ mailing-list, so that we can add
alternative instructions here.
.. _theano-users: http://groups.google.com/group/theano-users?pli=1
.. include:: requirements.txt
.. _gpu_macos:
.. attention::
For MacOS you should be able to follow the above instructions to
setup CUDA, but be aware of the following caveats:
* If you want to compile the CUDA SDK code, you may need to temporarily
revert back to Apple's gcc (``sudo port select gcc``) as their Makefiles
are not compatible with MacPort's gcc.
* If CUDA seems unable to find a CUDA-capable GPU, you may need to manually
toggle your GPU on, which can be done with
`gfxCardStatus <http://codykrieger.com/gfxCardStatus>`__.
.. attention::
Theano officially supports only clang on OS X. This can be installed
by getting XCode from the App Store and running it once to install the
command-line tools.
.. include:: install_generic.inc
:start-line: 5
Requirements through Homebrew (not recommended)
-----------------------------------------------
Install python with homebrew:
.. code-block:: bash
$ brew install python # or python3 if you prefer
This will install pip. Then use pip to install numpy, scipy:
.. code-block:: bash
$ pip install numpy scipy
If you want to use openblas instead of Accelerate, you have to install
numpy and scipy with homebrew:
.. code-block:: bash
$ brew tap homebrew/python
$ brew install numpy --with-openblas
$ brew install scipy --with-openblas
Requirements through MacPorts (not recommended)
-----------------------------------------------
Using `MacPorts <http://www.macports.org/>`__ to install all required
Theano dependencies is easy, but be aware that it will take a long time
(a few hours) to build and install everything.
- MacPorts requires installing XCode first (which can be found in the
Mac App Store), if you do not have it already.
If you can't install it from the App Store, look in your MacOS X installation
DVD for an old version. Then update your Mac to update XCode.
- Download and install `MacPorts <http://www.macports.org/>`__, then
ensure its package list is up-to-date with ``sudo port selfupdate``.
- Then, in order to install one or more of the required libraries, use
``port install``, e.g. as follows:
.. code-block:: bash
$ sudo port install py27-numpy +atlas py27-scipy +atlas py27-pip
This will install all the required Theano dependencies. gcc will
be automatically installed (since it is a SciPy dependency), but be
aware that it takes a long time to compile (hours)!
Having NumPy and SciPy linked with ATLAS (an optimized BLAS
implementation) is not mandatory, but recommended if you care about
performance.
- You might have some different versions of gcc, SciPy, NumPy, Python installed
on your system, perhaps via Xcode. It is a good idea to use **either** the
MacPorts version of everything **or** some other set of compatible versions
(e.g. provided by Xcode or Fink). The advantages of MacPorts are the
transparency with which everything can be installed and the fact that
packages are updated quite frequently. The following steps describe how to
make sure you are using the MacPorts version of these packages.
- In order to use the MacPorts version of Python, you will probably
need to explicitly select it with ``sudo port select python python27``. The
reason this is necessary is because you may have an Apple-provided Python
(via, for example, an Xcode installation). After performing this step, you
should check that the symbolic link provided by ``which python`` points to
the MacPorts python. For instance, on MacOS X Lion with MacPorts 2.0.3,
the output of ``which python`` is ``/opt/local/bin/python`` and this symbolic
link points to ``/opt/local/bin/python2.7``. When executing ``sudo
port select python python27-apple`` (which you should **not** do), the link
points to ``/usr/bin/python2.7``.
- Similarly, make sure that you are using the MacPorts-provided gcc:
use ``sudo port select gcc`` to see which gcc installs you have on the
system. Then execute for instance ``sudo port select gcc mp-gcc44``
to create a symlink that points to the correct (MacPorts) gcc (version 4.4
in this case).
- At this point, if you have not done so already, it may be a good idea to
close and restart your terminal, to make sure all configuration changes
are properly taken into account.
- Afterwards, please check that the ``scipy`` module that is imported in
Python is the right one (and is a recent one). For instance, ``import
scipy`` followed by ``print scipy.__version__`` and ``print scipy.__path__``
should result in a version number of at least 0.7.0 and a path that starts
with ``/opt/local`` (the path where MacPorts installs its packages). If this
is not the case, then you might have some old installation of ``scipy`` in your
``PYTHONPATH`` so you should edit ``PYTHONPATH`` accordingly.
- Please follow the same procedure with ``numpy``.
- This is covered in the MacPorts installation process, but make sure that
your ``PATH`` environment variable contains ``/opt/local/bin`` and
``/opt/local/sbin`` before any other paths (to ensure that the Python and
gcc binaries that you installed with MacPorts are visible first).
- MacPorts does not create automatically ``nosetests`` and ``pip`` symlinks
pointing to the MacPorts version, so you can add them yourself with
.. code-block:: bash
$ sudo ln -s /opt/local/bin/nosetests-2.7 /opt/local/bin/nosetests
$ sudo ln -s /opt/local/bin/pip-2.7 /opt/local/bin/pip
.. _install_others:
Other Platform-specific Installations
=====================================
.. warning::
These instructions are not kept up to date.
NVIDIA Jetson TX1 embedded platform
-----------------------------------
.. code-block:: bash
sudo apt-get install python-numpy python-scipy python-dev python-pip python-nose g++ libblas-dev git
pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git --user # Need Theano 0.8 or more recent
Gentoo
------
Brian Vandenberg emailed `installation instructions on Gentoo
<http://groups.google.com/d/msg/theano-dev/-8WCMn2FMR0/bJPasoZXaqoJ>`_,
focusing on how to install the appropriate dependencies.
Nicolas Pinto provides `ebuild scripts <https://github.com/npinto/sekyfsr-gentoo-overlay/tree/master/sci-libs/Theano>`_.
Docker images
-------------
Builds of Theano are available as `Docker <https://www.docker.com/whatisdocker>`_ images:
`Theano Docker (CPU) <https://hub.docker.com/r/kaixhin/theano/>`_ or `Theano Docker (CUDA) <https://hub.docker.com/r/kaixhin/cuda-theano/>`_.
These are updated on a weekly basis with bleeding-edge builds of Theano. Examples of running bash in a Docker container
are as follows:
.. code-block:: bash
sudo docker run -it kaixhin/theano
sudo docker run -it --device /dev/nvidiactl --device /dev/nvidia-uvm --device /dev/nvidia0 kaixhin/cuda-theano:7.0
For a guide to Docker, see the `official docs <https://docs.docker.com/userguide/>`_. For more details on how to use the
Theano Docker images, including requirements for CUDA support, consult the `source project <https://github.com/Kaixhin/dockerfiles>`_.
差异被折叠。
:orphan:
.. _install_windows:
Windows Installation Instructions
=================================
.. warning::
If you want to install the bleeding-edge or development version of Theano
from GitHub, please make sure you are reading `the latest version of this
page <http://deeplearning.net/software/theano_versions/dev/install_windows.html>`_.
.. warning::
Installation of Theano on Windows
==================================
Theano is mainly developed and tested on Linux machines.
These instructions show step-by-step how to install Theano and
required dependencies on a 32- or 64-bit system using freely available
......@@ -26,6 +32,8 @@ C/C++ (for Python 2.7 family this has to be Microsoft Visual Studio
version supporting Visual Studio 2008), and GCC (for non-CUDA C code
generated by Theano).
.. _gpu_windows:
Visual Studio and CUDA
######################
......@@ -37,7 +45,6 @@ Studio installation to proceed). Afterwards, the Visual Studio 2010
can be safely removed. If someone knows how to install CUDA 5.5
without a proper Visual Studio installation, please let us know.
First we need to install Microsoft Visual Studio 2010 Express, which
is required to install CUDA. You can download it from
`Visual Studio Express
......@@ -79,7 +86,7 @@ The package will be installed to ``C:\Program Files
(x86)\Common Files\Microsoft\Visual C++ for Python\9.0``.
Finally download the ``stdint.h`` header from
`here <http://msinttypes.googlecode.com/svn/trunk/stdint.h>`_ and save it as
`here <https://sourceforge.net/p/mspgcc/msp430-libc/ci/master/tree/include/stdint.h>`_ and save it as
``C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for
Python\9.0\VC\include\stdint.h``.
......@@ -619,6 +626,3 @@ follows:
dependencies. In the case where it is a dependency, you can use the
`Dependency Walker <http://www.dependencywalker.com/>`__ utility to figure out
which one.
.. _gpu_windows:
......@@ -174,7 +174,8 @@ Reference
list is not used in the graph. Possible values are 'raise',
'warn', and 'ignore'.
:rtype: Function instance
:rtype: :class:`Function <theano.compile.function_module.Function>`
instance
:returns: a callable object that will compute the outputs (given the inputs)
and update the implicit function arguments according to the `updates`.
......
......@@ -487,6 +487,21 @@ import theano and print the config variable, as in:
automatically to get more memory. But this can cause
fragmentation, see note above.
.. attribute:: config.gpuarray.sched
String value: ``'default'``, ``'multi'``, ``'single'``
Default: ``'default'``
Control the stream mode of contexts.
The sched parameter passed for context creation to pygpu. With
CUDA, using "multi" means using the parameter
cudaDeviceScheduleYield. This is useful to lower the CPU overhead
when waiting for the GPU. One user found that it sped up his other
processes that were doing data augmentation.
.. attribute:: config.gpuarray.single_stream
Boolean value
......
......@@ -61,7 +61,7 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
usage
* ``none`` : use a slower implementation with minimal memory usage
* ``large`` : use a sometimes faster implementation with large memory usage
* ``fft`` : use the Fast Fourrier Transform implementation of convolution
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to cuDNN's heuristics and reused
......@@ -83,7 +83,7 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
* ``none`` (default) : use the default non-deterministic convolution
implementation
* ``deterministic`` : use a slower but deterministic implementation
* ``fft`` : use the Fast Fourrier Transform implementation of convolution
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to cuDNN's heuristics and reused
......
......@@ -38,7 +38,7 @@ There are also some top-level imports that you might find more convenient:
.. function:: function(...)
Alias for :func:`function.function`
Alias for :func:`theano.compile.function.function`
.. function:: function_dump(...)
......
......@@ -64,9 +64,9 @@ get an error when cuDNN can not be used with them, use this flag:
usage
* ``none`` : use a slower implementation with minimal memory usage
* ``large`` : use a sometimes faster implementation with large memory usage
* ``fft`` : use the Fast Fourrier Transform implementation of convolution
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``fft_tiling`` : use the Fast Fourrier Transform implementation of convolution
* ``fft_tiling`` : use the Fast Fourier Transform implementation of convolution
with tiling (high memory usage, but less than fft)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to cuDNN's heuristics and reused
......@@ -89,7 +89,7 @@ get an error when cuDNN can not be used with them, use this flag:
* ``none`` (default) : use the default non-deterministic convolution
implementation
* ``deterministic`` : use a slower but deterministic implementation
* ``fft`` : use the Fast Fourrier Transform implementation of convolution
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to cuDNN's heuristics and reused
......@@ -104,7 +104,7 @@ get an error when cuDNN can not be used with them, use this flag:
implementation selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
* (algo_bwd_data only) ``fft_tiling`` : use the Fast Fourrier
* (algo_bwd_data only) ``fft_tiling`` : use the Fast Fourier
Transform implementation of convolution with tiling (high memory
usage, but less than fft)
......
......@@ -85,6 +85,10 @@ floating-point precision.
Return a Variable for a 4-dimensional ndarray
.. function:: tensor5(name=None, dtype=config.floatX)
Return a Variable for a 5-dimensional ndarray
.. #COMMENT
Each of the types described above can be constructed by two methods:
a singular version (e.g., :ref:`dmatrix <libdoc_tensor_creation>`)
......@@ -112,66 +116,74 @@ They are all callable, and accept an optional ``name`` argument. So for example
table generated by
$ python Theano/doc/generate_dtype_tensor_table.py
============ =========== ==== =========== =================================
Constructor dtype ndim shape broadcastable
============ =========== ==== =========== =================================
bscalar int8 0 () ()
bvector int8 1 (?,) (False,)
brow int8 2 (1,?) (True, False)
bcol int8 2 (?,1) (False, True)
bmatrix int8 2 (?,?) (False, False)
btensor3 int8 3 (?,?,?) (False, False, False)
btensor4 int8 4 (?,?,?,?) (False, False, False, False)
wscalar int16 0 () ()
wvector int16 1 (?,) (False,)
wrow int16 2 (1,?) (True, False)
wcol int16 2 (?,1) (False, True)
wmatrix int16 2 (?,?) (False, False)
wtensor3 int16 3 (?,?,?) (False, False, False)
wtensor4 int16 4 (?,?,?,?) (False, False, False, False)
iscalar int32 0 () ()
ivector int32 1 (?,) (False,)
irow int32 2 (1,?) (True, False)
icol int32 2 (?,1) (False, True)
imatrix int32 2 (?,?) (False, False)
itensor3 int32 3 (?,?,?) (False, False, False)
itensor4 int32 4 (?,?,?,?) (False, False, False, False)
lscalar int64 0 () ()
lvector int64 1 (?,) (False,)
lrow int64 2 (1,?) (True, False)
lcol int64 2 (?,1) (False, True)
lmatrix int64 2 (?,?) (False, False)
ltensor3 int64 3 (?,?,?) (False, False, False)
ltensor4 int64 4 (?,?,?,?) (False, False, False, False)
dscalar float64 0 () ()
dvector float64 1 (?,) (False,)
drow float64 2 (1,?) (True, False)
dcol float64 2 (?,1) (False, True)
dmatrix float64 2 (?,?) (False, False)
dtensor3 float64 3 (?,?,?) (False, False, False)
dtensor4 float64 4 (?,?,?,?) (False, False, False, False)
fscalar float32 0 () ()
fvector float32 1 (?,) (False,)
frow float32 2 (1,?) (True, False)
fcol float32 2 (?,1) (False, True)
fmatrix float32 2 (?,?) (False, False)
ftensor3 float32 3 (?,?,?) (False, False, False)
ftensor4 float32 4 (?,?,?,?) (False, False, False, False)
cscalar complex64 0 () ()
cvector complex64 1 (?,) (False,)
crow complex64 2 (1,?) (True, False)
ccol complex64 2 (?,1) (False, True)
cmatrix complex64 2 (?,?) (False, False)
ctensor3 complex64 3 (?,?,?) (False, False, False)
ctensor4 complex64 4 (?,?,?,?) (False, False, False, False)
zscalar complex128 0 () ()
zvector complex128 1 (?,) (False,)
zrow complex128 2 (1,?) (True, False)
zcol complex128 2 (?,1) (False, True)
zmatrix complex128 2 (?,?) (False, False)
ztensor3 complex128 3 (?,?,?) (False, False, False)
ztensor4 complex128 4 (?,?,?,?) (False, False, False, False)
============ =========== ==== =========== =================================
============ =========== ==== ============ ===================================
Constructor dtype ndim shape broadcastable
============ =========== ==== ============ ===================================
bscalar int8 0 () ()
bvector int8 1 (?,) (False,)
brow int8 2 (1,?) (True, False)
bcol int8 2 (?,1) (False, True)
bmatrix int8 2 (?,?) (False, False)
btensor3 int8 3 (?,?,?) (False, False, False)
btensor4 int8 4 (?,?,?,?) (False, False, False, False)
btensor5 int8 5 (?,?,?,?,?) (False, False, False, False, False)
wscalar int16 0 () ()
wvector int16 1 (?,) (False,)
wrow int16 2 (1,?) (True, False)
wcol int16 2 (?,1) (False, True)
wmatrix int16 2 (?,?) (False, False)
wtensor3 int16 3 (?,?,?) (False, False, False)
wtensor4 int16 4 (?,?,?,?) (False, False, False, False)
wtensor5 int16 5 (?,?,?,?,?) (False, False, False, False, False)
iscalar int32 0 () ()
ivector int32 1 (?,) (False,)
irow int32 2 (1,?) (True, False)
icol int32 2 (?,1) (False, True)
imatrix int32 2 (?,?) (False, False)
itensor3 int32 3 (?,?,?) (False, False, False)
itensor4 int32 4 (?,?,?,?) (False, False, False, False)
itensor5 int32 5 (?,?,?,?,?) (False, False, False, False, False)
lscalar int64 0 () ()
lvector int64 1 (?,) (False,)
lrow int64 2 (1,?) (True, False)
lcol int64 2 (?,1) (False, True)
lmatrix int64 2 (?,?) (False, False)
ltensor3 int64 3 (?,?,?) (False, False, False)
ltensor4 int64 4 (?,?,?,?) (False, False, False, False)
ltensor5 int64 5 (?,?,?,?,?) (False, False, False, False, False)
dscalar float64 0 () ()
dvector float64 1 (?,) (False,)
drow float64 2 (1,?) (True, False)
dcol float64 2 (?,1) (False, True)
dmatrix float64 2 (?,?) (False, False)
dtensor3 float64 3 (?,?,?) (False, False, False)
dtensor4 float64 4 (?,?,?,?) (False, False, False, False)
dtensor5 float64 5 (?,?,?,?,?) (False, False, False, False, False)
fscalar float32 0 () ()
fvector float32 1 (?,) (False,)
frow float32 2 (1,?) (True, False)
fcol float32 2 (?,1) (False, True)
fmatrix float32 2 (?,?) (False, False)
ftensor3 float32 3 (?,?,?) (False, False, False)
ftensor4 float32 4 (?,?,?,?) (False, False, False, False)
ftensor5 float32 5 (?,?,?,?,?) (False, False, False, False, False)
cscalar complex64 0 () ()
cvector complex64 1 (?,) (False,)
crow complex64 2 (1,?) (True, False)
ccol complex64 2 (?,1) (False, True)
cmatrix complex64 2 (?,?) (False, False)
ctensor3 complex64 3 (?,?,?) (False, False, False)
ctensor4 complex64 4 (?,?,?,?) (False, False, False, False)
ctensor5 complex64 5 (?,?,?,?,?) (False, False, False, False, False)
zscalar complex128 0 () ()
zvector complex128 1 (?,) (False,)
zrow complex128 2 (1,?) (True, False)
zcol complex128 2 (?,1) (False, True)
zmatrix complex128 2 (?,?) (False, False)
ztensor3 complex128 3 (?,?,?) (False, False, False)
ztensor4 complex128 4 (?,?,?,?) (False, False, False, False)
ztensor5 complex128 5 (?,?,?,?,?) (False, False, False, False, False)
============ =========== ==== ============ ===================================
Plural Constructors
--------------------------
......@@ -220,11 +232,11 @@ If you would like to construct a tensor variable with a non-standard
broadcasting pattern, or a larger number of dimensions you'll need to create
your own :class:`TensorType` instance. You create such an instance by passing
the dtype and broadcasting pattern to the constructor. For example, you
can create your own 5-dimensional tensor type
can create your own 6-dimensional tensor type
>>> dtensor5 = TensorType('float64', (False,)*5)
>>> x = dtensor5()
>>> z = dtensor5('z')
>>> dtensor6 = TensorType('float64', (False,)*6)
>>> x = dtensor6()
>>> z = dtensor6('z')
You can also redefine some of the provided types and they will interact
correctly:
......@@ -1095,13 +1107,11 @@ Indexing
Like NumPy, Theano distinguishes between *basic* and *advanced* indexing.
Theano fully supports basic indexing
(see `NumPy's indexing <http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html>`_).
`Integer advanced indexing
<http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#integer>`_
will be supported in 0.6rc4 (or the development version). We do not
support boolean masks, as Theano does not have a boolean type (we use
int8 for the output of logic operators).
(see `NumPy's indexing <http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html>`_)
and `integer advanced indexing
<http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#integer>`_. We do not
support boolean masks, as Theano does not have a boolean type (we use int8 for the output of
logic operators).
.. testsetup:: indexing
......
Requirements
============
.. note::
We only support the installation of the requirements through conda.
.. _BLAS: http://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms
.. _Python: http://www.python.org/
Python_ >= 2.6 or >= 3.3
The development package (python-dev or python-devel on most Linux distributions) is recommended (see just below). Python 2.4 was supported up to and including the release 0.6. Python 3 is supported past the 3.3 release.
`NumPy <http://numpy.scipy.org/>`_ >= 1.7.1 < 1.11.1
Earlier versions could work, but we don't test them.
`SciPy <http://scipy.org>`_ >= 0.11 < 0.17.1
Only currently required for sparse matrix and special functions support, but highly recommended. SciPy >=0.8 could work, but earlier versions have known bugs with sparse matrices.
`BLAS`_ installation (with Level 3 functionality)
* **Recommended**: MKL, which is free through Conda.
* Alternatively, we suggest to install OpenBLAS, with the development headers (``-dev``, ``-devel``, depending on your Linux distribution).
**Optional requirements**
``python-dev``, ``g++`` >= 4.2
**Highly recommended.** Theano can fall back on a NumPy-based Python execution model, but a C compiler allows for vastly faster execution.
`nose <http://nose.readthedocs.io/en/latest/>`_ >= 1.3.0
Recommended, to run Theano's test-suite.
`Sphinx <http://sphinx.pocoo.org/>`_ >= 0.5.1, `pygments <http://pygments.org/>`_
For building the documentation. LaTeX_ and dvipng_ are also necessary for math to show up as images.
`pydot-ng <https://github.com/pydot/pydot-ng>`_
To handle large pictures for gifs/images.
`NVIDIA CUDA drivers and SDK`_
**Highly recommended** Required for GPU code generation/execution on NVIDIA gpus. See instruction below.
`libgpuarray`_
Required for GPU/CPU code generation on CUDA and OpenCL devices (see: :ref:`gpuarray`.)
Requirements installation through Conda (recommended)
-----------------------------------------------------
Install Miniconda
^^^^^^^^^^^^^^^^^
Follow this `link <http://conda.pydata.org/miniconda.html>`__ to install Miniconda.
.. note::
If you want fast compiled code (recommended), make sure you have g++ (Windows/Linux) or Clang (OS X) installed.
Install requirements and optional packages
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: bash
conda install numpy scipy mkl <nose> <sphinx> <pydot-ng>
* Arguments between <...> are optional.
Install and configure the GPU drivers (recommended)
---------------------------------------------------
.. warning::
OpenCL support is still minimal for now.
1. Install CUDA drivers
* Follow `this link <https://developer.nvidia.com/cuda-downloads>`__
to install the CUDA driver and the CUDA Toolkit.
* You must reboot the computer after the driver installation.
* Test that it was loaded correctly after the reboot, executing the
command `nvidia-smi` from the command line.
.. note::
Sanity check: The *bin* subfolder should contain an *nvcc*
program. This folder is called the *cuda root* directory.
2. Fix 'lib' path
* Add the 'lib' subdirectory (and/or 'lib64' subdirectory if you have a
64-bit OS) to your ``$LD_LIBRARY_PATH`` environment
variable.
3. Set Theano's config flags
To use the GPU you need to define the *cuda root*. You can do it in one
of the following ways:
* Define a $CUDA_ROOT environment variable to equal the cuda root directory, as in ``CUDA_ROOT=/path/to/cuda/root``, or
* add a ``cuda.root`` flag to :envvar:`THEANO_FLAGS`, as in ``THEANO_FLAGS='cuda.root=/path/to/cuda/root'``, or
* add a [cuda] section to your .theanorc file containing the option ``root = /path/to/cuda/root``.
.. _LaTeX: http://www.latex-project.org/
.. _dvipng: http://savannah.nongnu.org/projects/dvipng/
.. _NVIDIA CUDA drivers and SDK: http://developer.nvidia.com/object/gpucomputing.html
.. _libgpuarray: http://deeplearning.net/software/libgpuarray/installation.html
......@@ -54,6 +54,10 @@ if __name__ == '__main__':
pythonpath = os.pathsep.join([throot, pythonpath])
sys.path[0:0] = [throot] # We must not use os.environ.
# Make sure we don't use gpu to compile documentation
env_th_flags = os.environ.get('THEANO_FLAGS', '')
os.environ['THEANO_FLAGS'] = 'device=cpu,force_device=True'
def call_sphinx(builder, workdir):
import sphinx
if options['--check']:
......@@ -99,3 +103,6 @@ if __name__ == '__main__':
# To go back to the original current directory.
os.chdir(currentdir)
# Reset THEANO_FLAGS
os.environ['THEANO_FLAGS'] = env_th_flags
差异被折叠。
......@@ -175,13 +175,13 @@ by :ref:`broadcasting <libdoc_tensor_broadcastable>`.
The following types are available:
* **byte**: ``bscalar, bvector, bmatrix, brow, bcol, btensor3, btensor4``
* **16-bit integers**: ``wscalar, wvector, wmatrix, wrow, wcol, wtensor3, wtensor4``
* **32-bit integers**: ``iscalar, ivector, imatrix, irow, icol, itensor3, itensor4``
* **64-bit integers**: ``lscalar, lvector, lmatrix, lrow, lcol, ltensor3, ltensor4``
* **float**: ``fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4``
* **double**: ``dscalar, dvector, dmatrix, drow, dcol, dtensor3, dtensor4``
* **complex**: ``cscalar, cvector, cmatrix, crow, ccol, ctensor3, ctensor4``
* **byte**: ``bscalar, bvector, bmatrix, brow, bcol, btensor3, btensor4, btensor5``
* **16-bit integers**: ``wscalar, wvector, wmatrix, wrow, wcol, wtensor3, wtensor4, wtensor5``
* **32-bit integers**: ``iscalar, ivector, imatrix, irow, icol, itensor3, itensor4, itensor5``
* **64-bit integers**: ``lscalar, lvector, lmatrix, lrow, lcol, ltensor3, ltensor4, ltensor5``
* **float**: ``fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4, ftensor5``
* **double**: ``dscalar, dvector, dmatrix, drow, dcol, dtensor3, dtensor4, dtensor5``
* **complex**: ``cscalar, cvector, cmatrix, crow, ccol, ctensor3, ctensor4, ctensor5``
The previous list is not exhaustive and a guide to all types compatible
with NumPy arrays may be found here: :ref:`tensor creation<libdoc_tensor_creation>`.
......
.. _using_gpu:
=============
......@@ -19,11 +18,33 @@ There are two ways currently to use a gpu, one that should support any OpenCL
device as well as NVIDIA cards (:ref:`gpuarray`), and the old backend that
only supports NVIDIA cards (:ref:`cuda`).
Using the GPU in Theano is as simple as setting the ``device`` configuration
flag to ``device=cuda`` (or ``device=gpu`` for the old backend). You can optionally target a specific gpu by specifying
the number of the gpu as in e.g. ``device=cuda2``. You also need to set the
default floating point precision.
For example: ``THEANO_FLAGS='cuda.root=/path/to/cuda/root,device=cuda,floatX=float32'``.
You can also set these options in the .theanorc file's ``[global]`` section:
.. code-block:: cfg
[global]
device = cuda
floatX = float32
.. warning::
If you want to use the new GpuArray backend, make sure to have the
development version of Theano installed. The 0.8.X releases have not
been optimized to work correctly with the new backend.
The old CUDA backend will be deprecated soon, in favor of the new libgpuarray
backend.
.. note::
* If your computer has multiple GPUs and you use ``device=cuda``, the driver
selects the one to use (usually gpu0).
* You can use the program ``nvidia-smi`` to change this policy.
* By default, when ``device`` indicates preference for GPU computations,
Theano will fall back to the CPU if there is a problem with the GPU.
You can use the flag ``force_device=True`` to instead raise an error when
Theano cannot use the GPU.
.. _gpuarray:
......@@ -31,19 +52,32 @@ GpuArray Backend
----------------
If you have not done so already, you will need to install libgpuarray
as well as at least one computing toolkit. Instructions for doing so
are provided at `libgpuarray <http://deeplearning.net/software/libgpuarray/installation.html>`_.
as well as at least one computing toolkit (CUDA or OpenCL). Detailed
instructions to accomplish that are provided at
`libgpuarray <http://deeplearning.net/software/libgpuarray/installation.html>`_.
To install Nvidia's GPU-programming toolchain (CUDA) and configure
Theano to use it, see the installation instructions for
:ref:`Linux <gpu_linux>`, :ref:`MacOS <gpu_macos>` and :ref:`Windows <gpu_windows>`.
While all types of devices are supported if using OpenCL, for the
remainder of this section, whatever compute device you are using will
be referred to as GPU.
.. warning::
If you want to use the new GpuArray backend, make sure to have the
development version of Theano installed. The 0.8.X releases have not
been optimized to work correctly with the new backend.
.. warning::
The backend was designed to support OpenCL, however current support is
incomplete. A lot of very useful ops still do not support it because they
were ported from the old backend with minimal change.
.. _testing_the_gpu:
Testing Theano with GPU
~~~~~~~~~~~~~~~~~~~~~~~
......@@ -150,7 +184,7 @@ the GPU object directly. The following code is modified to do just that.
Here ``tensor.exp(x).transfer(None)`` means "copy ``exp(x)`` to the GPU",
with ``None`` the default GPU context when not explicitly given.
For information on how to set GPU contexts, see :ref:`tut_using_multi_gpu`.
For information on how to set GPU contexts, see :ref:`tut_using_multi_gpu`.
The output is
......@@ -227,10 +261,10 @@ Tips for Improving Performance on GPU
``.theanorc`` file if you plan to do a lot of GPU work.
* The GPU backend supports *float64* variables, but they are still slower
to compute than *float32*. The more *float32*, the better GPU performance
you will get.
* Prefer constructors like ``matrix``, ``vector`` and ``scalar`` (which
you will get.
* Prefer constructors like ``matrix``, ``vector`` and ``scalar`` (which
follow the type set in ``floatX``) to ``dmatrix``, ``dvector`` and
``dscalar``. The latter enforce double precision (*float64* on most
``dscalar``. The latter enforce double precision (*float64* on most
machines), which slows down GPU computations on current hardware.
* Minimize transfers to the GPU device by using ``shared`` variables
to store frequently-accessed data (see :func:`shared()<shared.shared>`).
......
.. include:: css.inc
.. _updating:
Updating Theano
===============
Follow one of these three sections depending on how you installed Theano.
You should update frequently, bugs are fixed on a very regular basis, and features are
added even more frequently!
Stable Installation
-------------------
The following command will update only Theano:
.. raw:: html
<pre><span class="red">&#60;sudo&#62;</span> pip install <span class="blue">&#60;--user&#62;</span> <span class="pink">&#60;--no-deps&#62;</span> theano</pre>
- Use :red:`sudo` for a root installation.
- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
- Use :pink:`no-deps` when you do not want pip to install the dependencies of Theano. This is important when they have already been installed as system packages.
.. warning::
If you installed NumPy/SciPy with yum/apt-get, updating NumPy/SciPy
with pip/easy_install is not always a good idea. This can make Theano
crash due to problems with BLAS. The versions of
NumPy/SciPy in the distribution are sometimes linked against faster
versions of BLAS. Installing NumPy/SciPy with
yum/apt-get/pip/easy_install won't install the development package
needed to recompile it with the fast version.
To fix a possible crash, you can clear
the Theano cache like this:
.. code-block:: bash
theano-cache clear
Bleeding-Edge Installation
--------------------------
The following command will update your bleeding-edge version of Theano
.. raw:: html
<div style="width:100%"><pre><span class="red">&#60;sudo&#62;</span> pip install <span class="blue">&#60;--user&#62;</span> <span class="pink">&#60;--no-deps&#62;</span> git+https://github.com/Theano/Theano.git#egg=Theano</pre></div>
- Use :red:`sudo` for a root installation.
- Use :blue:`user` for a user installation without admin rights. It will install Theano in your local site-packages.
- Use :pink:`no-deps` when you do not want pip to install the dependencies of Theano. This is important when they have already been installed as system packages.
.. warning::
If you installed NumPy/SciPy with yum/apt-get, updating NumPy/SciPy
with pip/easy_install is not always a good idea. This can make Theano
crash due to problems with BLAS. The versions of
NumPy/SciPy in the distribution are sometimes linked against faster
versions of BLAS. Installing NumPy/SciPy with
yum/apt-get/pip/easy_install won't install the development package
needed to recompile it with the fast version.
To fix a possible crash, you can clear
the Theano cache like this:
.. code-block:: bash
theano-cache clear
Developer Installation
----------------------
To update your library to the latest revision, change directory (``cd``)
to your ``Theano`` folder and execute the following command:
.. warning::
The following assumes you have knowledge of git and know how to do a rebase.
.. code-block:: bash
git pull --rebase
......@@ -166,7 +166,7 @@ def do_setup():
install_requires=['numpy>=1.7.1', 'scipy>=0.11', 'six>=1.9.0'],
# pygments is a dependency for Sphinx code highlight
extras_require={
'test': ['nose>=1.3.0', 'nose-parameterized>=0.5.0'],
'test': ['nose>=1.3.0', 'nose-parameterized>=0.5.0', 'flake8<3'],
'doc': ['Sphinx>=0.5.1', 'pygments']
},
package_data={
......
......@@ -147,7 +147,7 @@ class BadThunkOutput(DebugModeError):
print(" thunk2 :", self.thunk2, file=sio)
# Don't import it at the top of the file to prevent circular import.
utt = theano.tests.unittest_tools
import theano.tests.unittest_tools as utt
print(utt.str_diagnostic(self.val1, self.val2, None, None), file=sio)
ret = sio.getvalue()
return ret
......@@ -1769,12 +1769,13 @@ class _Linker(gof.link.LocalLinker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
assert type(self) is _Linker
return type(self)(maker=self.maker).accept(fgraph, no_recycling)
return type(self)(maker=self.maker).accept(
fgraph, no_recycling, profile)
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......
......@@ -28,7 +28,7 @@ def function_dump(filename, inputs, outputs=None, mode=None, updates=None,
on_unused_input=None,
extra_tag_to_remove=None):
"""
This is helpful to make a reproducable case for problem during Theano
This is helpful to make a reproducible case for problems during Theano
compilation.
Ex:
......@@ -36,13 +36,13 @@ def function_dump(filename, inputs, outputs=None, mode=None, updates=None,
replace `theano.function(...)` by
`theano.function_dump('filename.pkl', ...)`.
If you see this, you where probably asked to use this function to
If you see this, you were probably asked to use this function to
help debug a particular case during the compilation of a Theano
function. `function_dump` allows to easily reproduce your
compilation without asking any code. It pickle all the objects and
function. `function_dump` allows you to easily reproduce your
compilation without generating any code. It pickles all the objects and
parameters needed to reproduce a call to `theano.function()`. This
include shared variables and there values. If you do not want
that, you can set to replace shared variables values by zeros by
includes shared variables and their values. If you do not want
that, you can choose to replace shared variables values with zeros by
calling set_value(...) on them before calling `function_dump`.
To load such a dump and do the compilation:
......@@ -53,9 +53,9 @@ def function_dump(filename, inputs, outputs=None, mode=None, updates=None,
>>> f = theano.function(**d) # doctest: +SKIP
Note:
The parameter extra_tag_to_remove, is passed to the StripPickler used.
The parameter `extra_tag_to_remove` is passed to the StripPickler used.
To pickle graph made by Blocks, it must be:
['annotations', 'replacement_of', 'aggregation_scheme', 'roles']
`['annotations', 'replacement_of', 'aggregation_scheme', 'roles']`
"""
assert isinstance(filename, string_types)
......@@ -78,7 +78,8 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
rebuild_strict=True, allow_input_downcast=None, profile=None,
on_unused_input=None):
"""
Return a callable object that will calculate `outputs` from `inputs`.
Return a :class:`callable object <theano.compile.function_module.Function>`
that will calculate `outputs` from `inputs`.
Parameters
----------
......@@ -100,6 +101,10 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
If True, do not perform any automatic update on Variables. If False
(default), perform them all. Else, perform automatic updates on all
Variables that are neither in "updates" nor in "no_default_updates".
accept_inplace : bool
True iff the graph can contain inplace operations prior to the
optimization phase (default is False). *Note* this parameter is unsupported,
and its use is not recommended.
name : str
An optional name for this function. The profile mode will print the time
spent in this function.
......@@ -115,10 +120,10 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
Ops in the graph, an Exception will be raised.
allow_input_downcast: bool or None
True means that the values passed as inputs when calling the function
can be silently downcasted to fit the dtype of the corresponding
can be silently down-casted to fit the dtype of the corresponding
Variable, which may lose precision. False means that it will only be
cast to a more general, or precise, type. None (default) is almost like
False, but allows downcasting of Python float scalars to floatX.
False, but allows down-casting of Python float scalars to floatX.
profile: None, True, or ProfileStats instance
Accumulate profiling information into a given ProfileStats instance.
If argument is `True` then a new ProfileStats instance will be used.
......@@ -131,7 +136,7 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
Returns
-------
Function instance
:class:`theano.compile.function_module.Function` instance
A callable object that will compute the outputs (given the inputs) and
update the implicit function arguments according to the `updates`.
......@@ -209,9 +214,9 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
4. Linker
The linker uses a Python loop to execute the code associated
with all the Apply nodes in the graph in the correct order.
The CVM is a linker that replaces this Python loop with a C
loop to avoid continuously changing between Python and C.
The CVM is faster for 2 reasons:
The C Virtual Machine (CVM) is a linker that replaces this
Python loop with a C loop to avoid continuously changing
between Python and C. The CVM is faster for 2 reasons:
1) Its internal logic is in C, so no Python interpreter
overhead.
2) It makes native calls from the VM logic into thunks that
......@@ -219,7 +224,6 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
The VM is a linker that was developed to prototype the CVM. It
was easier to develop the VM in Python and then translate it to C
than to write it in C from scratch.
CVM stands for C Virtual Machine.
"""
if isinstance(outputs, dict):
......@@ -252,7 +256,7 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
func_frame = stack[idx - 1]
while "theano/gof" in func_frame[0] and idx > 0:
idx -= 1
# This can hapen if we call var.eval()
# This can happen if we call var.eval()
func_frame = stack[idx - 1]
name = func_frame[0] + ':' + str(func_frame[1])
......
......@@ -735,9 +735,13 @@ class Function(object):
kwargs : dict
The function inputs can be passed as keyword argument. For this, use
the name of the input or the input instance as the key.
Keyword argument ``output_subset`` is a list of either indices of the
function's outputs or the keys belonging to the `output_keys` dict
and represent outputs that are requested to be calculated.
and represent outputs that are requested to be calculated. Regardless
of the presence of ``output_subset``, the updates are always calculated
and processed. To disable the updates, you should use the ``copy``
method with ``delete_updates=True``.
Returns
-------
......@@ -1496,9 +1500,10 @@ class FunctionMaker(object):
if not spec.borrow]
if no_borrow:
self.linker = linker.accept(
fgraph, no_recycling=infer_reuse_pattern(fgraph, no_borrow))
fgraph, no_recycling=infer_reuse_pattern(fgraph, no_borrow),
profile=profile)
else:
self.linker = linker.accept(fgraph)
self.linker = linker.accept(fgraph, profile=profile)
if hasattr(linker, 'accept_var_updates'):
# hacky thing so VMLinker knows about updates
......@@ -1722,8 +1727,8 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False,
Default of None means to use `config.mode` (see below for descriptive
string list).
name : str
An optional name for this fct. If used, the profile mode will print the
time spent in this fct.
An optional name for this function. If used, the profile mode will print the
time spent in this function.
accept_inplace : bool
True iff the graph can contain inplace operations prior to the
optimization phase (default is False).
......
......@@ -5,8 +5,6 @@ WRITEME
from __future__ import absolute_import, print_function, division
import logging
import numpy
import theano
from theano import gof
import theano.gof.vm
......@@ -18,35 +16,6 @@ from six import string_types
_logger = logging.getLogger('theano.compile.mode')
def check_equal(x, y):
"""
Returns True iff x[0] and y[0] are equal (checks the dtype and shape if x
and y are numpy.ndarray instances). Used internally.
"""
# I put the import here to allow using theano without scipy.
import scipy.sparse as sp
x, y = x[0], y[0]
# TODO: bug in current scipy, two sparse matrices are never equal,
# remove when moving to 0.7
if sp.issparse(x):
x = x.todense()
if sp.issparse(y):
y = y.todense()
if isinstance(x, numpy.ndarray) and isinstance(y, numpy.ndarray):
if (x.dtype != y.dtype or
x.shape != y.shape or
numpy.any(abs(x - y) > 1e-10)):
raise Exception("Output mismatch.",
{'performlinker': x, 'clinker': y})
else:
if x != y:
raise Exception("Output mismatch.",
{'performlinker': x, 'clinker': y})
# If a string is passed as the linker argument in the constructor for
# Mode, it will be used as the key to retrieve the real linker in this
# dictionary
......@@ -384,7 +353,7 @@ predefined_modes = {'FAST_COMPILE': FAST_COMPILE,
'FAST_RUN': FAST_RUN,
}
instanciated_default_mode = None
instantiated_default_mode = None
def get_mode(orig_string):
......@@ -395,17 +364,17 @@ def get_mode(orig_string):
if not isinstance(string, string_types):
return string # it is hopefully already a mode...
global instanciated_default_mode
global instantiated_default_mode
# The default mode is cached. However, config.mode can change
# If instanciated_default_mode has the right class, use it.
if orig_string is None and instanciated_default_mode:
# If instantiated_default_mode has the right class, use it.
if orig_string is None and instantiated_default_mode:
if string in predefined_modes:
default_mode_class = predefined_modes[string].__class__.__name__
else:
default_mode_class = string
if (instanciated_default_mode.__class__.__name__ ==
if (instantiated_default_mode.__class__.__name__ ==
default_mode_class):
return instanciated_default_mode
return instantiated_default_mode
if string in ['Mode', 'ProfileMode', 'DebugMode', 'NanGuardMode']:
if string == 'DebugMode':
......@@ -422,6 +391,7 @@ def get_mode(orig_string):
# This might be required if the string is 'ProfileMode'
from .profilemode import ProfileMode # noqa
from .profilemode import prof_mode_instance_to_print
# TODO: Can't we look up the name and invoke it rather than using eval here?
ret = eval(string +
'(linker=config.linker, optimizer=config.optimizer)')
elif string in predefined_modes:
......@@ -437,7 +407,7 @@ def get_mode(orig_string):
ret = ret.including(*theano.config.optimizer_including.split(':'))
if theano.config.optimizer_requiring:
ret = ret.requiring(*theano.config.optimizer_requiring.split(':'))
instanciated_default_mode = ret
instantiated_default_mode = ret
# must tell python to print the summary at the end.
if string == 'ProfileMode':
......
......@@ -41,7 +41,7 @@ def flatten(l):
return rval
def contains_nan(arr, node=None):
def contains_nan(arr, node=None, var=None):
"""
Test whether a numpy.ndarray contains any `np.nan` values.
......@@ -50,6 +50,7 @@ def contains_nan(arr, node=None):
arr : np.ndarray or output of any Theano op
node : None or an Apply instance.
If arr is the output of a Theano op, the node associated to it.
var : The Theano symbolic variable.
Returns
-------
......@@ -68,6 +69,8 @@ def contains_nan(arr, node=None):
return False
elif isinstance(arr, np.random.mtrand.RandomState):
return False
elif var and getattr(var.tag, 'is_rng', False):
return False
elif isinstance(arr, slice):
return False
elif arr.size == 0:
......@@ -86,7 +89,7 @@ def contains_nan(arr, node=None):
return np.isnan(np.min(arr))
def contains_inf(arr, node=None):
def contains_inf(arr, node=None, var=None):
"""
Test whether a numpy.ndarray contains any `np.inf` values.
......@@ -95,6 +98,7 @@ def contains_inf(arr, node=None):
arr : np.ndarray or output of any Theano op
node : None or an Apply instance.
If the output of a Theano op, the node associated to it.
var : The Theano symbolic variable.
Returns
-------
......@@ -114,6 +118,8 @@ def contains_inf(arr, node=None):
return False
elif isinstance(arr, np.random.mtrand.RandomState):
return False
elif var and getattr(var.tag, 'is_rng', False):
return False
elif isinstance(arr, slice):
return False
elif arr.size == 0:
......@@ -215,44 +221,47 @@ class NanGuardMode(Mode):
assert nan_is_error or inf_is_error or big_is_error
compile_gpu_func(nan_is_error, inf_is_error, big_is_error)
def do_check_on(var, nd):
def do_check_on(value, nd, var=None):
"""
Checks `var` for NaNs / Infs. If detected, raises an exception
Checks `value` for NaNs / Infs. If detected, raises an exception
and / or prints information about `nd`, `f`, and `is_input` to
help the user determine the cause of the invalid values.
Parameters
----------
var : numpy.ndarray
value : numpy.ndarray
The value to be checked.
nd : theano.gof.Apply
The Apply node being executed.
var : theano.gof.Variable
Not used if nd is there. Otherwise, used to print the stack
trace for inputs of the graph.
"""
error = False
sio = StringIO()
if nan_is_error:
if contains_nan(var, nd):
if contains_nan(value, nd, var):
print('NaN detected', file=sio)
error = True
if inf_is_error:
if contains_inf(var, nd):
if contains_inf(value, nd, var):
print('Inf detected', file=sio)
error = True
if big_is_error:
err = False
if isinstance(var, theano.gof.type.CDataType._cdata_type):
if isinstance(value, theano.gof.type.CDataType._cdata_type):
err = False
elif isinstance(var, np.random.mtrand.RandomState):
elif isinstance(value, np.random.mtrand.RandomState):
err = False
elif isinstance(var, slice):
elif isinstance(value, slice):
err = False
elif var.size == 0:
elif value.size == 0:
err = False
elif cuda.cuda_available and isinstance(var, cuda.CudaNdarray):
err = (f_gpuabsmax(var.reshape(var.size)) > 1e10)
elif cuda.cuda_available and isinstance(value, cuda.CudaNdarray):
err = (f_gpuabsmax(value.reshape(value.size)) > 1e10)
else:
err = (np.abs(var).max() > 1e10)
err = (np.abs(value).max() > 1e10)
if err:
print('Big value detected', file=sio)
error = True
......@@ -264,6 +273,11 @@ class NanGuardMode(Mode):
else:
print("NanGuardMode found an error in an input of the "
"graph.", file=sio)
# Add the stack trace
if nd:
var = nd.outputs[0]
print(theano.gof.utils.get_variable_trace_string(var),
file=sio)
msg = sio.getvalue()
if config.NanGuardMode.action == 'raise':
raise AssertionError(msg)
......@@ -281,7 +295,7 @@ class NanGuardMode(Mode):
def nan_check_input(var, value):
if getattr(var.tag, 'nan_guard_mode_check', True):
do_check_on(value, None)
do_check_on(value, None, var=var)
wrap_linker = theano.gof.vm.VM_Linker(callback=nan_check,
callback_input=nan_check_input)
......
......@@ -306,6 +306,10 @@ def pfunc(params, outputs=None, mode=None, updates=None, givens=None,
If False (default), perform them all. Else, perform automatic updates
on all Variables that are neither in "updates" nor in
"no_default_updates".
accept_inplace : bool
True iff the graph can contain inplace operations prior to the
optimization phase (default is False). *Note* this parameter is unsupported,
and its use is not recommended.
name : None or string
Attaches a name to the profiling result of this function.
allow_input_downcast : bool
......
......@@ -72,7 +72,8 @@ def _atexit_print_fn():
for ps in to_sum[1:]:
for attr in ["compile_time", "fct_call_time", "fct_callcount",
"vm_call_time", "optimizer_time", "linker_time",
"validate_time", "import_time"]:
"validate_time", "import_time",
"linker_node_make_thunks"]:
setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr))
# merge dictonary
......@@ -190,6 +191,8 @@ class ProfileStats(object):
import_time = 0.0
# time spent in importing compiled python module.
linker_node_make_thunks = 0.0
line_width = config.profiling.output_line_width
nb_nodes = -1
......@@ -665,6 +668,8 @@ class ProfileStats(object):
print(' Theano Linker time (includes C, CUDA code '
'generation/compiling): %es' % self.linker_time, file=file)
print(' Import time %es' % self.import_time, file=file)
print(' Node make_thunk time %es' % self.linker_node_make_thunks,
file=file)
print('', file=file)
# The validation time is a subset of optimizer_time
......
......@@ -242,6 +242,15 @@ AddConfigVar('gpuarray.preallocate',
FloatParam(0),
in_c_key=False)
AddConfigVar('gpuarray.sched',
"""The sched parameter passed for context creation to pygpu.
With CUDA, using "multi" is equivalent to using the parameter
cudaDeviceScheduleYield. This is useful to lower the
CPU overhead when waiting for GPU. One user found that it
speeds up his other processes that was doing data augmentation.
""",
EnumStr("default", "multi", "single"))
AddConfigVar('gpuarray.single_stream',
"""
If your computations are mostly lots of small elements,
......@@ -1630,6 +1639,8 @@ def short_platform(r=None, p=None):
return p
compiledir_format_dict['short_platform'] = short_platform()
# Allow to have easily one compiledir per device.
compiledir_format_dict['device'] = config.device
compiledir_format_keys = ", ".join(sorted(compiledir_format_dict.keys()))
default_compiledir_format = ("compiledir_%(short_platform)s-%(processor)s-"
"%(python_version)s-%(python_bitwidth)s")
......
......@@ -8,6 +8,7 @@ import os
import shlex
import sys
import warnings
from functools import wraps
from six import StringIO
......@@ -96,6 +97,7 @@ def change_flags(**kwargs):
Useful during tests.
"""
def change_flags_exec(f):
@wraps(f)
def inner(*args, **kwargs_):
old_val = {}
for k in kwargs:
......@@ -117,9 +119,6 @@ def change_flags(**kwargs):
assert len(l) == 1
l[0].__set__(None, old_val[k])
# Make sure that the name of the decorated function remains the same.
inner.__name__ = f.__name__
return inner
return change_flags_exec
......
......@@ -548,7 +548,7 @@ class CLinker(link.Linker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Associate linker with fgraph
......@@ -557,7 +557,8 @@ class CLinker(link.Linker):
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)(self.schedule).accept(fgraph, no_recycling)
return type(self)(self.schedule).accept(
fgraph, no_recycling, profile)
self.fgraph = fgraph
self.fetch_variables()
self.no_recycling = no_recycling
......@@ -717,7 +718,7 @@ class CLinker(link.Linker):
[get_c_declare, get_c_extract_out,
(get_c_sync, get_c_cleanup)]]
else:
raise Exception("what the fuck")
raise Exception("this shouldn't be possible, please report this exception")
builder, block = struct_variable_codeblocks(variable, policy,
id, symbol, sub)
......@@ -1737,7 +1738,7 @@ class OpWiseCLinker(link.LocalLinker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Associate linker with fgraph
"""
......@@ -1750,7 +1751,7 @@ class OpWiseCLinker(link.LocalLinker):
allow_gc=self.allow_gc,
nice_errors=self.nice_errors,
schedule=self.schedule,
).accept(fgraph, no_recycling)
).accept(fgraph, no_recycling, profile)
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......@@ -1897,7 +1898,7 @@ class DualLinker(link.Linker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Update/tie self with fgraph
"""
......@@ -1905,7 +1906,7 @@ class DualLinker(link.Linker):
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
return type(self)(self.checker, self.schedule).accept(
fgraph, no_recycling)
fgraph, no_recycling, profile)
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......
......@@ -789,15 +789,47 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
{
CLazyLinker * self = (CLazyLinker*)_self;
static char *kwlist[] = {
(char*)"time_thunks",
(char *)"time_thunks",
(char *)"n_calls",
(char *)"output_subset",
NULL};
int n_calls=1;
if (! PyArg_ParseTupleAndKeywords(args, kwds, "|ii", kwlist,
PyObject *output_subset_ptr = NULL;
if (! PyArg_ParseTupleAndKeywords(args, kwds, "|iiO", kwlist,
&self->do_timing,
&n_calls))
&n_calls,
&output_subset_ptr))
return NULL;
int err = 0;
// parse an output_subset list
// it is stored as a bool list of length n_output_vars: calculate a var or not
char *output_subset = NULL;
int output_subset_size = -1;
if (output_subset_ptr != NULL)
{
if (! PyList_Check(output_subset_ptr))
{
err = 1;
PyErr_SetString(PyExc_RuntimeError, "Output_subset is not a list");
}
else
{
output_subset_size = PyList_Size(output_subset_ptr);
output_subset = (char*)calloc(self->n_output_vars, sizeof(char));
for (int it = 0; it < output_subset_size; ++it)
{
PyObject *elem = PyList_GetItem(output_subset_ptr, it);
if (! PyInt_Check(elem))
{
err = 1;
PyErr_SetString(PyExc_RuntimeError, "Some elements of output_subset list are not int");
}
output_subset[PyInt_AsLong(elem)] = 1;
}
}
}
self->position_of_error = -1;
// create constants used to fill the var_compute_cells
PyObject * one = PyInt_FromLong(1);
......@@ -833,9 +865,13 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
}
}
int first_updated = self->n_output_vars - self->n_updates;
for (int i = 0; i < self->n_output_vars && (!err); ++i)
{
err = lazy_rec_eval(self, self->output_vars[i], one, zero);
if (i >= first_updated || output_subset == NULL || output_subset[i] == 1)
{
err = lazy_rec_eval(self, self->output_vars[i], one, zero);
}
}
if (!err)
......@@ -848,7 +884,8 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
{
Py_ssize_t src = self->output_vars[i];
PyObject * item = PyList_GetItem(self->var_value_cells[src], 0);
if (self->var_computed[src] != 1)
if ((output_subset == NULL || output_subset[i]) &&
self->var_computed[src] != 1)
{
err = 1;
PyErr_Format(PyExc_AssertionError,
......@@ -876,7 +913,7 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
}
/*
Clear everything that is left and not an output. This is needed
Clear everything that is left and not an output. This is needed
for lazy evaluation since the current GC algo is too conservative
with lazy graphs.
*/
......@@ -901,6 +938,9 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
PyList_SetItem(self->var_value_cells[i], 0, Py_None);
}
}
if (output_subset != NULL)
free(output_subset);
Py_DECREF(one);
Py_DECREF(zero);
if (err)
......@@ -1014,7 +1054,7 @@ static PyTypeObject lazylinker_ext_CLazyLinkerType = {
static PyObject * get_version(PyObject *dummy, PyObject *args)
{
PyObject *result = PyFloat_FromDouble(0.21);
PyObject *result = PyFloat_FromDouble(0.211);
return result;
}
......
......@@ -15,7 +15,7 @@ from theano.gof import cmodule
_logger = logging.getLogger('theano.gof.lazylinker_c')
force_compile = False
version = 0.21 # must match constant returned in function get_version()
version = 0.211 # must match constant returned in function get_version()
lazylinker_ext = None
......@@ -145,4 +145,4 @@ except ImportError:
release_lock()
from lazylinker_ext.lazylinker_ext import * # noqa
assert force_compile or (version == get_version())
assert force_compile or (version == get_version()) # noqa
......@@ -762,7 +762,7 @@ class PerformLinker(LocalLinker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Parameters
......@@ -781,7 +781,8 @@ class PerformLinker(LocalLinker):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
return type(self)(allow_gc=self.allow_gc).accept(fgraph, no_recycling)
return type(self)(allow_gc=self.allow_gc).accept(
fgraph, no_recycling, profile)
# raise Exception("Cannot accept from a Linker that is already tied to another FunctionGraph.")
self.fgraph = fgraph
self.no_recycling = no_recycling
......@@ -944,7 +945,7 @@ class WrapLinker(Linker):
linkers=[l.clone(allow_gc=allow_gc) for l in self.linkers],
wrapper=self.wrapper)
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Parameters
......
......@@ -5,7 +5,8 @@ from theano.gof.type import Type
from theano.gof import graph
from theano.gof.graph import Variable, Apply
from theano.gof.op import Op
from theano.gof.opt import * # noqa
from theano.gof.opt import (OpKeyOptimizer, PatternSub, NavigatorOptimizer,
TopoOptimizer, OpSub)
from theano.gof import destroyhandler
from theano.gof.fg import FunctionGraph, InconsistencyError
......
......@@ -11,7 +11,7 @@ from theano.gof.type import Type
from theano.gof.op import Op
from theano.gof import fg
from theano.gof.link import * # noqa
from theano.gof.link import PerformLinker, WrapLinker, Container
from theano.compat import cmp
......
......@@ -3,9 +3,11 @@ from __future__ import absolute_import, print_function, division
from theano.gof.type import Type
from theano.gof.graph import Variable, Apply, Constant
from theano.gof.op import Op
from theano.gof.opt import * # noqa
from theano.gof.opt import (OpKeyOptimizer, PatternSub, TopoOptimizer, OpSub,
MergeOptimizer, config, theano,
EquilibriumOptimizer, logging, pre_constant_merge,
pre_greedy_local_optimizer)
from theano.gof.fg import FunctionGraph
from theano.gof.toolbox import * # noqa
from theano import tensor as T
......
......@@ -5,7 +5,7 @@ from theano.gof.type import Type
from theano.gof.op import Op
from theano.gof.fg import FunctionGraph
from theano.gof.toolbox import * # noqa
from theano.gof.toolbox import NodeFinder
def as_variable(x):
......
......@@ -197,23 +197,53 @@ def test_speed_lazy():
def test_partial_function():
import numpy as np
from theano.tests import unittest_tools as utt
x = tensor.scalar('input')
y = x ** 2
f = theano.function([x], [y + 7, y - 9, y / 14.], mode=Mode(
optimizer=None, linker=vm.VM_Linker(allow_partial_eval=True)))
assert f(3, output_subset=[0, 1, 2]) == f(3)
assert f(4, output_subset=[0, 2]) == [f(4)[0], f(4)[2]]
utt.assert_allclose(f(5), np.array([32., 16., 1.7857142857142858]))
def check_partial_function(linker_name):
x = tensor.scalar('input')
y = x ** 2
f = theano.function([x], [y + 7, y - 9, y / 14.], mode=Mode(
optimizer=None, linker=linker_name))
assert f(3, output_subset=[0, 1, 2]) == f(3)
assert f(4, output_subset=[0, 2]) == [f(4)[0], f(4)[2]]
utt.assert_allclose(f(5), np.array([32., 16., 1.7857142857142858]))
def test_partial_function_output_keys():
x = tensor.scalar('input')
y = 3 * x
f = theano.function([x], {'a': y * 5, 'b': y - 7}, mode=Mode(
optimizer=None, linker=vm.VM_Linker(allow_partial_eval=True)))
check_partial_function(vm.VM_Linker(allow_partial_eval=True, use_cloop=False))
check_partial_function('cvm')
assert f(5, output_subset=['a'])['a'] == f(5)['a']
def test_partial_function_with_output_keys():
def check_partial_function_output_keys(linker_name):
x = tensor.scalar('input')
y = 3 * x
f = theano.function([x], {'a': y * 5, 'b': y - 7}, mode=Mode(
optimizer=None, linker=linker_name))
assert f(5, output_subset=['a'])['a'] == f(5)['a']
check_partial_function_output_keys(vm.VM_Linker(allow_partial_eval=True, use_cloop=False))
check_partial_function_output_keys('cvm')
def test_partial_function_with_updates():
def check_updates(linker_name):
x = tensor.lscalar('input')
y = theano.shared(numpy.asarray(1, 'int64'), name='global')
f = theano.function([x], [x, x + 34], updates=[(y, x + 1)], mode=Mode(
optimizer=None, linker=linker_name))
g = theano.function([x], [x - 6], updates=[(y, y + 3)], mode=Mode(
optimizer=None, linker=linker_name))
assert f(3, output_subset=[]) == []
assert y.get_value() == 4
assert g(30, output_subset=[0]) == [24]
assert g(40, output_subset=[]) == []
assert y.get_value() == 10
check_updates(vm.VM_Linker(allow_partial_eval=True, use_cloop=False))
check_updates('cvm')
def test_allow_gc_cvm():
......
......@@ -332,7 +332,8 @@ class Stack(VM):
def __init__(self, nodes, thunks, pre_call_clear,
storage_map, compute_map, fgraph, allow_gc,
dependencies=None, callback=None, callback_input=None):
n_updates, dependencies=None, callback=None,
callback_input=None):
super(Stack, self).__init__(nodes, thunks, pre_call_clear)
self.allow_gc = allow_gc
......@@ -346,6 +347,7 @@ class Stack(VM):
self.node_idx = node_idx = {}
self.callback = callback
self.callback_input = callback_input
self.n_updates = n_updates
ords = fgraph.orderings()
......@@ -417,6 +419,9 @@ class Stack(VM):
# apply_stack contains nodes
if output_subset is not None:
first_updated = len(self.outputs) - self.n_updates
output_subset = output_subset + list(range(first_updated,
len(self.outputs)))
apply_stack =\
[self.outputs[i].owner for i in output_subset
if self.outputs[i].owner]
......@@ -425,7 +430,7 @@ class Stack(VM):
last_apply_stack_len = -1
# This record all function inputs/shared varibles and constants
# This record all function inputs/shared variables and constants
for var, data in iteritems(self.storage_map):
if data[0] is None:
continue
......@@ -726,7 +731,7 @@ class VM_Linker(link.LocalLinker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Check if fgraph is the first FunctionGraph that has ever been
associated to self, else, create a new VM_Linker
......@@ -774,9 +779,11 @@ class VM_Linker(link.LocalLinker):
schedule=self.schedule,
c_thunks=self.c_thunks,
allow_partial_eval=self.allow_partial_eval
).accept(fgraph, no_recycling)
).accept(fgraph, no_recycling, profile)
self.fgraph = fgraph
self.no_recycling = no_recycling
self.profile = profile
return self
def accept_var_updates(self, updated_vars):
......@@ -842,7 +849,7 @@ class VM_Linker(link.LocalLinker):
if (self.callback is not None or self.callback_input is not None or
(config.profile and config.profile_memory) or
self.allow_partial_eval):
(self.allow_partial_eval and not self.use_cloop)):
if self.use_cloop and (self.callback is not None or
self.callback_input is not None):
......@@ -850,9 +857,9 @@ class VM_Linker(link.LocalLinker):
if self.use_cloop and config.profile_memory:
warnings.warn(
'CVM does not support memory profile, using Stack VM.')
if self.use_cloop and self.allow_partial_eval:
if not self.use_cloop and self.allow_partial_eval:
warnings.warn(
'CVM does not support partial evaluation yet, '
'LoopGC does not support partial evaluation, '
'using Stack VM.')
# Needed for allow_gc=True, profiling and storage_map reuse
deps = self.compute_gc_dependencies(storage_map)
......@@ -860,6 +867,7 @@ class VM_Linker(link.LocalLinker):
nodes, thunks, pre_call_clear,
storage_map, compute_map,
self.fgraph, self.allow_gc,
len(updated_vars),
dependencies=deps,
callback=self.callback,
callback_input=self.callback_input)
......@@ -1000,7 +1008,8 @@ class VM_Linker(link.LocalLinker):
nodes, thunks, pre_call_clear,
storage_map, compute_map,
self.fgraph, self.allow_gc,
dependencies=deps
len(updated_vars),
dependencies=deps,
)
return vm
......@@ -1031,7 +1040,7 @@ class VM_Linker(link.LocalLinker):
reallocated_info = calculate_reallocate_info(
order, fgraph, storage_map, compute_map_re, dependencies)
t0 = time.time()
for node in order:
try:
if self.c_thunks is False:
......@@ -1049,6 +1058,11 @@ class VM_Linker(link.LocalLinker):
e.args = ("The following error happened while"
" compiling the node", node, "\n") + e.args
raise
t1 = time.time()
if self.profile:
self.profile.linker_node_make_thunks += t1 - t0
for node, thunk in zip(order, thunks):
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
......
......@@ -63,7 +63,8 @@ def init_dev(dev, name=None):
if dev not in init_dev.devmap:
ctx = pygpu.init(dev,
disable_alloc_cache=config.gpuarray.preallocate < 0,
single_stream=config.gpuarray.single_stream)
single_stream=config.gpuarray.single_stream,
sched=config.gpuarray.sched)
init_dev.devmap[dev] = ctx
if config.gpuarray.preallocate > 0:
MB = (1024 * 1024)
......@@ -89,11 +90,11 @@ def init_dev(dev, name=None):
if dev.startswith('cuda'):
try:
cudnn_version = dnn.version()
# 5100 should not print warning with cudnn 5 final.
if cudnn_version > 5100:
# 5200 should not print warning with cudnn 5.1 final.
if cudnn_version >= 5200:
warnings.warn("Your cuDNN version is more recent than Theano."
" If you see problems, try updating Theano or"
" downgrading cuDNN to version 5.")
" downgrading cuDNN to version 5.1.")
if config.print_active_device:
print("Using cuDNN version %d on context %s" %
(cudnn_version, name), file=sys.stderr)
......
......@@ -47,8 +47,8 @@ class GpuGemv(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
alpha = as_tensor_variable(alpha).astype('float64')
beta = as_tensor_variable(beta).astype('float64')
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 2
......@@ -128,8 +128,8 @@ class GpuGemm(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C, ctx_name)
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
alpha = as_tensor_variable(alpha).astype('float64')
beta = as_tensor_variable(beta).astype('float64')
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 2
......@@ -208,7 +208,7 @@ class GpuGer(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
alpha = as_tensor_variable(alpha)
alpha = as_tensor_variable(alpha).astype('float64')
assert alpha.ndim == 0
assert A.ndim == 2
assert x.ndim == 1
......@@ -345,8 +345,8 @@ class GpuGemmBatch(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C, ctx_name)
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
alpha = as_tensor_variable(alpha).astype('float64')
beta = as_tensor_variable(beta).astype('float64')
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 3
......
......@@ -369,7 +369,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return node.inputs[0].type.context
def c_code_cache_version(self):
return (11,)
return (12,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
......@@ -499,7 +499,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
load_sm = load_w(dtype_sm)
write_dx = write_w(dtype_dx)
flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
type_dnll = gpuarray.dtype_to_ctype(work_dnll)
wtype_dnll = gpuarray.dtype_to_ctype(work_dnll)
type_dnll = gpuarray.dtype_to_ctype(dtype_dnll)
type_sm = gpuarray.dtype_to_ctype(dtype_sm)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
type_dx = gpuarray.dtype_to_ctype(dtype_dx)
......@@ -525,7 +526,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
for (int i = blockIdx.x; i < N; i += gridDim.x)
{
%(type_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
%(wtype_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
%(type_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x)
......
......@@ -6,6 +6,7 @@ import pdb
import time
from six import iteritems
from six.moves import xrange
import sys
import theano
from theano import tensor, scalar, gof, config
......@@ -13,7 +14,6 @@ from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import NavigatorOptimizer
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
......@@ -81,8 +81,8 @@ gpu_seqopt = SequenceDB()
gpu_seqopt.register('gpuarray_graph_optimization', GraphToGPUDB(), -0.5,
'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_local_optimizations', gpu_optimizer, 1,
'fast_compile', 'fast_run', 'gpuarray', 'gpuarray_local_optimiziations')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
'fast_compile', 'fast_run', 'gpuarray')
......@@ -262,7 +262,7 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')
class GraphToGPU(NavigatorOptimizer):
class GraphToGPU(Optimizer):
"""
Transfer the graph as a whole to GPU instead of transfering node by node.
......@@ -373,6 +373,14 @@ class GraphToGPU(NavigatorOptimizer):
if new_ops:
node_created[lopt] += len(graph.ops([mapping[i] for i in node.inputs], outputs))
if any([getattr(old_o, 'dtype', None) != getattr(new_o, 'dtype', None)
for old_o, new_o in zip(outputs, node.outputs)]):
_logger.warning(
"The optimization %s returned bad dtype. Skipping it."
" Write to theano-dev mailing list about this." %
str(lopt))
newnode = node.clone_with_new_inputs([mapping.get(i) for i in node.inputs])
outputs = newnode.outputs
for new_o, old_o in zip(outputs, node.outputs):
assert len(outputs) == len(node.outputs)
......@@ -477,6 +485,16 @@ class GraphToGPU(NavigatorOptimizer):
node_created,
process_count)
def print_summary(self, stream=sys.stdout, level=0, depth=-1):
print("%s%s (%i)" % (
(' ' * level), self.__class__.__name__, id(self)), file=stream)
if depth != 0:
map_values = []
for opts in self.local_optimizers_map.values():
map_values += opts
for opt in self.local_optimizers_all + map_values:
opt.print_summary(stream, level=(level + 2), depth=(depth - 1))
@local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu])
def local_cut_gpu_transfers(node):
......@@ -625,10 +643,7 @@ def local_gpua_contiguous(op, context_name, inputs, outputs):
@op_lifter([tensor.Reshape])
@register_opt2([tensor.Reshape], 'fast_compile')
def local_gpua_reshape(op, context_name, inputs, outputs):
name = op.name
if name:
name = 'Gpu' + name
res = GpuReshape(op.ndim, op.name)
res = GpuReshape(op.ndim)
return res
......@@ -647,7 +662,7 @@ def local_gpua_flatten(op, context_name, inputs, outputs):
if op.outdim != 1:
shp = [inputs[0].shape[i] for i in range(op.outdim - 1)]
shp += [-1]
res = GpuReshape(op.outdim, None)
res = GpuReshape(op.outdim)
o = res(inputs[0], theano.tensor.as_tensor_variable(shp))
return o
......@@ -1009,10 +1024,9 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
else:
return False
x, = inputs
greduce = op2(
op.scalar_op, axis=op.axis,
dtype=getattr(op, 'dtype', None),
dtype=getattr(op, 'dtype', outputs[0].dtype),
acc_dtype=getattr(op, 'acc_dtype', None))
gvar = greduce(x)
# We need to have the make node called, otherwise the mask can
......@@ -1051,7 +1065,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
greduce = op2(
op.scalar_op,
axis=new_axis, reduce_mask=new_mask,
dtype=getattr(op, 'dtype', None),
dtype=getattr(op, 'dtype', outputs[0].dtype),
acc_dtype=getattr(op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp))
......
......@@ -336,12 +336,23 @@ class GpuIncSubtensor(IncSubtensor):
C code expression to copy source into view, and 0 on success.
"""
return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()
return """sub_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/error.h>', '<gpuarray/array.h>',
'<gpuarray/elemwise.h>']
def c_support_code(self):
return """
int sub_setarray(GpuArray *dst, GpuArray *src) {
int err;
err = GpuArray_setarray(dst, src);
if (err != GA_NO_ERROR)
PyErr_SetString(PyExc_RuntimeError, "setarray failed");
return err;
}
"""
def c_support_code_struct(self, node, nodename):
return "\nGpuElemwise *iadd;\n"
......@@ -383,7 +394,7 @@ class GpuIncSubtensor(IncSubtensor):
parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
if not parent_version:
return
return parent_version + (7,)
return parent_version + (8,)
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
......
......@@ -21,6 +21,7 @@ from ..basic_ops import (
host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape, GpuToGpu,
GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..elemwise import GpuDimShuffle, GpuElemwise
from ..subtensor import GpuSubtensor
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
......@@ -167,6 +168,8 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
inputs, variables))
Checker.__name__ = name
if hasattr(Checker, '__qualname__'):
Checker.__qualname__ = name
return Checker
......@@ -228,6 +231,7 @@ def gpu_alloc_expected(x, *shp):
g[:] = x
return g
GpuAllocTester = makeTester(
name="GpuAllocTester",
op=alloc,
......@@ -321,7 +325,7 @@ class G_reshape(test_basic.T_reshape):
mode=mode_with_gpu,
ignore_topo=(HostFromGpu, GpuFromHost,
theano.compile.DeepCopyOp,
theano.gpuarray.elemwise.GpuElemwise,
GpuDimShuffle, GpuElemwise,
theano.tensor.opt.Shape_i,
theano.tensor.opt.MakeVector))
assert self.op == GpuReshape
......
......@@ -479,10 +479,9 @@ class TestDnnInferShapes(utt.InferShapeTester):
@parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func)
def test_conv3d_none(self, border_mode, conv_mode):
ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
self._test_conv(ftensor5('img'),
ftensor5('kerns'),
ftensor5('out'),
self._test_conv(T.ftensor5('img'),
T.ftensor5('kerns'),
T.ftensor5('out'),
numpy.random.rand(10, 2, 6, 4, 11),
numpy.random.rand(8, 2, 4, 3, 1),
border_mode,
......
......@@ -28,6 +28,27 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
raise SkipTest("Cuda specific tests")
self.max_threads_dim0 = test_ctx.maxlsize0
self.max_grid_size1 = test_ctx.maxgsize2
self.op_class = GpuCumsum
def test_infer_shape(self):
    """Check shape inference of the cumsum Op on a 3-D input, for every
    valid axis (negative and positive)."""
    # GpuCumSum is only defined for float32 for now, so we skip it
    # in the unsupported cases.
    gpucumsum_supported_dtypes = ('float32',)
    if theano.config.floatX not in gpucumsum_supported_dtypes:
        raise SkipTest('GpuCumSum not implemented for dtype %s'
                       % theano.config.floatX)
    x = T.tensor3('x')
    a = np.random.random((3, 5, 2)).astype(theano.config.floatX)
    # Exercise both negative and non-negative axis values.
    for axis in range(-len(a.shape), len(a.shape)):
        self._compile_and_check([x],
                                [cumsum(x, axis=axis)],
                                [a],
                                self.op_class)
def test_grad(self):
    # Intentionally a no-op: GpuCumsum defines no gradient, so the
    # gradient test inherited from the CPU TestCumsumOp does not apply.
    pass
def test_Strides1D(self):
x = T.fvector('x')
......
......@@ -214,8 +214,7 @@ class TestFFT(unittest.TestCase):
res_irfft = f_irfft()
inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]
irfft_ref = numpy.fft.irfftn(
inputs_ref, s=(M, M), axes=(1, 2), norm='ortho')
irfft_ref = numpy.fft.irfftn(inputs_ref, s=(M, M), axes=(1, 2)) * M
utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
......
......@@ -7,6 +7,7 @@ from theano.compile import DeepCopyOp
from theano.tensor.tests import test_subtensor
from ..basic_ops import HostFromGpu, GpuFromHost
from ..elemwise import GpuDimShuffle
from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1)
......@@ -27,6 +28,7 @@ class G_subtensor(test_subtensor.T_subtensor):
inc_sub=GpuIncSubtensor,
adv_sub1=GpuAdvancedSubtensor1,
adv_incsub1=GpuAdvancedIncSubtensor1,
dimshuffle=GpuDimShuffle,
mode=mode_with_gpu,
# avoid errors with limited devices
dtype='float32',
......
......@@ -390,8 +390,8 @@ def grad(cost, wrt, consider_constant=None,
If True, variables generated by grad will be named
(d<cost.name>/d<wrt.name>) provided that both cost and wrt
have names
known_grads : dict, optional
A dictionary mapping variables to their gradients. This is
known_grads : OrderedDict, optional
A ordered dictionary mapping variables to their gradients. This is
useful in the case where you know the gradient on some
variables but do not know the original cost.
return_disconnected : {'zero', 'None', 'Disconnected'}
......@@ -462,6 +462,9 @@ def grad(cost, wrt, consider_constant=None,
if known_grads is None:
known_grads = OrderedDict()
else:
m = "known_grads must be an OrderedDict. "
assert isinstance(known_grads, OrderedDict) or len(known_grads) <= 1, m
# The gradient of the cost is 1 unless specified otherwise by known_grads.
if cost is not None:
......@@ -1369,8 +1372,10 @@ class numeric_grad(object):
# perfectly accurate.
type_eps = {'float64': 1e-7,
'float32': 3e-4,
'float16': 1e-3,
numpy.dtype('float64'): 1e-7,
numpy.dtype('float32'): 3e-4}
numpy.dtype('float32'): 3e-4,
numpy.dtype('float16'): 1e-3}
def __init__(self, f, pt, eps=None, out_type=None):
"""Return the gradient of f at pt.
......@@ -1570,7 +1575,7 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
and returns a Theano variable. For instance, an Op instance with
a single output.
:param pt: the list of numpy.ndarrays to use as input values.
These arrays must be either float32 or float64 arrays.
These arrays must be either float16, float32, or float64 arrays.
:param n_tests: number of times to run the test
:param rng: random number generator used to sample u, we test gradient
of sum(u * fun) at pt
......@@ -1589,7 +1594,7 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
comparison
:param cast_to_output_type: if the output is float32 and
cast_to_output_type is True, cast the random projection to
float32. Otherwise it is float64.
float32. Otherwise it is float64. float16 is not handled here.
:param no_debug_ref: Don't use DebugMode for the numerical
gradient function.
......@@ -1606,12 +1611,13 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
pt = [numpy.array(p) for p in pt]
for i, p in enumerate(pt):
if p.dtype not in ('float32', 'float64'):
if p.dtype not in ('float16', 'float32', 'float64'):
raise TypeError(
('verify_grad can work only with floating point '
'inputs, but input %i has dtype "%s".') % (i, p.dtype))
_type_tol = dict( # relative error tolerances for different types
float16=5e-2,
float32=1e-2,
float64=1e-4)
......
......@@ -6,6 +6,7 @@ import os
import shutil
import stat
import sys
import textwrap
import warnings
import theano
......@@ -82,10 +83,10 @@ def set_cuda_disabled():
cuda_path = os.path.abspath(os.path.split(__file__)[0])
cuda_ndarray_loc = os.path.join(config.compiledir, 'cuda_ndarray')
cuda_ndarray_so = os.path.join(cuda_ndarray_loc,
'cuda_ndarray.' + get_lib_extension())
libcuda_ndarray_so = os.path.join(cuda_ndarray_loc,
'libcuda_ndarray.' + get_lib_extension())
cuda_ndarray_so = os.path.join(
cuda_ndarray_loc, 'cuda_ndarray.' + get_lib_extension())
libcuda_ndarray_so = os.path.join(
cuda_ndarray_loc, 'libcuda_ndarray.' + get_lib_extension())
def try_import():
......@@ -280,6 +281,7 @@ def dnn_available():
dnn_available.msg = "Device not supported"
dnn_available.avail = False
else:
<<<<<<< HEAD
preambule = """
#include <stdio.h>
#include <cuda.h>
......@@ -300,6 +302,27 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
path_wrapper = "\"" if os.name =='nt' else ""
params = ["-l", "cudnn"]
params.extend(['-I%s%s%s' % (path_wrapper,os.path.dirname(__file__),path_wrapper)])
=======
preambule = textwrap.dedent(
"""
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn_helper.h>
""")
body = textwrap.dedent(
"""
cudnnHandle_t _handle = NULL;
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
fprintf(stderr, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
return 1;
}
""")
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
>>>>>>> refs/remotes/Theano/master
if config.dnn.include_path:
params.extend(['-I%s%s%s' % (path_wrapper, config.dnn.include_path, path_wrapper)])
if config.dnn.library_path:
......@@ -307,8 +330,8 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
if config.nvcc.compiler_bindir:
params.extend(['--compiler-bindir',
config.nvcc.compiler_bindir])
params.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
params.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
......@@ -370,24 +393,26 @@ class DnnVersion(GpuOp):
return ['-Wl,-rpath,' + config.dnn.library_path]
def c_support_code(self):
return """
#if PY_MAJOR_VERSION >= 3
#define PyInt_FromLong PyLong_FromLong
#endif
"""
return textwrap.dedent(
"""
#if PY_MAJOR_VERSION >= 3
#define PyInt_FromLong PyLong_FromLong
#endif
""")
def make_node(self):
    """Build an Apply node with no inputs and one Generic output.

    The output will carry the cuDNN version information produced by
    this Op's C code.
    """
    return theano.gof.Apply(self, [], [theano.gof.Generic()()])
def c_code(self, node, name, inputs, outputs, sub):
o = outputs[0]
return """
#if defined(CUDNN_VERSION)
%(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
#else
%(o)s = PyInt_FromLong(-1);
#endif
""" % locals()
return textwrap.dedent(
"""
#if defined(CUDNN_VERSION)
%(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
#else
%(o)s = PyInt_FromLong(-1);
#endif
""") % locals()
def do_constant_folding(self, node):
# Needed as we do not want to cache this information.
......@@ -426,12 +451,13 @@ if cuda_available:
import cuda_ndarray.cuda_ndarray
if cuda_ndarray_so != cuda_ndarray.cuda_ndarray.__file__:
_logger.warning("cuda_ndarray was loaded from %s, but Theano expected "
"to load it from %s. This is not expected as theano should "
"compile it automatically for you. Do you have a directory "
"called cuda_ndarray in your LD_LIBRARY_PATH environment "
"variable? If so, please remove it as it is outdated.",
cuda_ndarray.cuda_ndarray.__file__,
cuda_ndarray_so)
"to load it from %s. This is not expected as theano "
"should compile it automatically for you. Do you have "
"a directory called cuda_ndarray in your "
"LD_LIBRARY_PATH environment variable? If so, please "
"remove it as it is outdated.",
cuda_ndarray.cuda_ndarray.__file__,
cuda_ndarray_so)
shared_constructor = float32_shared_constructor
......@@ -446,8 +472,8 @@ if cuda_available:
ftensor3, ftensor4,
scalar, vector, matrix, row, col,
tensor3, tensor4)
from .basic_ops import (host_from_gpu, gpu_from_host,
as_cuda_array, as_cuda_ndarray_variable)
from .basic_ops import (host_from_gpu, gpu_from_host, as_cuda_array,
as_cuda_ndarray_variable)
import cuda_ndarray
from . import opt, dnn
from .rng_curand import CURAND_RandomStreams
......@@ -497,10 +523,11 @@ def use(device,
raise EnvironmentError("You forced the use of gpu device %s, "
"but CUDA initialization failed "
"with error:\n%s" % (
device, cuda_initialization_error_message))
device,
cuda_initialization_error_message))
elif not nvcc_compiler.is_nvcc_available():
_logger.error('nvcc compiler not found on $PATH.'
' Check your nvcc installation and try again.')
_logger.error("nvcc compiler not found on $PATH. "
"Check your nvcc installation and try again.")
return
elif not cuda_available:
error_addendum = ""
......@@ -509,10 +536,10 @@ def use(device,
error_addendum = (" (error: %s)" %
cuda_initialization_error_message)
except NameError:
# cuda_initialization_error_message is not available b/c compilation failed
# cuda_initialization_error_message is not available b/c compilation failed
pass
_logger.warning('CUDA is installed, but device %s is not available %s',
device, error_addendum)
_logger.warning("CUDA is installed, but device %s is not available %s",
device, error_addendum)
return
if device == 'gpu':
......@@ -586,12 +613,12 @@ def use(device,
if dnn_available():
(hdr_v, runtime_v) = dnn_version()
cudnn_version = runtime_v
# 5100 should not print warning with cudnn 5 final.
if cudnn_version > 5100:
# 5200 should not print warning with cudnn 5 final.
if cudnn_version >= 5200:
warn = ("Your cuDNN version is more recent than the one"
" Theano officially supports."
" If you see any problems, try updating Theano or"
" downgrading cuDNN to version 5.")
" downgrading cuDNN to version 5.1.")
except Exception:
cudnn_version = dnn_available.msg
print("Using gpu device %d: %s (CNMeM is %s, cuDNN %s)" % (
......@@ -625,8 +652,8 @@ def use(device,
elif use.device_number != device and device != 'gpu':
_logger.warning(("Ignoring call to use(%s), GPU number %i "
"is already in use."),
str(device), use.device_number)
"is already in use."),
str(device), use.device_number)
if move_shared_float32_to_gpu:
handle_shared_float32(True)
......@@ -704,11 +731,10 @@ elif config.init_gpu_device.startswith('gpu'):
"We can use the Theano flag init_gpu_device"
" only when the Theano flag device=='cpu'")
_logger.warning(("GPU device %s will be initialized, and used if a GPU is "
"needed. "
"However, no computation, nor shared variables, will be implicitly "
"moved to that device. If you want that behavior, use the 'device' "
"flag instead."),
config.init_gpu_device)
"needed. However, no computation, nor shared variables, "
"will be implicitly moved to that device. If you want "
"that behavior, use the 'device' flag instead."),
config.init_gpu_device)
use(device=config.init_gpu_device,
force=config.force_device,
default_to_move_computation_to_gpu=False,
......
......@@ -700,6 +700,8 @@ def local_gpu_solve(node):
CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)
"""
if node.outputs[0].dtype != 'float32':
return
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if (host_input.owner and
......@@ -1352,8 +1354,9 @@ def cast(x, dtype):
@register_opt()
@local_optimizer([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
@local_optimizer([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias],
'local_gpu_crossentorpy_softmax_argmax_1hot_with_bias')
def local_gpu_crossentropy_softmax_argmax_1hot_with_bias(node):
if isinstance(node.op, tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias):
x, b, y = node.inputs
if x.owner and isinstance(x.owner.op, HostFromGpu):
......@@ -1381,8 +1384,9 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
@register_opt()
@local_optimizer([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
def local_gpu_crossentorpy_softmax_1hot_with_bias_dx(node):
@local_optimizer([tensor.nnet.CrossentropySoftmax1HotWithBiasDx],
'local_gpu_crossentorpy_softmax_1hot_with_bias_dx')
def local_gpu_crossentropy_softmax_1hot_with_bias_dx(node):
if isinstance(node.op, tensor.nnet.CrossentropySoftmax1HotWithBiasDx):
dnll, sm, yidx = node.inputs
if sm.owner and isinstance(sm.owner.op, HostFromGpu):
......
......@@ -1014,6 +1014,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
inc_sub = cuda.GpuIncSubtensor
adv_sub1 = cuda.GpuAdvancedSubtensor1
adv_incsub1 = cuda.GpuAdvancedIncSubtensor1
dimshuffle = cuda.GpuDimShuffle
mode = mode_with_gpu
dtype = 'float32'
type = tcn.CudaNdarrayType
......@@ -1075,7 +1076,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
# Test with c_contiguous input
t = self.adv_sub1()(n, idx)
t.owner.op.perform_using_take = True # input c_contiguous, so we reshape
val = self.eval_output_and_check(t, list=True)
val = self.eval_output_and_check(t, op_type=self.adv_sub1)
val = numpy.asarray(val)
good = data[idx]
......
......@@ -823,7 +823,7 @@ def test_batchnorm_inference():
utt.assert_allclose(outputs[3], outputs[3 + 5]) # dscale
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5]) # dvar
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=5e-5) # dvar
def test_dnn_tag():
......@@ -938,10 +938,9 @@ class TestDnnInferShapes(utt.InferShapeTester):
def test_conv3d(self):
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('"cuDNN 3D convolution requires cuDNN v2')
ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
img = ftensor5('img')
kerns = ftensor5('kerns')
out = ftensor5('out')
img = T.ftensor5('img')
kerns = T.ftensor5('kerns')
out = T.ftensor5('out')
img_val = numpy.asarray(
numpy.random.rand(10, 2, 6, 4, 11),
dtype='float32'
......@@ -1026,10 +1025,9 @@ class TestDnnInferShapes(utt.InferShapeTester):
def test_conv3d_gradw(self):
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('"cuDNN 3D convolution requires cuDNN v2')
ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
img = ftensor5('img')
kerns = ftensor5('kerns')
out = ftensor5('out')
img = T.ftensor5('img')
kerns = T.ftensor5('kerns')
out = T.ftensor5('out')
img_val = numpy.asarray(
numpy.random.rand(9, 2, 4, 8, 13),
dtype='float32'
......@@ -1116,10 +1114,9 @@ class TestDnnInferShapes(utt.InferShapeTester):
def test_conv3d_gradi(self):
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('"cuDNN 3D convolution requires cuDNN v2')
ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
img = ftensor5('img')
kerns = ftensor5('kerns')
out = ftensor5('out')
img = T.ftensor5('img')
kerns = T.ftensor5('kerns')
out = T.ftensor5('out')
img_val = numpy.asarray(
numpy.random.rand(8, 4, 6, 7, 11),
dtype='float32'
......
......@@ -3,12 +3,14 @@ This file test tensor op that should also operate on CudaNdaray.
"""
from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
from nose_parameterized import parameterized
import numpy
import theano
from theano import tensor
import theano.tensor as T
import theano.tests.unittest_tools as utt
# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda
......@@ -139,6 +141,8 @@ def test_get_diagonal_subtensor_view():
test_conv3d2d.test_get_diagonal_subtensor_view(wrap=cuda.CudaNdarray)
def test_conv3d():
test_conv3d2d.test_conv3d(mode=mode_with_gpu,
shared=cuda.shared_constructor)
@parameterized.expand(('valid', 'full'), utt.custom_name_func)
def test_conv3d(border_mode):
    """Run the shared conv3d2d checker on the GPU for each border mode."""
    # Delegates to the generic checker with GPU mode and CUDA shared
    # storage so the computation actually runs on the device.
    test_conv3d2d.check_conv3d(border_mode=border_mode,
                               mode=mode_with_gpu,
                               shared=cuda.shared_constructor)
......@@ -2024,7 +2024,7 @@ class Scan(PureOp):
# it will be the sum of the external gradient signal and the
# gradient obtained by propagating Y's external gradient signal
# to X.
known_grads = dict([(k.copy(), v) for (k, v) in known_grads.items()])
known_grads = OrderedDict([(k.copy(), v) for (k, v) in known_grads.items()])
grads = gradient.grad(
cost=None,
......@@ -2094,7 +2094,7 @@ class Scan(PureOp):
dC_dXts.append(dC_dXt)
known_grads = {}
known_grads = OrderedDict()
dc_dxts_idx = 0
for i in range(len(diff_outputs)):
if i < idx_nitsot_start or i >= idx_nitsot_end:
......
......@@ -2472,6 +2472,8 @@ def _hv_switch(op, expected_function):
def expected_f(self, a, format=None, dtype=None):
return expected_function(a, format, dtype)
XStackTester.__name__ = op.__name__ + "Tester"
if hasattr(XStackTester, '__qualname__'):
XStackTester.__qualname__ = XStackTester.__name__
return XStackTester
HStackTester = _hv_switch(HStack, sp.hstack)
......@@ -2687,6 +2689,8 @@ def elemwise_checker(op, expected_f, gap=None, test_dtypes=None,
if name is None:
name = op.__name__.capitalize() + 'Tester'
Tester.__name__ = name
if hasattr(Tester, '__qualname__'):
Tester.__qualname__ = name
assert 'Roundhalftoeven' not in Tester.__name__
return Tester
......
......@@ -463,8 +463,8 @@ if int(config.tensor.cmp_sloppy) > 1:
# When config.tensor.cmp_sloppy>1 we are even more sloppy. This is
# useful to test the GPU as they don't use extended precision and
# this cause some difference bigger then the normal sloppy.
float16_atol = 5e-3
float16_rtol = 1e-2
float16_atol = 1e-2
float16_rtol = 5e-2
float32_atol = 5e-4
float32_rtol = 1e-3
......@@ -472,8 +472,8 @@ if int(config.tensor.cmp_sloppy) > 1:
float64_rtol = 1e-4
float64_atol = 1e-3
elif int(config.tensor.cmp_sloppy):
float16_atol = 1e-3
float16_rtol = 5e-3
float16_atol = 5e-3
float16_rtol = 1e-2
float32_atol = 1e-4
float32_rtol = 1e-3
......@@ -483,8 +483,8 @@ elif int(config.tensor.cmp_sloppy):
else:
# If you change those value in test don't forget to put them back
# when the test end. Don't forget the case when the test fail.
float16_atol = 5e-4
float16_rtol = 5e-4
float16_atol = 1e-3
float16_rtol = 1e-3
float32_atol = 1e-5
float32_rtol = 1e-5
......@@ -1030,6 +1030,34 @@ def tensor4(name=None, dtype=None):
tensor4s, ftensor4s, dtensor4s, itensor4s, ltensor4s = _multi(
tensor4, ftensor4, dtensor4, itensor4, ltensor4)
ctensor5 = TensorType('complex64', ((False,) * 5))
ztensor5 = TensorType('complex128', ((False,) * 5))
ftensor5 = TensorType('float32', ((False,) * 5))
dtensor5 = TensorType('float64', ((False,) * 5))
btensor5 = TensorType('int8', ((False,) * 5))
wtensor5 = TensorType('int16', ((False,) * 5))
itensor5 = TensorType('int32', ((False,) * 5))
ltensor5 = TensorType('int64', ((False,) * 5))
def tensor5(name=None, dtype=None):
    """Return a symbolic 5-D variable.

    Parameters
    ----------
    dtype: numeric type
        None means to use theano.config.floatX.
    name
        A name to attach to this variable.

    Returns
    -------
    TensorVariable
        A new 5-D variable; all five dimensions are non-broadcastable.
    """
    if dtype is None:
        dtype = config.floatX
    # Use a non-shadowing name (the original bound the builtin `type`)
    # and the same (False,) * 5 idiom as the module-level tensor5 types.
    tensor5_type = TensorType(dtype, (False,) * 5)
    return tensor5_type(name)
tensor5s, ftensor5s, dtensor5s, itensor5s, ltensor5s = _multi(
tensor5, ftensor5, dtensor5, itensor5, ltensor5)
Tensor = TensorType
......@@ -2270,12 +2298,15 @@ pprint.assign(fill, printing.FunctionPrinter('fill'))
@constructor
def ones_like(model, dtype=None):
def ones_like(model, dtype=None, opt=False):
"""equivalent of numpy.ones_like
Parameters
----------
model : tensor
dtype : data-type, optional
opt : If True, we will return a constant instead of a graph when possible.
Useful for Theano optimization, not for user building a graph as this
have the consequence that model isn't always in the graph.
Returns
-------
......@@ -2284,17 +2315,22 @@ def ones_like(model, dtype=None):
"""
if dtype is None:
dtype = model.type.dtype
ret = fill(model, constant(1.0, dtype=dtype))
return ret
ret = constant(1.0, dtype=dtype)
if opt and ret.type == model.type:
return ret
return fill(model, ret)
@constructor
def zeros_like(model, dtype=None):
def zeros_like(model, dtype=None, opt=False):
"""equivalent of numpy.zeros_like
Parameters
----------
model : tensor
dtype : data-type, optional
opt : If True, we will return a constant instead of a graph when possible.
Useful for Theano optimization, not for user building a graph as this
have the consequence that model isn't always in the graph.
Returns
-------
......@@ -2304,7 +2340,10 @@ def zeros_like(model, dtype=None):
if dtype is None:
dtype = model.type.dtype
return fill(model, constant(0.0, dtype=dtype))
ret = constant(0.0, dtype=dtype)
if opt and ret.type == model.type:
return ret
return fill(model, ret)
def zeros(shape, dtype=None):
......@@ -2780,13 +2819,14 @@ class Alloc(gof.Op):
}
// This function takes care of broadcasting
PyArray_CopyInto(%(zz)s, %(vv)s);
if (PyArray_CopyInto(%(zz)s, %(vv)s) == -1)
%(fail)s
""" % dict(vv=vv, ndim=ndim, zz=zz, fail=fail)
return code
def c_code_cache_version(self):
return (1,)
return (2,)
def infer_shape(self, node, input_shapes):
return [node.inputs[1:]]
......@@ -3135,7 +3175,7 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
@constructor
def var(input, axis=None, keepdims=False):
def var(input, axis=None, ddof=0, keepdims=False):
"""
Computes the variance along the given axis(es) of a tensor `input`.
......@@ -3144,6 +3184,8 @@ def var(input, axis=None, keepdims=False):
axis: None or int or (list of int) (see `Sum`)
Compute the variance along this axis of the tensor.
None means all axes (like numpy).
ddof: Degrees of freedom; 0 would compute the ML estimate, 1 would compute
the unbiased estimate.
keepdims : bool
If this is set to True, the axes which are reduced are
left in the result as dimensions with size one. With this option,
......@@ -3158,6 +3200,9 @@ def var(input, axis=None, keepdims=False):
"""
if isinstance(ddof, (bool)):
raise ValueError('Parameter keepdims is now at index 3: (input, axis=None, ddof=0, keepdims=False)')
input_ndim = input.type.ndim
if axis is None:
axis = list(range(input_ndim))
......@@ -3175,13 +3220,19 @@ def var(input, axis=None, keepdims=False):
centered_input = input - mean_input
# return the mean sqr
v = mean((centered_input ** 2), axis, keepdims=keepdims)
if ddof == 0:
v = mean((centered_input ** 2), axis, keepdims=keepdims)
else:
shp = shape(input) - ddof
v = sum((centered_input ** 2), axis=axis, keepdims=keepdims)
for i in axis:
v = true_div(v, shp[i])
v.name = 'var'
return v
@constructor
def std(input, axis=None, keepdims=False):
def std(input, axis=None, ddof=0, keepdims=False):
"""
Computes the standard deviation along the given axis(es) of a tensor `input`.
......@@ -3205,7 +3256,10 @@ def std(input, axis=None, keepdims=False):
"""
ret = sqrt(var(input=input, axis=axis, keepdims=keepdims))
if isinstance(ddof, (bool)):
raise ValueError('Parameter keepdims is now at index 3: (input, axis=None, ddof=0, keepdims=False)')
ret = sqrt(var(input=input, axis=axis, ddof=ddof, keepdims=keepdims))
ret.name = 'std'
return ret
......@@ -4047,6 +4101,11 @@ def roll(x, shift, axis=None):
else:
axis = 0
# Shift may be larger than the size of the axis. If so, since the
# roll operation is cyclic, we can take the shift modulo the size
# of the axis
shift = shift % x.shape[axis]
# A slice of all elements in a dimension ':'
allslice = slice(None)
# List of slices describing the front half [:, :, shift:, :]
......@@ -4381,7 +4440,7 @@ class Reshape(Op):
def __init__(self, ndim, name=None):
self.ndim = ndim
self.name = name
assert name is None, 'name attribute for Reshape has been deprecated'
def __str__(self):
return '%s{%s}' % (self.__class__.__name__, self.ndim)
......@@ -4557,7 +4616,7 @@ class Reshape(Op):
return Op.c_code(self, node, name, inputs, outputs, sub)
def reshape(x, newshape, ndim=None, name=None):
def reshape(x, newshape, ndim=None):
if ndim is None:
newshape = as_tensor_variable(newshape)
if newshape.ndim != 1:
......@@ -4573,7 +4632,7 @@ def reshape(x, newshape, ndim=None, name=None):
"to know what the number of dimensions of the reshaped "
"variable will be. You can provide the 'ndim' keyword "
"argument to 'reshape' to avoid this problem." % newshape)
op = Reshape(ndim, name)
op = Reshape(ndim)
rval = op(x, newshape)
return rval
......
Diff collapsed.
Diff collapsed.
Diff collapsed.
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this comment first!
Register or sign in to comment