Merged

24ef1606 · Olivier Delalleau · d7286bf8 · 43cf804a · 24ef1606 · 24ef1606
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -168,7 +168,7 @@ latex_font_size = '11pt'
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title, author, document class [howto/manual]).
 latex_documents = [
-  ('contents', 'theano.tex', 'theano Documentation',
+  ('index', 'theano.tex', 'theano Documentation',
   'LISA lab, University of Montreal', 'manual'),
 ]

--- a/doc/index.txt
+++ b/doc/index.txt
@@ -37,7 +37,7 @@ Roughly in order of what you'll want to check out:
 * :ref:`extending` -- Learn to add a Type, Op, or graph optimization.
 * :ref:`internal` -- How to maintaining Theano, LISA-specific tips, and more...
-You can download the latest `PDF documentation <http://pylearn.org/theano/theano.pdf>`_, rather than reading it online.
+You can download the latest `PDF documentation <http://deeplearning.net/theanodoc/theano.pdf>`_, rather than reading it online.
 Community
 =========
@@ -46,7 +46,7 @@ Community
 * Register and post to `theano-dev`_ if you want to talk to the developers.
-* We try to stay organized with `Theano's Trac <trac/>`__ 
+* We try to stay organized with `Theano's Trac <http://trac-hg.assembla.com/theano/report/1>`__ 
 * Come visit us in Montreal!  Most of the developers are students in the LISA_ group at the `University of Montreal`_.

--- a/doc/install.txt
+++ b/doc/install.txt
@@ -20,7 +20,7 @@ to be installed:
        We develop mainly on 64-bit Linux machines. 32-bit architectures are
        not well-tested.
-    python >= 2.5
+    python >= 2.5 (2.4 should be supported as well)
    `numpy <http://numpy.scipy.org/>`_ >= 1.2
        Earlier versions have memory leaks.
@@ -30,6 +30,8 @@ to be installed:
        is buggy in 0.6. (scipy.csc_matrix dot has a bug with singleton
        dimensions. There may be more bugs.)
+    A BLAS installation (with Level 3 functionality)
 The following libraries and software are optional:
    g++, python-dev
@@ -42,41 +44,49 @@ The following libraries and software are optional:
    `mercurial <http://www.selenic.com/mercurial/>`_
        To download bleeding-edge version of Theano.
+.. _install_bleeding_edge:
+Getting the code
+-----------------
-Easy install
+If you are a developer of Theano, then check out the :ref:`dev_start_guide` guide. 
------------
-The following command will install the latest release of Theano
+The following are general instructions that will set you up with the bleeding-edge 
-on your system:
+version of Theano. First, get the code using `mercurial <http://www.selenic.com/mercurial/wiki/>`__:
 .. code-block:: bash
-    easy_install Theano
+    hg clone http://hg.assembla.com/theano Theano
-Manual install
+Configuring PYTHONPATH
--------------
+---------------------------
+The subdirectory Theano/theano has to be located in a path
+mentioned in your PYTHONPATH. In order to do that, you can either
+create a symbolic link to Theano/theano in a directory already
+mentioned in your PYTHONPATH environment variable, or modify the
+PYTHONPATH so that it mentions Theano.
-To install the latest release of Theano from source, visit the `downloads
+To create a symbolic link:
-<http://pylearn.org/theano/downloads/>`_ page and download the release you
-want. Unpack the release, and type:
 .. code-block:: bash
-    python setup.py build
+    ln -s Theano/theano <someplace on your PYTHONPATH>/theano
-    python setup.py test
-    python setup.py install
-.. _install_bleeding_edge:
+To modify the environment variable PYTHONPATH in bash, you may do this:
-Bleeding Edge
+.. code-block:: bash
--------------
-Feeling lucky and want to run bleeding-edge code?
+    export PYTHONPATH=<path to Theano's parent dir>/Theano:$PYTHONPATH
-Then check out the :ref:`dev_start_guide` guide.
+In csh:
-Configuring the environment
+.. code-block:: csh
---------------------------
+    setenv PYTHONPATH <path to Theano's parent dir>/Theano:$PYTHONPATH
+Configuring Theano's environment variables
+---------------------------------------------
 Two environment variables are used to control automatic code
 generation. It is possible to use Theano in a way which avoids all
@@ -118,6 +128,33 @@ automatic code generation, but that way is much, much slower.
    Omitting this variable defaults the mode to ``'FAST_RUN'``.
+Testing your installation
+---------------------------
+Once you have completed these steps, you should run the Theano test suite like this:
+.. code-block:: bash
+    cd Theano
+    nosetests #execute all the tests
+All tests should pass. If some test fails on your machine, you are
+encouraged to tell us what went wrong on the ``theano-users@googlegroups.com``
+mailing list.
+Updating
+-------------
+To update your library to the latest revision, change directory (``cd``)
+to your ``Theano`` folder and execute the following command:
+.. code-block:: bash
+    hg pull -u
+You should update frequently, as bugs are fixed on a very regular basis.
 Mac
 ---
@@ -126,20 +163,21 @@ Mac
 -
    .. code-block:: bash
-        $ sudo port install gcc42 py25-zlib py25-numpy py25-scipy mercurial
+        $ sudo port install gcc44 py25-zlib py25-numpy py25-scipy mercurial
-    Note that compiling gcc42 takes a significant time (hours) so it is probably
+    Note that compiling gcc takes a significant time (hours) so it is probably
    not the best solution if you are in a rush! It may happen that SciPy
    fails to compile the first time and still compiles just fine on a second
    try. Same thing with py25-zlib.
- Install some kind of BLAS library (TODO: how?)
+- scipy depends on ATLAS (a BLAS library), which will be installed by MacPorts.
 - Set ``THEANO_BLAS_LDFLAGS`` to something which will link against said BLAS
  library.  E.g., ``THEANO_BLAS_LDFLAGS='-lcblas -latlas -lgfortran'``.
-This advice has not been tested recently, so please inform us of your results.
+These installation instructions have not been tested recently, so please inform us of your results!
+We would be especially interested in dependencies that we missed listing, as well as tests
+that fail on your platform (use the ``theano-users@googlegroups.com`` mailing list).
 Windows
@@ -247,9 +285,9 @@ Generating the documentation
 ----------------------------
 You can read the latest HTML documentation `here
-<http://pylearn.org/theano/contents.html>`__.
+<http://deeplearning.net/theanodoc>`__.
 You can download the latest PDF documentation `here
-<http://pylearn.org/theano/theano.pdf>`__.
+<http://deeplearning.net/theanodoc/theano.pdf>`__.
 We recommend you look at the documentation on the website, since it
 will be more current than the documentation included with the package.

--- a/doc/internal/dev_start_guide.txt
+++ b/doc/internal/dev_start_guide.txt
@@ -21,11 +21,10 @@ Developer Start Guide
 Accounts
 ========
-To obtain developer access: send an email to an admin with an username and
+To obtain developer access: register with `Assembla
-temporary password. Pending approval, this will give you access to both the
+<http://www.assembla.com/>`_ and add yourself as a watcher on the `Theano space 
-repository and Trac. You should then change your password in the
+<http://www.assembla.com/spaces/theano>`_. Then send an email to an admin asking 
-`<http://pylearn.org/theano/prefs preferences>` tab - do *NOT* use a good 
+to be promoted to a member of the project.
-password! We are using plain text http which is not secure.
 Theano code
@@ -34,10 +33,9 @@ Theano code
 *To get the source via mercurial,* you must have `mercurial
 <http://www.selenic.com/mercurial/wiki/>`__ installed.
-The code that makes up Theano is in a single repository available in
+The code that makes up Theano is in a `single repository
-`<http://pylearn.org/hg/Theano>`__.
+<http://www.assembla.com/spaces/theano/trac_mercurial_tool>`__. As a developer, 
+you should clone this repository like this:
-As a developer, you should clone this repository like this:
 .. code-block:: bash
@@ -121,9 +119,6 @@ to your ``Theano`` folder and execute the following command:
    hg pull -u
-You may also download the latest source directly as a gzip'd tar file:
-`<http://pylearn.org/hg/Theano/archive/tip.tar.gz>`__.
 Nightly test
 ============

--- a/doc/introduction.txt
+++ b/doc/introduction.txt
@@ -5,43 +5,40 @@
 Theano at a Glance
 ==================
-Theano is a Python library that allows you to define, optimize, and evaluate
+Theano is a Python library that lets you define, optimize, and evaluate
-mathematical expressions involving multi-dimensional arrays. Using Theano it is
+mathematical expressions, especially ones with multi-dimensional arrays
+(numpy.ndarray).  Using Theano it is
 possible to attain speeds rivaling hand-crafted C implementations for problems
 involving large amounts of data.  It can also surpass C on a CPU by many orders
 of magnitude by taking advantage of recent GPUs.
-Theano melds some aspects of a computer algebra system (CAS) with
+Theano combines aspects of a computer algebra system (CAS) with aspects of an
-aspects of an optimizing compiler. It can even transform some or all
+optimizing compiler. It can also generate customized C code for many
-of the mathematical expression into C code and compile it into native
+mathematical operations.  This combination of CAS with optimizing compilation
-machine instructions. This combination of CAS with optimizing
+is particularly useful for tasks in which complicated mathematical expressions
-compilation is particularly useful for tasks in which complicated
+are evaluated repeatedly and evaluation speed is critical.  For situations
-mathematical expressions are evaluated repeatedly and evaluation speed
+where many different expressions are each evaluated once, Theano can minimize
-is critical.
+the amount of compilation/analysis overhead, but still provide symbolic
+features such as automatic differentiation.
-Theano supports a range of numerical types in multiple dimensions and
-a number of well-tested operations. It also allows you to compute the
-gradient of an expression with respect to another. Symbolic
-expressions may be compiled into functions, which work on the same
-data structures as numpy_, allowing for easy interoperability.
 Theano's compiler applies many optimizations of varying complexity to
 these symbolic expressions. These optimizations include, but are not
 limited to:
+* use of GPU for computations
 * constant folding
-* merging of similar subgraphs, to avoid calculating the same values
+* merging of similar subgraphs, to avoid redundant calculation
-  more than once
+* arithmetic simplification (e.g. ``x*y/x -> y``, ``--x -> x``)
-* arithmetic simplification (``x*y/x -> y``)
+* inserting efficient BLAS_ operations (e.g. ``GEMM``) in a variety of
-* inserting efficient BLAS_ operations
+  contexts
-* using inplace operations wherever it is safe to do so.
+* using memory aliasing to avoid calculation
+* using inplace operations wherever it does not interfere with aliasing
-Theano defines several optimizations which improve the numerical
+* loop fusion for elementwise sub-expressions
-stability of computations.
+* improvements to numerical stability (e.g.  :math:`\log(1+\exp(x))` and :math:`\log(\sum_i \exp(x[i]))`)
+* for a complete list, see :ref:`optimizations`
-Theano was written at the LISA_ lab to support the development of
-efficient machine learning algorithms while minimizing human time. We
+Theano was written at the LISA_ lab to support rapid development of
-use it especially in gradient-based learning techniques.  Theano is
+efficient machine learning algorithms. Theano is
 named after the `Greek mathematician`_, who may have been Pythagoras'
 wife.  Theano is released under a BSD license (:ref:`link <license>`).
@@ -92,30 +89,28 @@ machine instructions.
 What does it do that they don't?
 ================================
-Theano is a python library and optimizing compiler for manipulating
+Theano is a Python library and optimizing compiler for manipulating
 and evaluating expressions, especially matrix-valued
 ones. Manipulation of matrices is typically done using the numpy
 package, so what does Theano do that Python and numpy do not?
- *execution speed optimizations*: Theano can use `g++` to compile
+- *execution speed optimizations*: Theano can use `g++` or `nvcc` to compile
-  parts your expression graph into native machine code, which runs
+  parts of your expression graph into CPU or GPU instructions, which run
-  much faster than python.
+  much faster than pure Python.
 - *symbolic differentiation*: Theano can automatic build symbolic graphs
  for computing gradients.
- *stability optimizations*: Theano can recognize numerically unstable
+- *stability optimizations*: Theano can recognize [some] numerically unstable
  expressions and compute them with more stable algorithms.
-There exist another symbolic package in Python, namely sympy_. Theano
+The closest Python package to Theano is sympy_.
-is different from sympy in the sense that while Theano allows symbolic
+Theano focuses more on tensor expressions than Sympy, and has more machinery
-manipulation it puts more emphasis on the evaluation of these expressions
+for compilation.  Sympy has more sophisticated algebra rules and can
-and being able to repeatedly evaluate them on many different inputs. Theano
+handle a wider variety of mathematical operations (such as series, limits, and integrals).
-is also better suited to handling large tensors which have no
-assumed structures.
 If numpy_ is to be compared to MATLAB_ and sympy_ to Mathematica_,
-Theano is a sort of hybrid of the two which tries to make the best of
+Theano is a sort of hybrid of the two which tries to combine the best of
 both worlds.
@@ -134,7 +129,8 @@ Getting started
  the :ref:`tutorial` first though.
-A PDF version of the online documentation may be found `here <theano.pdf>`_.
+A PDF version of the online documentation may be found `here
+<http://deeplearning.net/theanodoc/theano.pdf>`_.
 Contact us

--- a/doc/library/tensor/basic.txt
+++ b/doc/library/tensor/basic.txt
@@ -331,6 +331,8 @@ Indexing
 Basic indexing.
+    Mirrors numpy's `basic indexing  <http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html>`_. Read that page first.
 Advanced indexing.
 .. _libdoc_tensor_elementwise:

--- a/doc/links.txt
+++ b/doc/links.txt
@@ -40,10 +40,10 @@ This is a sort of memo for developers and would-be developers.
 .. _mercurial: http://www.selenic.com/mercurial/wiki/
 .. _nosetests: http://somethingaboutorange.com/mrl/projects/nose/
 .. _numpy: http://numpy.scipy.org/
-.. _python: http://www.python.or
+.. _python: http://www.python.org
 .. _scipy: http://scipy.org/
-.. _autodiff: http://autodiff.org
+.. _autodiff: http://www.autodiff.org
 .. _boost.python: http://www.boost.org/doc/libs/1_38_0/libs/python/doc/index.html
 .. _cython: http://www.cython.org/
 .. _liboil: http://liboil.freedesktop.org/wiki/

--- a/doc/tutorial/symbolic_graphs.txt
+++ b/doc/tutorial/symbolic_graphs.txt
@@ -41,9 +41,10 @@ details about these building blocks see :ref:`variable`, :ref:`op`,
 .. figure:: apply.png 
    :align: center
-    Arrows represent references to the Python objects pointed at. The blue
-    box is an :ref:`apply` node. Red boxes are :ref:`variable` nodes. Green
+Arrows represent references to the Python objects pointed at. The blue
-    circles are :ref:`Ops <op>`. Purple boxes are :ref:`Types <type>`.
+box is an :ref:`apply` node. Red boxes are :ref:`variable` nodes. Green
+circles are :ref:`Ops <op>`. Purple boxes are :ref:`Types <type>`.
 The graph can be traversed starting from outputs (the result of some
@@ -104,7 +105,7 @@ how to compute the gradient of the node's outputs with respect to its
 inputs. Note that if an :ref:`op` does not provide this information, 
 it is assumed that the gradient does not defined.
 Using the 
-`chain rule <http://en.wikipedia.org/wiki/Chain_rile>`_ 
+`chain rule <http://en.wikipedia.org/wiki/Chain_rule>`_ 
 these gradients can be composed in order to obtain the expression of the 
 gradient of the graph's output with respect to the graph's inputs .

--- a/theano/sandbox/conv.py
+++ b/theano/sandbox/conv.py
--- a/theano/sandbox/scan.py
+++ b/theano/sandbox/scan.py
@@ -62,17 +62,6 @@ def scan(fn, sequences, initial_states, non_sequences, inplace_map={},
    # compute number of sequences and number of seqs   
    n_seqs     = len(seqs)
-    # see if there are outputs that do not feed anything back to the function
-    # applied recursively
-    #outs_tapkeys = outputs_taps.keys()
-    #outs_tapkeys.sort()
-    #for k in outs_tapkeys:
-    #    if outputs_taps[k] == []:
-    #        # add empty lists where you have outputs that do not have past 
-    #        # values
-    #        init_outs = init_outs[:k] + [[]] + init_outs[k:]
    n_outs   = len(init_outs)

--- a/theano/sandbox/test_conv.py
+++ b/theano/sandbox/test_conv.py
--- a/theano/sandbox/test_scan.py
+++ b/theano/sandbox/test_scan.py
-from scan import Scan
 import unittest
 import theano
+import theano.sandbox.scan
 import random
 import numpy.random
@@ -74,6 +75,14 @@ def verify_grad(op, pt, n_tests=2, rng=None, eps = None, tol = None,
+def compareArrays(a,b):
+    if type(a) in (list,tuple):
+        a = numpy.array(a)
+    if type(b) in (list, tuple):
+        b = numpy.array(b)
+    return numpy.all( abs(a-b) < 1e-5)
@@ -85,7 +94,7 @@ class T_Scan(unittest.TestCase):
    # generator network, only one output , type scalar ; no sequence or 
    # non sequence arguments
-    def test_1():
+    def test_1(self):
      def f_pow2(x_tm1):
        return (2*x_tm1, {})
@@ -94,11 +103,12 @@ class T_Scan(unittest.TestCase):
      Y = theano.sandbox.scan.scan(f_pow2, [],s, [],n_steps = n_steps)
      f1 = theano.function([s,n_steps], Y)
-      assert( numpy.any(f1([1],3)== [2,4,8])  )
+      assert(compareArrays(f1([1],3), [2,4,8]))
    # simple rnn, one input, one state, weights for each; input/state are 
    # vectors, weights are scalars
-    def test_2():
+    def test_2(self):
        def f_rnn(u_t,x_tm1,W_in, W):
            return (u_t*W_in+x_tm1*W, {})
@@ -109,14 +119,15 @@ class T_Scan(unittest.TestCase):
        Y = theano.sandbox.scan.scan(f_rnn, u,x0,[W_in,W])
-        f2 = theano.function([u,x0,W_in,W], Y)
+        f2    = theano.function([u,x0,W_in,W], Y)
+        v_u   = numpy.array([1.,2.,3.,4.])
-        assert(numpy.any(f2([1,2,3,4],[1],.1,1)== \
+        v_x0  = numpy.array([1])
-                numpy.array([1.1,1.3,1.6,2.])))
+        v_out = numpy.array([1.1,1.3,1.6,2.])
+        assert(compareArrays( f2(v_u,v_x0,.1,1), v_out   ) )
    # simple rnn, one input, one state, weights for each; input/state are 
    # vectors, weights are scalars; using shared variables
-    def test_3():
+    def test_3(self):
        u    = theano.tensor.dvector()
        x0   = theano.tensor.dvector()
@@ -128,14 +139,16 @@ class T_Scan(unittest.TestCase):
        Y = theano.sandbox.scan.scan(f_rnn_shared, u,x0,[])
-        f3 = theano.function([u,x0], Y)
+        f3    = theano.function([u,x0], Y)
+        v_u   = numpy.array([1.,2.,3.,4.])
-        assert(numpy.any(f3([1,2,3,4],[1])== numpy.array([1.1,1.3,1.6,2.])))
+        v_x0  = numpy.array([1.])
+        v_out = numpy.array([1.1,1.3,1.6,2.])
+        assert(compareArrays(f3(v_u,v_x0),v_out))
    # some rnn with multiple outputs and multiple inputs; other dimension 
    # instead of scalars/vectors
-    def test_4():
+    def test_4(self):
        W_in2 = theano.shared(numpy.array([1.,2.]), name='win2')
        W     = theano.shared(numpy.array([[2.,1.],[1.,1.]]), name='w')
@@ -152,20 +165,22 @@ class T_Scan(unittest.TestCase):
        Y = theano.sandbox.scan.scan(f_rnn_cmpl,[u1,u2],[x0,y0],W_in1)
-        f4 = theano.function([u1,u2,x0,y0,W_in1], Y)
+        f4     = theano.function([u1,u2,x0,y0,W_in1], Y)
+        v_u1   = numpy.array([[1.,2.],[1.,2.],[1.,2.]])
-        (x,y) =  f4( numpy.array([[1,2],[1,2],[1,2]]), \
+        v_u2   = numpy.array([1.,2.,3.])
-                  numpy.array([1,2,3]),             \
+        v_x0   = numpy.array([[0.,0.]])
-                  numpy.array([[0,0]]),             \
+        v_y0   = numpy.array([1])
-                  numpy.array([1]),                 \
+        v_Win1 = numpy.array([[1.,1.],[1.,1.]])
-                  numpy.array([[1,1],[1,1]]))
+        v_x    = numpy.array([[4.,5.],[18.,16.],[58.,43.]])
+        v_y    = numpy.array([0.,7.,25.])
-        assert( numpy.all(x == numpy.array([[4.,5.],[18.,16.],[58.,43.]])))
+        (x,y) =  f4( v_u1, v_u2, v_x0, v_y0, v_Win1)
-        assert( numpy.all(y == numpy.array([0.,7.,25.])))
+        assert( compareArrays(x,v_x)) 
+        assert( compareArrays(y,v_y))
    # basic ESN using updates 
-    def test_5(): 
+    def test_5(self): 
        W_in = theano.shared(numpy.array([1.,1.]), name='win')
        W    = theano.shared(numpy.array([[.1,0.],[.0,.1]]),name='w')
        W_out= theano.shared(numpy.array([.5,1.]), name='wout')
@@ -180,12 +195,15 @@ class T_Scan(unittest.TestCase):
        Y = theano.sandbox.scan.scan(f_ESN,u,y0,[],outputs_taps={0:[]})
-        f5 = theano.function([u,y0],Y)
+        f5    = theano.function([u,y0],Y)
-        assert( f5( numpy.array([1,2,3]), numpy.array([0])) == \
+        v_u   = numpy.array([1.,2.,3.])
-                 numpy.array([0.,1.4,3.15]))
+        v_y0  = numpy.array([0.])
+        v_out  = numpy.array([0.,1.5,3.15])
+        out = f5( v_u, v_y0 )
+        assert( compareArrays(v_out, out))
    # basic ESN using updates ; moving backwards
-    def test_6(): 
+    def test_6(self): 
        W_in = theano.shared(numpy.array([1.,1.]), name='win')
        W    = theano.shared(numpy.array([[.1,0.],[.0,.1]]),name='w')
        W_out= theano.shared(numpy.array([.5,1.]), name='wout')
@@ -201,9 +219,55 @@ class T_Scan(unittest.TestCase):
        Y = theano.sandbox.scan.scan(f_ESN,u,y0,[],outputs_taps={0:[]}, \
                                     go_backwards = True)
-        f6 = theano.function([u,y0],Y)
+        f6    = theano.function([u,y0],Y)
-        assert( f6( numpy.array([1,2,3]), numpy.array([0])) == \
+        v_u   = numpy.array([1.,2.,3.])
-                 numpy.array([0., 4.5, 3.45]))
+        v_y0  = numpy.array([0])
+        v_out = numpy.array([0.,4.5,3.45])
+        out   = f6(v_u, v_y0)
+        assert( compareArrays(out, v_out))
+    # simple rnn, one input, one state, weights for each; input/state are 
+    # vectors, weights are scalars; using shared variables and past 
+    # taps (sequences and outputs)
+    def test_7(self):
+        u    = theano.tensor.dvector()
+        x0   = theano.tensor.dvector()
+        W_in = theano.shared(.1, name = 'w_in')
+        W    = theano.shared(1., name ='w')
+        def f_rnn_shared(u_tm2, x_tm1, x_tm2):
+            return (u_tm2*W_in+x_tm1*W+x_tm2, {})
+        Y = theano.sandbox.scan.scan(f_rnn_shared, u,x0, [], \
+                 sequences_taps = {0:[-2]}, outputs_taps = {0:[-1,-2]})
+        f7 = theano.function([u,x0], Y)
+        #print f7([1,2,3,4],[1,2])
+    # simple rnn, one input, one state, weights for each; input/state are 
+    # vectors, weights are scalars; using shared variables and past 
+    # taps (sequences and outputs) and future taps for sequences
+    def test_8(self):
+        u    = theano.tensor.dvector()
+        x0   = theano.tensor.dvector()
+        W_in = theano.shared(.1, name = 'w_in')
+        W    = theano.shared(1., name ='w')
+        def f_rnn_shared(u_tm2,u_tp2, x_tm1, x_tm2):
+            return ((u_tm2+u_tp2)*W_in+x_tm1*W+x_tm2, {})
+        Y = theano.sandbox.scan.scan(f_rnn_shared, u,x0, [], \
+                 sequences_taps = {0:[-2,2]}, outputs_taps = {0:[-1,-2]})
+        f8 = theano.function([u,x0], Y)
+        #print f8([1,2,3,4,5,6],[1,2])
    '''
@@ -214,7 +278,8 @@ class T_Scan(unittest.TestCase):
        - test gradient (go_bacwards) 
        - test gradient (multiple outputs / some uncomputable )
        - test gradient (truncate_gradient)
-        - test gradient (force_gradient) 
+        - test gradient (force_gradient)
+        - test_gradient (taps past/future)
        - test inplace map
    '''

--- a/theano/tensor/nnet.py
+++ b/theano/tensor/nnet.py
@@ -1020,13 +1020,18 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
    #           / softmax(x)
    #   which arises from the gradient of log(softmax(x))[arange(y.shape[0]), y]
    #
-    # TODO: explain variants of case 1.
-    # TODO: explain other variants of case 2.
    # In some cases, in case 2., insted of "-1. like (AdvancedSubtensor...)",
    # we can have "-1. like ([-1] * AdvancedSubtensor...)". This case will be
    # recognized too, but other variants, even with the same shape, might not
    # (yet).
+    # The base cases are realized when the gradient of the
+    # cost wrt the output is equal to 1.  When this gradient
+    # has another (scalar) value, it typically appears in the
+    # second argument of AdvancedIncSubtensor. In that case, we
+    # try to extract it, and feed it as the output gradient of
+    # crossentropy_softmax_1hot_with_bias_dx.
    #
    # N.B. Regarding clients -- This substitution is important for numerical stability, so we
    # perform the substitution even when intermediate values have multiple clients.
@@ -1052,43 +1057,60 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
        else:
            return
-        # Check that incr has the form -1./sm[arange(len(y)), y]
+        # In the base case (output gradient = 1), incr is -1./sm[arange(len(y)), y]
+        # Here, we are looking for the AdvancedSubtensor term (sm[arange(len(y)), y]),
+        # the remainder of the expression will be used to compute outgrad_factor.
+        # outgrad_factor will be constructed in 3 steps as follows:
+        # outgrad_factor = +/- 1 (initial sign)
+        # outgrad_factor *= numerator
+        # outgrad_factor /= denominator
+        adv_subtensor = None
+        outgrad_factor = 1.
+        # If there's a 'minus' sign before the whole expression, put it in
+        # outgrad_factor and iterate
+        if incr.owner and incr.owner.op == tensor.neg:
+            outgrad_factor = -1.
+            incr = incr.owner.inputs[0]
        if incr.owner and incr.owner.op == tensor.true_div:
            num, denom = incr.owner.inputs
-            if not (hasattr(num, 'data') and numpy.all(num.data == -1)):
+            # set outgrad_factor according to the numerator,
+            # it may be divided later
+            if hasattr(num, 'data') and numpy.all(num.data == -1):
+                # Base case, num is -1
+                outgrad_factor *= 1.
+            elif numpy.all(num.broadcastable):
+                # Otherwise, it should be a scalar
+                outgrad_factor *= -num
+            else:
                return
-            #else: OK
            if not denom.owner:
                return
-            adv_subtensor = None
            if isinstance(denom.owner.op, tensor.AdvancedSubtensor):
+                # Base case
                adv_subtensor = denom
-                mult_factor = 1
+                outgrad_factor /= 1.
            elif denom.owner.op == tensor.mul:
-                # Try to find the AdvancedSubtensor node mentionned above
+                # Try to find the AdvancedSubtensor node mentioned above,
-                # For now, we support only the case where the other inputs
+                # and a scalar that is equal to the output gradient
-                # of the "mul" node are of integer type, so we are sure it
-                # does not affect the gradient computation.
                for i, input in enumerate(denom.owner.inputs):
                    if input.owner and isinstance(input.owner.op, tensor.AdvancedSubtensor):
-                        adv_subtensor = input
                        other_inputs = [in_ for (j, in_) in enumerate(denom.owner.inputs) if j!=i]
                        if len(other_inputs) == 1:
-                            mult_factor = other_inputs[0]
+                            rest = other_inputs[0]
                        else:
-                            mult_factor = tensor.mul(*[other_inputs])
+                            rest = tensor.mul(*[other_inputs])
-                        # Check that mult_factor is of integer type
+                        # Check that rest is a scalar
-                        if mult_factor.dtype.startswith('int')\
+                        if numpy.all(rest.broadcastable):
-                                or mult_factor.dtype.startswith('uint'):
+                            adv_subtensor = input
-                            #OK
+                            outgrad_factor /= rest
                            break
-                        else:
-                            # That subtensor was not right
-                            adv_subtensor = None
            else:
                return
@@ -1101,6 +1123,8 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
                if not (maybe_sm is sm and maybe_rows is rows and maybe_labels is labels):
                    return
                #else: OK
+            else:
+                return
        else:
            return
@@ -1147,7 +1171,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
            if incr.owner and incr.owner.op == tensor.fill:
                model, value = incr.owner.inputs
                adv_subtensor = None
-                mult_factor = 1
+                outgrad_factor = None
                if model.owner and isinstance(model.owner.op, tensor.AdvancedSubtensor):
                    adv_subtensor = model
                else:
@@ -1169,17 +1193,16 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
                    if not (maybe_log_sm is log_sm and maybe_rows is rows and maybe_labels is labels):
                        return
                    #else: OK
+                else:
+                    return
                # In the base case, value is the constant '-1'
                if hasattr(value, 'data') and numpy.all(value.data == -1):
-                    mult_factor = 1
+                    outgrad_factor = 1.
-                # In the case of -1/denom, if denom is of integer type
+                # Otherwise, it should be a scalar, and the output gradient
-                elif value.owner and value.owner.op == tensor.true_div:
+                # would be -value
-                    val_num, val_denom = value.owner.inputs
+                elif numpy.all(value.broadcastable):
-                    if hasattr(val_num, 'data') and numpy.all(val_num.data == -1):
+                    outgrad_factor = -value
-                        if val_denom.dtype.startswith('int')\
-                                or val_denom.dtype.startswith('uint'):
-                            mult_factor = val_denom
                else:
                    return
@@ -1204,11 +1227,10 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
    # Dimension check before substitution
    if labels.ndim == 1 and x_var.ndim == 2:
-        if mult_factor is not None:
+        if outgrad_factor is not None:
-            out_grad = tensor.fill(x_var[:,0], 1./mult_factor)
+            out_grad = tensor.fill(x_var[:,0], outgrad_factor)
            return [crossentropy_softmax_1hot_with_bias_dx(out_grad, sm, labels)]
        else:
-            print 'mult_factor is None?'
            return
    else:
        return

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -346,7 +346,7 @@ def local_IncSubtensor_serialize(node):
    #
    #  add(x, incsubtensor(b, c), incsubtensor(b, d))
-    #  -> incsubtensor(incsubtensor(add(x,b), c), d)
+    #  -> incsubtensor(incsubtensor(add(x,b,b), c), d)
    """
    def movable(i):
@@ -354,7 +354,8 @@ def local_IncSubtensor_serialize(node):
        return i.owner \
                and isinstance(i.owner.op, T.IncSubtensor) \
                and i.type == o_type \
-                and len(i.clients) == 1
+                and len(i.clients) == 1 \
+                and not i.owner.op.set_instead_of_inc
    if node.op == T.add:
        o_type = node.outputs[0].type
@@ -383,7 +384,8 @@ def local_IncSubtensor_serialize(node):
 @gof.local_optimizer([None])
 def local_inplace_setsubtensor(node):
    if isinstance(node.op, T.IncSubtensor) and not node.op.inplace:
-        new_op = T.IncSubtensor(node.op.idx_list, inplace=True)
+        new_op = T.IncSubtensor(node.op.idx_list, inplace=True, \
+                        set_instead_of_inc=node.op.set_instead_of_inc)
        new_node = new_op(*node.inputs)
        return [new_node]
    return False
@@ -932,8 +934,11 @@ def local_neg_neg(node):
 @register_specialize
 @gof.local_optimizer([T.neg])
 def local_neg_div_neg(node):
+    """- (-a / b) -> a / b
+    Also performs - (c / b) -> ((-c) / b) when c is a scalar constant.
+    """
    if node.op == T.neg:
-        """- (-a / b) -> a / b"""
        if node.inputs[0].owner and node.inputs[0].owner.op == T.true_div:
            frac = node.inputs[0]
            num, denom = frac.owner.inputs
@@ -942,6 +947,11 @@ def local_neg_div_neg(node):
                    # No other clients of the original division
                    new_num = num.owner.inputs[0]
                    return [T.true_div(new_num, denom)]
+            elif numpy.all(num.broadcastable) and isinstance(num, gof.Constant):
+                if len(frac.clients) == 1:
+                    new_num = -num.data
+                    return [T.true_div(new_num, denom)]
 @gof.local_optimizer([T.mul])
 def local_mul_zero(node):

--- a/theano/tensor/tests/test_nnet.py
+++ b/theano/tensor/tests/test_nnet.py