Commit cd9e62a0, authored by James Bergstra

merge NC

.. _advanced_theano:
***************
Advanced Theano
***************
Compilation pipeline
--------------------
.. image:: pics/pipeline.png
:width: 400 px
Inplace optimization
--------------------
- Two types of in-place operations:
- An op that returns a view of its inputs (e.g. reshape, in-place transpose)
- An op that writes its output into the memory space of one of its inputs
- This enables some memory optimizations
- Ops must tell Theano whether they work in-place
- In-place ops add constraints to the order of execution
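Why the ordering constraint arises can be sketched in plain Python (a toy analogue, not Theano code): once an op overwrites its input's memory, every consumer of the old value must already have run.

```python
def double_inplace(buf):
    """Toy in-place op: overwrite buf with 2*buf, reusing its memory."""
    for i in range(len(buf)):
        buf[i] *= 2
    return buf            # returns the *same* object: the output aliases the input

x = [1, 2, 3]
s = sum(x)                # a reader of x: must be scheduled BEFORE the in-place op
y = double_inplace(x)     # destroys the original contents of x
assert y is x             # the output reuses the input's memory
assert s == 6             # correct only because sum() ran first
```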
Profiling
---------
- To replace the default mode with this mode, use the Theano flag ``mode=ProfileMode``
- To enable memory profiling, use the flag ``ProfileMode.profile_memory=True``
Theano output:
.. code-block:: python
"""
Time since import 33.456s
Theano compile time: 1.023s (3.1% since import)
Optimization time: 0.789s
Linker time: 0.221s
Theano fct call 30.878s (92.3% since import)
Theano Op time 29.411s 87.9%(since import) 95.3%(of fct call)
Theano function overhead in ProfileMode 1.466s 4.4%(since import)
4.7%(of fct call)
10001 Theano fct call, 0.003s per call
Rest of the time since import 1.555s 4.6%
Theano fct summary:
<% total fct time> <total time> <time per call> <nb call> <fct name>
100.0% 30.877s 3.09e-03s 10000 train
0.0% 0.000s 4.06e-04s 1 predict
Single Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call> <nb_call>
<nb_op> <nb_apply> <Op name>
87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 1 1 <Gemv>
9.7% 97.0% 2.843s 28.515s 2.84e-04s 10001 1 2 <Dot>
2.4% 99.3% 0.691s 29.206s 7.68e-06s * 90001 10 10 <Elemwise>
0.4% 99.7% 0.127s 29.334s 1.27e-05s 10000 1 1 <Alloc>
0.2% 99.9% 0.053s 29.386s 1.75e-06s * 30001 2 4 <DimShuffle>
0.0% 100.0% 0.014s 29.400s 1.40e-06s * 10000 1 1 <Sum>
0.0% 100.0% 0.011s 29.411s 1.10e-06s * 10000 1 1 <Shape_i>
(*) Op is running a c implementation
Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call>
<nb_call> <nb apply> <Op name>
87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 1 Gemv{inplace}
9.7% 97.0% 2.843s 28.515s 2.84e-04s 10001 2 dot
1.3% 98.2% 0.378s 28.893s 3.78e-05s * 10000 1 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}
0.4% 98.7% 0.127s 29.021s 1.27e-05s 10000 1 Alloc
0.3% 99.0% 0.092s 29.112s 9.16e-06s * 10000 1 Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)]
0.1% 99.3% 0.033s 29.265s 1.66e-06s * 20001 3 InplaceDimShuffle{x}
... (remaining 11 Apply account for 0.7%(0.00s) of the runtime)
(*) Op is running a c implementation
Apply-wise summary:
<% of local_time spent at this position> <cumulative %%>
<apply time> <cumulative seconds> <time per call>
<nb_call> <Apply position> <Apply Op name>
87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 15 Gemv{inplace}(w, TensorConstant{-0.01}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, TensorConstant{0.9998})
9.7% 97.0% 2.843s 28.515s 2.84e-04s 10000 1 dot(x, w)
1.3% 98.2% 0.378s 28.893s 3.78e-05s 10000 9 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}(y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0)
0.4% 98.7% 0.127s 29.020s 1.27e-05s 10000 10 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
0.3% 99.0% 0.092s 29.112s 9.16e-06s 10000 13 Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0,0)](Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)].0, Alloc.0, y, Elemwise{Composite{neg,sub}}[(0,0)].0, Elemwise{sub,no_inplace}.0, InplaceDimShuffle{x}.0)
0.3% 99.3% 0.080s 29.192s 7.99e-06s 10000 11 Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)](Elemwise{neg,no_inplace}.0)
... (remaining 14 Apply instances account for
0.7%(0.00s) of the runtime)
Profile of Theano functions memory:
(This checks only the outputs of each apply node. It doesn't check the temporary memory used by the op inside the apply node.)
Theano fct: train
Max without gc, inplace and view (KB) 2481
Max FAST_RUN_NO_GC (KB) 16
Max FAST_RUN (KB) 16
Memory saved by view (KB) 2450
Memory saved by inplace (KB) 15
Memory saved by GC (KB) 0
<Sum apply outputs (bytes)> <Apply outputs memory size(bytes)>
<created/inplace/view> <Apply node>
<created/inplace/view> is taken from the op declaration, not ...
2508800B [2508800] v InplaceDimShuffle{1,0}(x)
6272B [6272] i Gemv{inplace}(w, ...)
3200B [3200] c Elemwise{Composite{...}}(y, ...)
Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list).
Test them first, as they are not guaranteed to always provide a speedup.
- Try the Theano flag floatX=float32
"""
Exercise 4
-----------
- In the last exercises, do you see a speed up with the GPU?
- Where does it come from? (Use ProfileMode)
- Is there something we can do to speed up the GPU version?
Printing/Drawing Theano graphs
------------------------------
- Pretty Printing
``theano.printing.pprint(variable)``
>>> theano.printing.pprint(prediction)
gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \dot w)) - b)))), TensorConstant{0.5})
- Debug Print
``theano.printing.debugprint({fct, variable, list of variables})``
>>> theano.printing.debugprint(prediction)
Elemwise{gt,no_inplace} [@181772236] ''
|Elemwise{true_div,no_inplace} [@181746668] ''
| |InplaceDimShuffle{x} [@181746412] ''
| | |TensorConstant{1} [@181745836]
| |Elemwise{add,no_inplace} [@181745644] ''
| | |InplaceDimShuffle{x} [@181745420] ''
| | | |TensorConstant{1} [@181744844]
| | |Elemwise{exp,no_inplace} [@181744652] ''
| | | |Elemwise{sub,no_inplace} [@181744012] ''
| | | | |Elemwise{neg,no_inplace} [@181730764] ''
| | | | | |dot [@181729676] ''
| | | | | | |x [@181563948]
| | | | | | |w [@181729964]
| | | | |InplaceDimShuffle{x} [@181743788] ''
| | | | | |b [@181730156]
|InplaceDimShuffle{x} [@181771788] ''
| |TensorConstant{0.5} [@181771148]
>>> theano.printing.debugprint(predict)
Elemwise{Composite{neg,{sub,{{scalar_sigmoid,GT},neg}}}} [@183160204] '' 2
|dot [@183018796] '' 1
| |x [@183000780]
| |w [@183000812]
|InplaceDimShuffle{x} [@183133580] '' 0
| |b [@183000876]
|TensorConstant{[ 0.5]} [@183084108]
- Picture Printing of Graphs
>>> theano.printing.pydotprint_variables(prediction)
.. image:: pics/logreg_pydotprint_prediction.png
:width: 800 px
All ``pydotprint*`` functions require graphviz and pydot
>>> theano.printing.pydotprint(predict)
.. image:: pics/logreg_pydotprint_predic.png
:width: 800 px
>>> theano.printing.pydotprint(train) # This is a small train example!
.. image:: pics/logreg_pydotprint_train.png
:width: 1500 px
Debugging
---------
- Run with the flag ``mode=DebugMode``
- 100-1000x slower
- Tests all optimization steps from the original graph to the final graph
- Checks many things that Ops should/shouldn't do
- Executes both the Python and C code versions
- Run with the Theano flag ``compute_test_value={'off', 'ignore', 'warn', 'raise'}``
- Runs the code as you create the graph
- Allows you to find bugs earlier (e.g. a shape mismatch)
- Makes it easier to identify where the problem is in *your* code
- Uses the values of constants and shared variables directly
- For purely symbolic variables, use ``x.tag.test_value = numpy.random.rand(5,10)``
- Run with the flag ``mode=FAST_COMPILE``
- Few optimizations
- Runs Python code (better error messages; can be debugged interactively in the Python debugger)
Loops
-----
**Scan**
- General form of **recurrence**, which can be used for looping
- **Reduction** and **map** (looping over the leading dimension) are special cases of scan
- You 'scan' a function along some input sequence, producing an output at each time-step
- The function can see the **previous K time-steps** of its output
- ``sum()`` could be computed by scanning the ``z + x(i)`` function over a list, given an initial state of ``z=0``
- Often a for-loop can be expressed as a ``scan()`` operation; ``scan`` is the closest Theano comes to looping
- Advantages of using ``scan`` over for-loops:
- The number of iterations can be part of the symbolic graph
- Minimizes GPU transfers when a GPU is involved
- Computes gradients through sequential steps
- Slightly faster than a for-loop in Python calling a compiled Theano function
- Can lower overall memory usage by detecting the actual amount of memory needed
**Scan Example: Computing pow(A,k)**
.. code-block:: python
import theano
import theano.tensor as T
k = T.iscalar("k"); A = T.vector("A")
def inner_fct(prior_result, A): return prior_result * A
# Symbolic description of the result
result, updates = theano.scan(fn=inner_fct,
outputs_info=T.ones_like(A),
non_sequences=A, n_steps=k)
# Scan has provided us with A**1 through A**k. Keep only the last
# value. Scan notices this and does not waste memory saving them.
final_result = result[-1]
power = theano.function(inputs=[A,k], outputs=final_result,
updates=updates)
print power(range(10),2)
#[ 0. 1. 4. 9. 16. 25. 36. 49. 64. 81.]
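For comparison, here is a plain-Python sketch of what this scan computes (no Theano required; the function name is mine):

```python
def scan_pow(A, k):
    """Python analogue of the scan above: start from ones (outputs_info),
    multiply by A at each of the k steps, and keep only the last result."""
    result = [1.0] * len(A)                           # outputs_info=T.ones_like(A)
    for _ in range(k):                                # n_steps=k
        result = [r * a for r, a in zip(result, A)]   # inner_fct
    return result

print(scan_pow(range(10), 2))
# [0.0, 1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0, 81.0]
```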
**Scan Example: Calculating a Polynomial**
.. code-block:: python
import numpy
import theano
import theano.tensor as T
coefficients = theano.tensor.vector("coefficients")
x = T.scalar("x"); max_coefficients_supported = 10000
# Generate the components of the polynomial
full_range=theano.tensor.arange(max_coefficients_supported)
components, updates = theano.scan(fn=lambda coeff, power, free_var:
coeff * (free_var ** power),
outputs_info=None,
sequences=[coefficients, full_range],
non_sequences=x)
polynomial = components.sum()
calculate_polynomial = theano.function(inputs=[coefficients, x],
outputs=polynomial)
test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
print calculate_polynomial(test_coeff, 3)
# 19.0
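The same computation in plain Python, one component per (coefficient, power) pair and then a sum, which is what the scan builds symbolically (a sketch, no Theano needed):

```python
def polynomial(coefficients, x):
    """Evaluate sum(c_i * x**i), mirroring sequences=[coefficients, full_range]."""
    return sum(coeff * x ** power
               for power, coeff in enumerate(coefficients))

print(polynomial([1, 0, 2], 3))   # 1*3**0 + 0*3**1 + 2*3**2 = 19
```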
Exercise 5
-----------
- Run both examples
- Modify and execute the polynomial example to have the reduction done by scan
Known limitations
-----------------
- Compilation phase distinct from execution phase
- Compilation time can be significant
- Amortize it by calling functions on big inputs, or by reusing functions
- Execution overhead
- Needs a certain number of operations to be useful
- We have started working on this in a branch
- Compilation time superlinear in the size of the graph.
- A few hundred nodes is fine
- Disabling a few optimizations can speed up compilation
- Usually too many nodes indicates a problem with the graph
- Lazy evaluation in a branch (We will try to merge this summer)
.. _extending_theano:
****************
Extending Theano
****************
Theano graphs
-------------
- Theano works with symbolic graphs
- Those graphs are bi-partite graphs (graph with 2 types of nodes)
- The two node types are Apply nodes and Variable nodes
Inputs and outputs are lists of Theano variables
.. image:: pics/apply_node.png
:width: 500 px
Op contract
-----------
.. code-block:: python
import theano
class MyOp(Op):
def __eq__(self, other):
def __hash__(self):
def __str__(self):
def make_node(self, x):
# Python implementation:
def perform(self, node, inputs_storage, output_storage):
# C implementation: [see theano web site]
# other implementations (PyCUDA, ...):
def make_thunk(self, node, storage_map, _, _2):
# optional:
def __init__(self, ...):
def grad(self, inputs, g):
def infer_shape(node, (i0_shapes, ...))
Op example
----------
.. code-block:: python
import theano
class DoubleOp(theano.Op):
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, x):
x = theano.tensor.as_tensor_variable(x)
return theano.Apply(self, [x], [x.type()])
def perform(self, node, inputs, output_storage):
x = inputs[0]
z = output_storage[0]
z[0] = x * 2
Test it!
>>> x = theano.tensor.matrix()
>>> f = theano.function([x],DoubleOp()(x))
>>> import numpy
>>> inp = numpy.random.rand(5,5)
>>> out = f(inp)
>>> assert numpy.allclose(inp*2, out)
>>> print inp
>>> print out
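The ``output_storage`` convention that ``perform`` relies on can be mimicked in plain Python: every output is a one-element list acting as a storage cell, and ``perform`` writes its result into ``cell[0]`` (a sketch with hypothetical names, no Theano needed):

```python
def perform_double(inputs, output_storage):
    """Same shape as DoubleOp.perform: read input values, write into cells."""
    x = inputs[0]
    z = output_storage[0]         # a one-element list acting as a storage cell
    z[0] = [v * 2 for v in x]     # DoubleOp's logic on a plain list

out_cell = [None]                 # empty cell, filled in by perform
perform_double([[1, 2, 3]], [out_cell])
assert out_cell[0] == [2, 4, 6]
```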
Exercises 7
-----------
- Run the code in the file double_op.py.
- Modify and execute it to compute: x * y
- Modify and execute the example to return 2 outputs: x + y and x - y
- Our current elemwise fusion generates computations with only one output
Theano + PyCUDA
---------------
.. code-block:: python
import numpy, theano
import theano.misc.pycuda_init
from pycuda.compiler import SourceModule
import theano.sandbox.cuda as cuda
class PyCUDADoubleOp(theano.Op):
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, inp):
inp = cuda.basic_ops.gpu_contiguous(
cuda.basic_ops.as_cuda_ndarray_variable(inp))
assert inp.dtype == "float32"
return theano.Apply(self, [inp], [inp.type()])
def make_thunk(self, node, storage_map, _, _2):
mod = SourceModule("""
__global__ void my_fct(float * i0, float * o0, int size) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<size){
o0[i] = i0[i]*2;
}
}""")
pycuda_fct = mod.get_function("my_fct")
inputs = [ storage_map[v] for v in node.inputs]
outputs = [ storage_map[v] for v in node.outputs]
def thunk():
z = outputs[0]
if z[0] is None or z[0].shape!=inputs[0][0].shape:
z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
grid = (int(numpy.ceil(inputs[0][0].size / 512.)),1)
pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
block=(512,1,1), grid=grid)
return thunk
Test it!
>>> x = theano.tensor.fmatrix()
>>> f = theano.function([x], PyCUDADoubleOp()(x))
>>> xv=numpy.ones((4,5), dtype="float32")
>>> assert numpy.allclose(f(xv), xv*2)
>>> print numpy.asarray(f(xv))
Exercises 8
-----------
- Run the above example
- Modify and execute the example to multiply two matrices: x * y
- Modify and execute the example to return 2 outputs: x + y and x - y
- Our current elemwise fusion generates computations with only one output
- Modify and execute the example to support strides (don't force the input to be C-contiguous)
.. _gpundarray:
**********
GpuNdArray
**********
Why a common GPU ndarray?
- Currently there are at least 4 different GPU array data structures in use by Python packages
- CudaNdarray (Theano), GPUArray (PyCUDA), CUDAMatrix (cudamat), GPUArray (PyOpenCL), ...
- There are even more if we include other languages
- All of them are a subset of the functionality of ``numpy.ndarray`` on the GPU
- Lots of duplicated effort
- GPU code is harder/slower to write **correctly** and **fast** than CPU/Python code
- Lack of a common array API makes it harder to port/reuse code
- Also harder to find/distribute code
- Divides development work
Design Goals
- Make it VERY similar to ``numpy.ndarray``
- Be compatible with both CUDA and OpenCL
- Have the base object accessible from C to allow collaboration with more projects, across high-level languages
- We want people using C, C++, Ruby, R, ... to all use the same base GPU N-dimensional array
Final GpuNdArray Note
- Under development
- Will be the next GPU array container for Theano (this summer!)
- Probably also for PyCUDA, PyOpenCL
- Mailing list: http://lists.tiker.net/listinfo/gpundarray
.. _index:
=========================
GPU programming made Easy
=========================
.. toctree::
introduction
theano
advanced_theano
pyCUDA
extending_theano
gpundarray
.. _introduction:
************
Introduction
************
Theano motivations
------------------
Theano tries to be the **holy grail** of computing: *easy to code* and *fast to execute*!
It works only on mathematical expressions, so you won't have:
- Function calls inside a Theano function
- Structures, enums
- Dynamic types (Theano is fully typed)
Unfortunately it doesn't make coffee... yet.
.. image:: pics/Caffeine_Machine_no_background_red.png
Theano status
-------------
Why you can rely on Theano:
- Theano has been developed and used since January 2008 (3.5 yrs old)
- Core technology for a funded Silicon-Valley startup
- Driven over 40 research papers in the last few years
- Good user documentation
- Active mailing list with participants from outside our lab
- Many contributors (some from outside our lab)
- Used to teach IFT6266 for two years
- Used by everyone in our lab (~30 people)
- Deep Learning Tutorials
- Unofficial RPMs for Mandriva
- Downloads (June 8 2011, since last January): PyPI 780, MLOSS: 483, Assembla ("bleeding edge" repository): unknown
Why scripting for GPUs?
------------------------
**GPUs?**
- Faster, cheaper, more efficient power usage
- How much faster? I have seen numbers from 100x slower to 1000x faster.
- It depends on the algorithms
- How the benchmark is done
- Quality of implementation
- How much time was spent optimizing CPU vs GPU code
- In Theory:
- Intel Core i7 980 XE (107Gf/s float64) 6 cores
- NVIDIA C2050 (515 Gf/s float64, 1Tf/s float32) 480 cores
- NVIDIA GTX580 (1.5Tf/s float32) 512 cores
- Theano goes up to 100x faster on the GPU because we don't use multiple cores on the CPU
- Theano can be linked with multi-core capable BLAS (GEMM and GEMV)
- If you see 1000x, it probably means the benchmark is not fair
**Scripting for GPUs?**
They *Complement each other*
- GPUs are everything that scripting/high level languages are not
- Highly parallel
- Very architecture-sensitive
- Built for maximum FP/memory throughput
- CPU: largely restricted to control
- Optimized for sequential code and low latency (rather than high throughput)
- Tasks (1000/sec)
- Scripting fast enough
Theano vs PyCUDA vs PyOpenCL vs CUDA
------------------------------------
- Theano
- Mathematical expression compiler
- Generates custom C and CUDA code
- Uses Python code when performance is not critical
- CUDA
- A C extension by NVIDIA for programming and using GPUs
- PyCUDA (Python + CUDA)
- Python interface to CUDA
- Memory management of GPU objects
- Compilation of code for the low-level driver
- PyOpenCL (Python + OpenCL)
- PyCUDA for OpenCL
Python
------
- Interpreted language
- General-purpose high-level programming language
- OO and scripting language
- Emphasizes code readability
- Large and comprehensive standard library
- Indentation for block delimiters
- Dynamic type and memory management
- Dictionary ``d={'var1':'value1', 'var2':42, ...}``
- List comprehension: ``[i+3 for i in range(10)]``
NumPy
-----
- Base scientific computing package in Python on the CPU
- A powerful N-dimensional array object
- ndarray.{ndim, shape, size, dtype, itemsize, stride}
- Sophisticated broadcasting functions
- ``numpy.random.rand(4,5) * numpy.random.rand(1,5)`` -> mat(4,5)
- ``numpy.random.rand(4,5) * numpy.random.rand(4,1)`` -> mat(4,5)
- ``numpy.random.rand(4,5) * numpy.random.rand(5)`` -> mat(4,5)
- Tools for integrating C/C++ and Fortran code
- Linear algebra, Fourier transform and pseudorandom number generation
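The rule behind the broadcasting examples above can be written down in a few lines of plain Python (a sketch of NumPy's shape rule, not its actual implementation):

```python
def broadcast_shape(s1, s2):
    """Right-align the two shapes, pad the shorter one with 1s, and let a
    dimension of size 1 stretch to match the other."""
    s1, s2 = list(s1), list(s2)
    while len(s1) < len(s2):
        s1.insert(0, 1)
    while len(s2) < len(s1):
        s2.insert(0, 1)
    out = []
    for a, b in zip(s1, s2):
        if a != b and a != 1 and b != 1:
            raise ValueError("shapes are not broadcastable")
        out.append(max(a, b))
    return tuple(out)

# The three examples from the list above:
assert broadcast_shape((4, 5), (1, 5)) == (4, 5)
assert broadcast_shape((4, 5), (4, 1)) == (4, 5)
assert broadcast_shape((4, 5), (5,)) == (4, 5)
```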
.. _pyCUDA:
******
PyCUDA
******
Introduction
------------
Author: Andreas Klöckner
- PyCUDA can access Nvidia's CUDA parallel computation API from Python
- Object cleanup tied to lifetime of objects (RAII, Resource Acquisition Is Initialization).
- Makes it much easier to write correct, leak- and crash-free code
- PyCUDA knows about dependencies (e.g. it won't detach from a context before all memory allocated in it is freed)
- Convenience
- Abstractions to compile CUDA code from Python: ``pycuda.driver.SourceModule``
- A GPU memory buffer: ``pycuda.gpuarray.GPUArray``
- Completeness
- Binding to all of CUDA's driver API
- Automatic Error Checking
- All CUDA errors are automatically translated into Python exceptions
- Speed
- PyCUDA's base layer is written in C++
- Helpful documentation
Example
-------
.. code-block:: python
import pycuda.autoinit
import pycuda.driver as drv
import numpy
from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")
a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)
multiply_them(
drv.Out(dest), drv.In(a), drv.In(b),
block=(400,1,1), grid=(1,1))
assert numpy.allclose(dest, a*b)
print dest
Exercise 6
----------
- Run the above example
- Modify and execute it to work for a matrix of 20 x 10
.. _theano:
******
Theano
******
Pointers
--------
- http://deeplearning.net/software/theano/
- Announcements mailing list: http://groups.google.com/group/theano-announce
- User mailing list: http://groups.google.com/group/theano-users
- Deep Learning Tutorials: http://www.deeplearning.net/tutorial/
- Installation: https://deeplearning.net/software/theano/install.html
Description
-----------
- Mathematical symbolic expression compiler
- Dynamic C/CUDA code generation
- Efficient symbolic differentiation
- Theano computes derivatives of functions with one or many inputs.
- Speed and stability optimizations
- Gives the right answer for ``log(1+x)`` even if x is really tiny.
- Works on Linux, Mac and Windows
- Transparent use of a GPU
- float32 only for now (working on other data types)
- Doesn't work on Windows for now
- On GPU data-intensive calculations are typically between 6.5x and 44x faster. We've seen speedups up to 140x
- Extensive unit-testing and self-verification
- Detects and diagnoses many types of errors
- On CPU, common machine learning algorithms are 1.6x to 7.5x faster than competitive alternatives
- including specialized implementations in C/C++, NumPy, SciPy, and Matlab
- Expressions mimic NumPy's syntax & semantics
- Statically typed and purely functional
- Some sparse operations (CPU only)
- The project was started by James Bergstra and Olivier Breuleux
- For the past 1-2 years, I have replaced Olivier as lead contributor
Why is Theano better?
----------------------
Executing the code is faster because Theano:
- Rearranges high-level expressions
- Produces customized low-level code
- Uses a variety of backend technologies (GPU,...)
Writing the code is faster because:
- A high-level language lets you **concentrate on the algorithm**
- Theano does **automatic optimization**
- No need to manually optimize each algorithm you want to test
- Theano does **automatic, efficient symbolic differentiation**
- No need to manually differentiate your functions (tedious & error-prone for complicated expressions!)
Simple example
--------------
>>> import theano
>>> a = theano.tensor.vector("a") # declare symbolic variable
>>> b = a + a**10 # build symbolic expression
>>> f = theano.function([a], b) # compile function
>>> print f([0,1,2]) # prints `array([0,2,1026])`
================================== ==================================
Unoptimized graph Optimized graph
================================== ==================================
.. image:: pics/f_unoptimized.png .. image:: pics/f_optimized.png
================================== ==================================
Symbolic programming
- Paradigm shift: people need to use it to understand it
Exercise 1
-----------
.. code-block:: python
import theano
a = theano.tensor.vector("a") # declare variable
b = a + a**10 # build symbolic expression
f = theano.function([a], b) # compile function
print f([0,1,2])
# prints `array([0,2,1026])`
theano.printing.pydotprint_variables(b, outfile="f_unoptimized.png", var_with_name_simple=True)
theano.printing.pydotprint(f, outfile="f_optimized.png", var_with_name_simple=True)
Modify and execute the example to do this expression: a**2 + b**2 + 2*a*b
Real example
------------
**Logistic Regression**
- GPU-ready
- Symbolic differentiation
- Speed optimizations
- Stability optimizations
.. code-block:: python
import numpy
import theano
import theano.tensor as T
rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats), rng.randint(size=N,low=0, high=2))
training_steps = 10000
# Declare Theano symbolic variables
x = T.matrix("x")
y = T.vector("y")
w = theano.shared(rng.randn(feats), name="w")
b = theano.shared(0., name="b")
print "Initial model:"
print w.get_value(), b.get_value()
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b)) # Probability that target = 1
prediction = p_1 > 0.5 # The prediction thresholded
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1) # Cross-entropy loss function
cost = xent.mean() + 0.01*(w**2).sum() # The cost to minimize
gw,gb = T.grad(cost, [w,b])
# Compile
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
updates={w:w-0.1*gw, b:b-0.1*gb})
predict = theano.function(inputs=[x], outputs=prediction)
# Train
for i in range(training_steps):
pred, err = train(D[0], D[1])
print "Final model:"
print w.get_value(), b.get_value()
print "target values for D:", D[1]
print "prediction on D:", predict(D[0])
**Optimizations:**
.. code-block:: python
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b))
# 1 / (1 + T.exp(var)) -> sigmoid(var)
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1)
# log(1 - sigmoid(var)) -> -softplus(var)
prediction = p_1 > 0.5
cost = xent.mean() + 0.01*(w**2).sum()
gw,gb = T.grad(cost, [w,b])
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
# w-0.1*gw: GEMV with the dot in the grad
updates={w:w-0.1*gw, b:b-0.1*gb})
Where are those optimizations applied?
- ``log(1+exp(x))``
- ``1 / (1 + T.exp(var))`` (sigmoid)
- ``log(1-sigmoid(var))`` (softplus, stabilisation)
- GEMV (matrix-vector multiply from BLAS)
- Loop fusion
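What the softplus stabilisation buys can be demonstrated in plain Python (a sketch of the idea, not Theano's generated code): the naive form of log(sigmoid(v)) fails for large |v|, while the rewritten form stays exact.

```python
import math

def softplus(v):
    """Numerically stable log(1 + exp(v))."""
    return max(v, 0.0) + math.log1p(math.exp(-abs(v)))

def log_sigmoid_naive(v):
    return math.log(1.0 / (1.0 + math.exp(-v)))

def log_sigmoid_stable(v):
    return -softplus(-v)          # log(sigmoid(v)) rewritten via softplus

try:                              # exp(800) overflows, so the naive form blows up
    log_sigmoid_naive(-800.0)
    naive_ok = True
except (OverflowError, ValueError):
    naive_ok = False

assert not naive_ok
assert abs(log_sigmoid_stable(-800.0) - (-800.0)) < 1e-9
```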
Theano flags
------------
Theano can be configured with flags. They can be defined in two ways:
- With an environment variable: ``THEANO_FLAGS="mode=ProfileMode,ProfileMode.profile_memory=True"``
- With a configuration file that defaults to ``~/.theanorc``
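For example, setting the profiling flags for a single run via the environment (the script name ``train.py`` is hypothetical):

```shell
# One-shot: the variable applies only to this invocation (train.py is hypothetical):
#   THEANO_FLAGS="mode=ProfileMode,ProfileMode.profile_memory=True" python train.py
# Persistent: put the same settings in the configuration file instead, e.g.
#   [global]
#   mode = ProfileMode
THEANO_FLAGS="mode=ProfileMode,ProfileMode.profile_memory=True"
echo "$THEANO_FLAGS"
```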
Exercise 2
-----------
.. code-block:: python
import numpy
import theano
import theano.tensor as T
rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats).astype(theano.config.floatX),
rng.randint(size=N,low=0, high=2).astype(theano.config.floatX))
training_steps = 10000
# Declare Theano symbolic variables
x = T.matrix("x")
y = T.vector("y")
w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w")
b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
#print "Initial model:"
#print w.get_value(), b.get_value()
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b)) # Probability of having a one
prediction = p_1 > 0.5 # The prediction that is done: 0 or 1
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1) # Cross-entropy
cost = xent.mean() + 0.01*(w**2).sum() # The cost to optimize
gw,gb = T.grad(cost, [w,b])
# Compile expressions to functions
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
updates={w:w-0.01*gw, b:b-0.01*gb},
name = "train")
predict = theano.function(inputs=[x], outputs=prediction,
name = "predict")
if any( [x.op.__class__.__name__=='Gemv' for x in
train.maker.env.toposort()]):
print 'Used the cpu'
elif any( [x.op.__class__.__name__=='GpuGemm' for x in
train.maker.env.toposort()]):
print 'Used the gpu'
else:
print 'ERROR, not able to tell if theano used the cpu or the gpu'
print train.maker.env.toposort()
for i in range(training_steps):
pred, err = train(D[0], D[1])
#print "Final model:"
#print w.get_value(), b.get_value()
print "target values for D"
print D[1]
print "prediction on D"
print predict(D[0])
# Print the graph used in the slides
theano.printing.pydotprint(predict,
outfile="pics/logreg_pydotprint_predic.png",
var_with_name_simple=True)
theano.printing.pydotprint_variables(prediction,
outfile="pics/logreg_pydotprint_prediction.png",
var_with_name_simple=True)
theano.printing.pydotprint(train,
outfile="pics/logreg_pydotprint_train.png",
var_with_name_simple=True)
Modify and execute the example to run on the CPU with floatX=float32
* You will need ``theano.config.floatX`` and ``ndarray.astype(theano.config.floatX)``
GPU
---
- Only 32 bit floats are supported (being worked on)
- Only 1 GPU per process
- Use the Theano flag ``device=gpu`` to tell Theano to use the GPU
- Use ``device=gpu{0, 1, ...}`` to specify which GPU if you have more than one
- Shared variables with float32 dtype are by default moved to the GPU memory space
- Use the Theano flag ``floatX=float32``
- Be sure to use ``floatX`` (``theano.config.floatX``) in your code
- Cast inputs before putting them into a shared variable
- Casting "problem": int32 combined with float32 gives float64
- A new casting mechanism is in development
- Insert manual casts in your code or use [u]int{8,16}
- Insert manual casts around the mean operator (it divides by the length, which is an int64!)
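The promotion behaviour described above can be checked directly with NumPy, which follows the same rules on the CPU (assuming NumPy is installed):

```python
import numpy

i32 = numpy.ones(3, dtype="int32")
f32 = numpy.ones(3, dtype="float32")

# int32 combined with float32 is silently promoted to float64
assert (i32 * f32).dtype == numpy.dtype("float64")

# Workarounds from the slide: cast manually, or use a smaller int type
assert (i32.astype("float32") * f32).dtype == numpy.dtype("float32")
assert (numpy.ones(3, dtype="int16") * f32).dtype == numpy.dtype("float32")
```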
Exercise 3
-----------
- Modify and execute the example of `Exercise 2`_ to run with floatX=float32 on GPU
- Time with: ``time python file.py``
Symbolic variables
------------------
- Number of dimensions
- T.scalar, T.vector, T.matrix, T.tensor3, T.tensor4
- Dtype
- T.[fdczbwil]vector (float32, float64, complex64, complex128, int8, int16, int32, int64)
- ``T.vector`` defaults to the floatX dtype
- floatX: a configurable dtype that can be float32 or float64
- Custom variable
- All are shortcuts to: ``T.tensor(dtype, broadcastable=[False]*nd)``
- Other dtype: uint[8,16,32,64], floatX
Creating symbolic variables: Broadcastability
- Remember what I said about broadcasting?
- How to add a row to all rows of a matrix?
- How to add a column to all columns of a matrix?
- Broadcastability must be specified when creating the variable
- The only shortcuts with broadcastable dimensions are: **T.row** and **T.col**
- For all others: ``T.tensor(dtype, broadcastable=([False or True])*nd)``
Differentiation details
-----------------------
>>> gw,gb = T.grad(cost, [w,b])
- T.grad works symbolically: takes and returns a Theano variable
- T.grad can be compared to a macro: it can be applied multiple times
- T.grad takes scalar costs only
- A simple recipe allows computing vector x Jacobian and vector x Hessian products efficiently
- We are working on the missing optimizations to efficiently compute the full Jacobian and Hessian, and Jacobian x vector products
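Gradients returned by ``T.grad`` can always be sanity-checked numerically; here is a minimal central-difference checker in plain Python (a hypothetical helper, no Theano needed):

```python
def finite_diff_grad(f, xs, eps=1e-6):
    """Approximate the gradient of f at the point xs by central differences."""
    grad = []
    for i in range(len(xs)):
        up = list(xs); up[i] += eps
        dn = list(xs); dn[i] -= eps
        grad.append((f(up) - f(dn)) / (2 * eps))
    return grad

# cost(w) = w0**2 + 3*w1 has gradient (2*w0, 3); check it at (2, 5)
g = finite_diff_grad(lambda w: w[0] ** 2 + 3 * w[1], [2.0, 5.0])
assert abs(g[0] - 4.0) < 1e-4
assert abs(g[1] - 3.0) < 1e-4
```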
Benchmarks
----------
Example:
- Multi-layer perceptron
- Convolutional Neural Networks
- Misc Elemwise operations
Competitors: NumPy + SciPy, MATLAB, EBLearn, Torch5, numexpr
- EBLearn, Torch5: specialized libraries written by practitioners specifically for these tasks
- numexpr: similar to Theano, 'virtual machine' for elemwise expressions
**Multi-Layer Perceptron**:
60x784 matrix times 784x500 matrix, tanh, times 500x10 matrix, elemwise, then all in reverse for backpropagation
.. image:: pics/mlp.png
**Convolutional Network**:
256x256 images convolved with six 7x7 filters,
downsampled to 6x50x50, tanh, convolution with sixteen 6x7x7 filters, elementwise
tanh, matrix multiply, softmax elementwise, then all in reverse
.. image:: pics/conv.png
**Elemwise**
- All on CPU
- Solid blue: Theano
- Dashed Red: numexpr (without MKL)
.. image:: pics/multiple_graph.png
@@ -1183,36 +1183,46 @@ class _Linker(gof.link.LocalLinker):
thunks_py = [] #python thunks
thunks_c = [] #c thunks
compute_map = {}
for k in storage_map:
compute_map[k] = [k.owner is None]
for node in order:
node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs]
try:
if not self.maker.mode.check_c_code:
raise utils.MethodNotDefined()
e = Env(*graph.clone(node.inputs, node.outputs))
e.toposort = lambda: e.nodes #WARNING: STOCHASTIC ORDER
# Specifically... e.nodes is a set, but of only 1 element
cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs, node.outputs) if r2 in no_recycling])
thunk, node_input_filters, node_output_filters = cl.make_thunk(
input_storage = node_input_storage,
output_storage = node_output_storage)
thunk.inputs = node_input_storage
thunk.outputs = node_output_storage
thunks_c.append(thunk)
except (NotImplementedError, utils.MethodNotDefined):
thunks_c.append(None)
if hasattr(node.op, '_op_use_c_code'):
old_value = node.op._op_use_c_code
else:
old_value = False
try:
# ! Problem ! We do not know if make_thunk succeeded in
# generating a cthunk, or if it reverted back to a python
# thunk, or if it is none of the above ...
node.op._op_use_c_code = True
tmp_thunk = node.op.make_thunk(node,
storage_map,
compute_map,
no_recycling)
if hasattr(tmp_thunk, 'cthunk'):
# Arbitrary check to see if it has a C implementation
thunks_c.append(tmp_thunk)
else:
thunks_c.append(None)
finally:
node.op._op_use_c_code = old_value
if self.maker.mode.check_py_code or thunks_c[-1] is None:
try:
node.op._op_use_c_code = False
thunks_py += [node.op.make_thunk(node,
storage_map,
compute_map,
no_recycling)]
finally:
node.op._op_use_c_code = old_value
else:
thunks_py.append(None)
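The per-node fallback logic above can be sketched in plain Python (all names assumed, greatly simplified — not the actual linker API): a C thunk is tried first, and wherever the op has no C implementation the entry is `None` and the Python thunk runs instead.

```python
# Sketch of the linker's per-node thunk lists: thunks_c[i] is None
# exactly where thunks_py[i] must be used at execution time.
def build_thunks(nodes, have_c_impl, check_py_code=True):
    thunks_c, thunks_py = [], []
    for node in nodes:
        # try the C implementation first
        thunks_c.append(('c', node) if have_c_impl(node) else None)
        # keep a python thunk when requested, or as the only fallback
        if check_py_code or thunks_c[-1] is None:
            thunks_py.append(('py', node))
        else:
            thunks_py.append(None)
    return thunks_c, thunks_py

tc, tp = build_thunks(['add', 'custom'], lambda n: n == 'add',
                      check_py_code=False)
assert tc == [('c', 'add'), None]
assert tp == [None, ('py', 'custom')]
```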
......@@ -1233,6 +1243,11 @@ class _Linker(gof.link.LocalLinker):
# This is the function that runs when you evaluate the graph
#####
def f():
####
# Note: `f` ignores the compute_map and evaluates the nodes in
# topological order. In some sense, this is ok, and can be used
# for now.
#####
_logger.debug("starting a DebugMode call")
for x in no_recycling:
x[0] = None
......
......@@ -401,7 +401,9 @@ class PerformLinker(LocalLinker):
for node in order:
# Make sure we don't use the C version of the code, but rather
# only the python version
# Note : ops that implement their own make_thunk don't usually
# have this attribute defined !!
old_value = getattr(node.op, '_op_use_c_code', False)
try:
node.op._op_use_c_code = False
thunks += [node.op.make_thunk(node,
......
......@@ -1063,6 +1063,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
start_from = env.outputs
changed = True
max_use_abort = False
opt_name = None
process_count = {}
while changed and not max_use_abort:
......@@ -1099,6 +1100,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
process_count.setdefault(lopt, 0)
if process_count[lopt] > max_use:
max_use_abort = True
opt_name = lopt.name
else:
lopt_change = self.process_node(env, node, lopt)
if lopt_change:
......@@ -1110,7 +1112,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.detach_updater(env, u)
self.detach_updater(env, u) #TODO: erase this line, it's redundant at best
if max_use_abort:
_logger.error("EquilibriumOptimizer max'ed out by "+opt_name)
def print_summary(self, stream=sys.stdout, level=0):
print >> stream, "%s%s id=%i" %(' '*level, self.__class__.__name__, id(self))
......
......@@ -168,7 +168,10 @@ class EquilibriumDB(DB):
opts = super(EquilibriumDB, self).query(*tags, **kwtags)
return opt.EquilibriumOptimizer(opts,
max_depth=5,
max_use_ratio=50,# upgraded to 50 to avoid EquilibriumOptimizer
# being max'ed out by constant folding (can
# the max ratio be increased only for
# constant folding somehow?)
failure_callback=opt.NavigatorOptimizer.warn_inplace)
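The `max_use_ratio` guard that this value feeds can be sketched in pure Python (all names assumed, not the Theano API): each local rewrite may fire at most `max_use_ratio * len(nodes)` times before the equilibrium loop aborts and reports the offending optimizer by name.

```python
# Sketch of EquilibriumOptimizer's abort logic: rewrites are applied
# until nothing changes, or until one rewrite exceeds its use budget.
def equilibrium(node_rewrites, nodes, max_use_ratio):
    max_use = max_use_ratio * len(nodes)
    process_count = {}
    changed, aborted_by = True, None
    while changed and aborted_by is None:
        changed = False
        for name, rewrite in node_rewrites:
            process_count.setdefault(name, 0)
            if process_count[name] > max_use:
                aborted_by = name   # mirrors "max'ed out by <opt_name>"
                break
            new_nodes = rewrite(nodes)
            if new_nodes != nodes:
                nodes, changed = new_nodes, True
                process_count[name] += 1
    return nodes, aborted_by

# a rewrite that reaches a fixed point terminates normally
nodes, aborted = equilibrium([('sort', sorted)], [2, 1], 3)
assert nodes == [1, 2] and aborted is None
```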
......
......@@ -71,6 +71,14 @@ def test_pycuda_memory_to_theano():
print "gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y)
assert sys.getrefcount(y)==3
assert (numpy.asarray(z) == 0).all()
assert z.base is y
# Test that we can take a view from this cuda view on pycuda memory
zz = z.view()
assert sys.getrefcount(y) == 4
assert zz.base is y
del zz
assert sys.getrefcount(y) == 3
cuda_ones = cuda_ndarray.CudaNdarray(numpy.asarray([[[1]]],dtype='float32'))
z += cuda_ones
......
......@@ -50,13 +50,7 @@ class HostFromGpu(Op):
z[0] = numpy.asarray(x)
def grad(self, inputs, grads):
gz, = grads
return [gpu_from_host(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
......@@ -85,13 +79,7 @@ class GpuFromHost(Op):
z[0] = type_support_filter(theano._asarray(x, dtype='float32'), tuple([0]*x.ndim), 0, z[0])
def grad(self, inputs, grads):
gz, = grads
if isinstance(gz,CudaNdarrayType):
# This would only happen if you call Lop, and provide a tensor
# that is not cuda
# This might require another look to be sure
return [host_from_gpu(gz)]
else:
return [gz]
return [host_from_gpu(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
......
......@@ -2585,13 +2585,10 @@ int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * bas
// Get the original base object (base.base.base...)
PyObject * orig_base = base;
// base is not always a CudaNdarray. It can be a GpuArray from pycuda, ...
while (orig_base && CudaNdarray_Check(orig_base) && ((CudaNdarray*) orig_base)->base)
{
// base_base is itself a view
orig_base = ((CudaNdarray*) orig_base)->base;
}
//N.B. XDECREF and XINCREF are no-ops for NULL pointers
if (self->base != orig_base)
......
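The base-chain collapsing implemented in the C code above has the same effect as NumPy's view semantics in recent versions: a view of a view records the ultimate owner of the memory as its `base`, not the intermediate view.

```python
import numpy as np

# NumPy analogue of CudaNdarray_set_device_data's base walk:
# the base attribute always points at the original memory owner.
a = np.zeros(4, dtype='float32')
v1 = a.view()   # direct view: base is the owner `a`
v2 = v1[1:]     # view of a view: the base chain is collapsed
assert v1.base is a
assert v2.base is a
```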
......@@ -590,7 +590,7 @@ def local_gpu_advanced_incsubtensor1(node):
gpu_from_host(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1
if node.op.__class__ is tensor.AdvancedIncSubtensor1 and node.inputs[0].dtype=="float32":
x, y = node.inputs[0:2]
coords = node.inputs[2:]
go_gpu = False
......
......@@ -806,6 +806,22 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
def __init__(self, name):
return super(theano.tensor.tests.test_basic.T_subtensor, self).__init__(name)
def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """
shared = cuda.shared_constructor
#shared = tensor.shared
xval = numpy.asarray([[1,2,3], [4,5,6], [7,8,9]],
dtype='float32')
yval = numpy.asarray([[10,10,10], [10,10,10]],
dtype='float32')
x = shared(xval, name = 'x')
y = T.fmatrices('y')
expr = T.advanced_inc_subtensor1(x,y,[0,2])
f=theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op,cuda.GpuAdvancedIncSubtensor1) for node in f.maker.env.toposort() ])==1
assert numpy.allclose(f(yval),[[11.,12.,13.], [4.,5.,6.], [17.,18.,19.]])
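The semantics this test exercises correspond to NumPy's unbuffered indexed addition; a quick CPU-side sketch of what `advanced_inc_subtensor1` computes on these values:

```python
import numpy as np

x = np.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype='float32')
y = np.asarray([[10, 10, 10], [10, 10, 10]], dtype='float32')

# increment rows 0 and 2 of x by the rows of y, like
# advanced_inc_subtensor1(x, y, [0, 2]) on the GPU path
np.add.at(x, [0, 2], y)
assert x.tolist() == [[11., 12., 13.], [4., 5., 6.], [17., 18., 19.]]
```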
def test_inc_subtensor():
shared = cuda.shared_constructor
#shared = tensor.shared
......@@ -832,7 +848,6 @@ def test_set_subtensor():
dtype='float32')
expr = T.set_subtensor(x[:,1:3], y[:,1:3])
f=theano.function([x,y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op,cuda.GpuSubtensor) for node in f.maker.env.toposort() ])==1
assert sum([isinstance(node.op,cuda.GpuIncSubtensor) and node.op.set_instead_of_inc==True for node in f.maker.env.toposort() ])==1
print f(xval,yval)
......
......@@ -116,7 +116,7 @@ def test_run_nnet():
rval_gpu, tg = run_nnet(True, n_in=n_in, n_hid=n_hid)
#print "cpu:", rval_cpu
#print "gpu:", rval_gpu
abs_diff, rel_diff = theano.tensor.tensor_grad.numeric_grad.abs_rel_err(rval_gpu,rval_cpu)
max_abs_diff = abs_diff.max()
print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d"%(
max_abs_diff, rel_diff.max(), n_in, n_hid)
......
......@@ -41,4 +41,4 @@ __contact__ = "Razvan Pascanu <r.pascanu@gmail>"
import scan_opt
from scan import scan
from scan_views import map, reduce, foldl, foldr
from scan_utils import clone, until
......@@ -5,7 +5,7 @@ Scanning is a general form of recurrence, which can be used for looping.
The idea is that you *scan* a function along some input sequence, producing
an output at each time-step that can be seen (but not modified) by the
function at the next time-step. (Technically, the function can see the
previous K time-steps of your outputs and L time steps (from past and
future) of your inputs.
So for example, ``sum()`` could be computed by scanning the ``z+x_i``
......@@ -13,15 +13,21 @@ function over a list, given an initial state of ``z=0``.
Special cases:
* A *reduce* operation can be performed by using only the last
output of a ``scan``.
* A *map* operation can be performed by applying a function that
ignores previous steps of the outputs.
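A minimal pure-Python model of these semantics (illustrative only — `scan_model` is not the Theano API) makes the three cases concrete:

```python
# A stripped-down model of scan: fn sees the previous output z and the
# current sequence element x, and every step's output is recorded.
def scan_model(fn, sequence, z0):
    outputs, z = [], z0
    for x in sequence:
        z = fn(z, x)
        outputs.append(z)
    return outputs

# sum(): scan ``z + x_i`` starting from z = 0
assert scan_model(lambda z, x: z + x, [1, 2, 3, 4], 0) == [1, 3, 6, 10]
# reduce: keep only the last output
assert scan_model(lambda z, x: z + x, [1, 2, 3, 4], 0)[-1] == 10
# map: the step function ignores the previous output
assert scan_model(lambda _, x: x * x, [1, 2, 3], None) == [1, 4, 9]
```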
Often a for-loop or while-loop can be expressed as a ``scan()`` operation,
and ``scan`` is the closest that theano comes to looping. The advantages
of using ``scan`` over `for` loops in python are (among others):

* it allows the number of iterations to be part of the symbolic graph
* it allows computing gradients through the loop
* there exist a number of optimizations that help re-write your loop
  such that less memory is used and it runs faster
* it ensures that data is not copied from host to gpu and gpu to
  host at each step
The Scan Op should typically be used by calling any of the following
functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``,
......@@ -65,7 +71,8 @@ def scan( fn
, truncate_gradient = -1
, go_backwards = False
, mode = None
, name = None
, profile = False):
"""
This function constructs and applies a Scan op to the provided
arguments.
......@@ -74,27 +81,27 @@ def scan( fn
``fn`` is a function that describes the operations involved in one
step of ``scan``. ``fn`` should construct variables describing the
output of one iteration step. It should expect as input theano
variables representing all the slices of the input sequences
and previous values of the outputs, as well as all other arguments
given to scan as ``non_sequences``. The order in which scan passes
these variables to ``fn`` is the following :
* all time slices of the first sequence
* all time slices of the second sequence
* ...
* all time slices of the last sequence
* all past slices of the first output
* all past slices of the second output
* ...
* all past slices of the last output
* all other arguments (the list given as `non_sequences` to
scan)
The order of the sequences is the same as the one in the list
`sequences` given to scan. The order of the outputs is the same
as the order of ``output_info``. For any sequence or output the
order of the time slices is the same as the one in which they have
been given as taps. For example if one writes the following :
.. code-block:: python
......@@ -122,25 +129,64 @@ def scan( fn
The list of ``non_sequences`` can also contain shared variables
used in the function, though ``scan`` is able to figure those
out on its own so they can be skipped. For the clarity of the
code we recommend though to provide them to scan. To some extent
``scan`` can also figure out other ``non sequences`` (not shared)
even if they are not passed to scan (but used by `fn`). A simple
example of this would be :
.. code-block:: python
import theano.tensor as TT
W = TT.matrix()
W_2 = W**2
def f(x):
return TT.dot(x,W_2)
The function is expected to return two things. One is a list of
outputs ordered in the same order as ``outputs_info``, with the
difference that there should be only one output variable per
output initial state (even if no tap value is used). Secondly
`fn` should return an update dictionary ( that tells how to
update any shared variable after each iteration step). The
dictionary can optionally be given as a list of tuples. There is
no constraint on the order of these two lists, ``fn`` can return
either ``(outputs_list, update_dictionary)`` or
``(update_dictionary, outputs_list)`` or just one of the two (in
case the other is empty).
To use ``scan`` as a while loop, the user needs to change the
function ``fn`` such that it also returns a stopping condition.
To do so, the condition has to be wrapped in an ``until`` class.
The condition can be returned as a third element, or all the other
outputs and updates can be wrapped in ``until``. A few examples
would be :
.. code-block:: python
...
return [y1_t, y2_t], {x:x+1}, theano.scan_module.until(x < 50)
or
.. code-block:: python
...
return theano.scan_module.until(x<50, [y1_t, y2_t], {x:x+1})
Note that a number of steps (considered here as the maximum
number of steps) is still required even though a condition is
passed (it is used to allocate memory if needed). Also, when
passing multiple arguments to ``until``, be aware of its signature:
.. code-block:: python
class until(object):
def __init__(self, condition, outputs = [], updates = {}):
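The while-loop behavior can be modelled in pure Python (illustrative only — `scan_until` and the `(state, continue_flag)` convention are assumptions, not the Theano API): the step function also returns the condition, and the maximum number of steps still caps the loop.

```python
# A stripped-down model of scan-as-while: fn returns the new state and a
# boolean condition; iteration stops when the condition becomes False or
# when the maximum number of steps is reached.
def scan_until(fn, n_steps, state):
    outputs = []
    for _ in range(n_steps):
        state, cond = fn(state)
        outputs.append(state)
        if not cond:
            break
    return outputs

# doubling until the value reaches 50, analogous to until(x < 50)
assert scan_until(lambda x: (x * 2, x * 2 < 50), 100, 1) == [2, 4, 8, 16, 32, 64]
```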
:param sequences:
``sequences`` is the list of Theano variables or dictionaries
describing the sequences ``scan`` has to iterate over. If a
sequence is given as wrapped in a dictionary, then a set of optional
information can be provided about the sequence. The dictionary
should have the following keys:
......@@ -191,13 +237,6 @@ def scan( fn
``fn``. They are provided as a list of *negative* integers,
where a value ``k`` implies that at iteration step ``t`` scan
will pass to ``fn`` the slice ``t+k``.
``scan`` will follow this logic if partial information is given:
......@@ -210,12 +249,12 @@ def scan( fn
* If you wrap an output in a dictionary but you do not provide any
initial state, it assumes that you are not using any form of
taps.
* If you provide a ``None`` instead of a variable or an empty
  dictionary, ``scan`` assumes that you will not use any taps for
  this output (like for example in case of a map)
If ``outputs_info`` is an empty list or None, ``scan`` assumes
that no tap is used for any of the outputs. If information is
provided just for a subset of the outputs an exception is
raised (because there is no convention on how scan should map
the provided information to the outputs of ``fn``)
......@@ -223,8 +262,9 @@ def scan( fn
:param non_sequences:
``non_sequences`` is the list of arguments that are passed to
``fn`` at each step. One can opt to exclude variables
used in ``fn`` from this list, as long as they are part of the
computational graph, though for clarity we encourage not to do so.
:param n_steps:
......@@ -232,10 +272,9 @@ def scan( fn
or Theano scalar. If any of the input sequences do not have
enough elements, scan will raise an error. If the *value is 0* the
outputs will have *0 rows*. If the value is negative, ``scan``
will run backwards in time. If the ``go_backwards`` flag is already
set and also ``n_steps`` is negative, ``scan`` will run forward
in time. If ``n_steps`` is not provided, ``scan`` will figure
out the amount of steps it should run given its input sequences.
......@@ -257,19 +296,20 @@ def scan( fn
:param name:
When profiling ``scan``, it is crucial to provide a name for any
instance of ``scan``. The profiler will produce an overall
profile of your code as well as profiles for the computation of
one step of each instance of ``scan``. The ``name`` of the instance
appears in those profiles and can greatly help to disambiguate
information.
:param mode:
It is recommended to leave this argument to None, especially
when profiling ``scan`` (otherwise the results are not going to
be accurate). If you prefer the computations of one step of
``scan`` to be done differently than the entire function, you
can use this parameter to describe how the computations in this
loop are done (see ``theano.function`` for details about
possible values and their meaning).
......@@ -278,9 +318,9 @@ def scan( fn
Theano variable or a list of Theano variables representing the
outputs of ``scan`` (in the same order as in
``outputs_info``). ``updates`` is a dictionary specifying the
update rules for all shared variables used in scan. This
dictionary should be passed to ``theano.function`` when you
compile your function.
"""
# General observation : this code is executed only once, at creation
# of the computational graph, so we don't yet need to be smart about
......@@ -318,7 +358,7 @@ def scan( fn
else:
try :
n_fixed_steps = opt.get_constant_value(n_steps)
except (TypeError, AttributeError):
n_fixed_steps = None
# Check n_steps is an int
......@@ -346,8 +386,14 @@ def scan( fn
for i in xrange(n_outs):
if outs_info[i]:
if isinstance(outs_info[i], dict):
# DEPRECATED :
if outs_info[i].get('return_steps', None):
_logger.warning( ("Using `return_steps` has been deprecated."
" Simply select the entries you need using "
" a subtensor. Scan will optimize memory "
" consumption, so do not worry about that."))
return_steps[i] = outs_info[i]['return_steps']
# END
if not isinstance(outs_info[i], dict):
# by default any output has a tap value of -1
......@@ -539,6 +585,10 @@ def scan( fn
actual_arg = init_out['initial']
arg = safe_new(init_out['initial'])
if isinstance(arg, tensor.Constant):
# safe_new returns a clone of the constants, but that is not
# what we need for initial states
arg = arg.type()
# Try to transfer test_value to the new variable
if config.compute_test_value != 'off':
......@@ -662,17 +712,20 @@ def scan( fn
ordered_args +
non_seqs )
# add only the non-shared variables and non-constants to the arguments of
# the dummy function [ a function should not get shared variables or
# constants as input ]
dummy_args = [arg for arg in args
if (not isinstance(arg, SharedVariable) and
not isinstance(arg, tensor.Constant) )]
# when we apply the lambda expression we get a mixture of update rules
# and outputs that needs to be separated
condition, outputs, updates = scan_utils.get_updates_and_outputs(fn(*args))
if condition is not None:
as_while = True
else:
as_while = False
##
### Step 3. Check if we actually need scan and remove it if we don't
##
......@@ -681,6 +734,10 @@ def scan( fn
if n_fixed_steps in [1, -1]:
# We do not need to use the scan op anymore, so we can just return
# the outputs and updates we have
if condition is not None:
_logger.warning(('When the number of steps is fixed and equal to 1,'
' the provided stopping condition, ' + str(condition) +
' is ignored'))
for pos, inner_out in enumerate(outputs):
# we need to see if we need to pad our sequences with an
......@@ -726,8 +783,11 @@ def scan( fn
## in args is quite important
dummy_args += extra_inputs
dummy_outs = outputs
if condition is not None:
dummy_outs.append(condition)
dummy_f = function( dummy_args
, dummy_outs
, updates = updates
, mode = compile.mode.Mode(linker='py',
optimizer=None) )
......@@ -745,13 +805,18 @@ def scan( fn
# assumed outputs until now (provided by the user) there can be
# only one explanation: No information is provided for any of the
# outputs (i.e. we are dealing with a map)
tmp_dummy_f_outs = len(dummy_f.maker.outputs)
if as_while:
tmp_dummy_f_outs -= 1
if not ( tmp_dummy_f_outs == n_outs or outs_info == []):
raise ValueError('Please provide None as output_info for '
'any output that does not feed back into '
'scan (i.e. it behaves like a map) ')
if outs_info == []:
n_outs = len(dummy_f.maker.outputs)
if as_while:
n_outs = n_outs - 1
outs_info = [ dict() for x in xrange(n_outs) ]
......@@ -803,24 +868,20 @@ def scan( fn
other_inner_args = []
other_scan_args += [ arg for arg in non_seqs
if (not isinstance(arg, SharedVariable) and
not isinstance(arg, tensor.Constant)) ]
## Step 5.6 all shared variables with no update rules
other_inner_args += [ safe_new(arg,'_copy') for arg in non_seqs
if (not isinstance(arg, SharedVariable) and
not isinstance(arg, tensor.Constant))]
givens.update( dict( zip(other_scan_args, other_inner_args) ))
other_shared_scan_args = [ arg.variable for arg
in dummy_f.maker.expanded_inputs
if ( isinstance(arg.variable, SharedVariable) and
not arg.update) ]
other_shared_inner_args = [ safe_new(arg.variable, '_copy') for arg
in dummy_f.maker.expanded_inputs
if ( isinstance(arg.variable, SharedVariable) and
not arg.update) ]
......@@ -845,6 +906,8 @@ def scan( fn
sit_sot_inner_outputs +
nit_sot_inner_outputs +
shared_inner_outputs )
if condition is not None:
inner_outs.append(condition)
if cuda.cuda_available:
# very often we end up in this situation when we want to
# replace w with w_copy, where w is CudaNdarray
......@@ -886,13 +949,15 @@ def scan( fn
info['mode'] = mode
info['inplace'] = False
info['gpu'] = False
info['as_while'] = as_while
info['profile'] = profile
local_op = scan_op.Scan( inner_inputs, new_outs, info )
##
### Step 8. Compute the outputs using the scan op
##
_scan_inputs = ( scan_seqs +
mit_mot_scan_inputs +
mit_sot_scan_inputs +
sit_sot_scan_inputs +
......
......@@ -110,7 +110,7 @@ class Scan(Op):
TensorType(
broadcastable = (False,) + o.type.broadcastable
, dtype = o.type.dtype ))
# shared outputs + possibly the ending condition
for o in outputs[end:]:
if cuda.cuda_available and isinstance(o.type,
cuda.CudaNdarrayType):
......@@ -120,7 +120,8 @@ class Scan(Op):
else:
self.output_types.append( o.type )
if self.as_while:
self.output_types = self.output_types[:-1]
self.destroy_map = {}
if hasattr(self,'inplace') and self.inplace:
......@@ -154,22 +155,6 @@ class Scan(Op):
# function that we set in case none was given
self.info['name'] = self.name
# Pre-computing some values to speed up perform
self.mintaps = [ numpy.min(x) for x in self.tap_array]
self.mintaps += [ 0 for x in xrange(self.n_nit_sot) ]
......@@ -182,7 +167,10 @@ class Scan(Op):
self.n_shared_outs )
self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
self.n_tap_outs = self.n_mit_mot + self.n_mit_sot
tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs,
self.outputs)
local_env = gof.Env(tmp_in, tmp_out)
self._cmodule_key = gof.CLinker.cmodule_key_(local_env,[])
self._hash_inner_graph = hash(self._cmodule_key)
......@@ -307,15 +295,15 @@ class Scan(Op):
# If everything went OK up to here, there is still one thing to
# check. Namely, do the internal graphs represent the same
# computations?
for self_in, other_in in zip(self.inputs, other.inputs):
if self_in.type != other_in.type :
return False
if not scan_utils.equal_computations(self.outputs,
other.outputs,
self.inputs,
other.inputs,
strict = True):
return False
# If they do, then they need to match in other small details
......@@ -327,15 +315,17 @@ class Scan(Op):
gpu_str = 'gpu'
else:
gpu_str = 'cpu'
if self.as_while:
name = 'while'
else:
name = 'for'
if self.inplace :
aux_txt = '%s{inplace,%s,%s}'%(name, gpu_str, str(self.name))
else:
aux_txt = '%s{%s,%s}'%(name,gpu_str, str(self.name))
return aux_txt
def __hash__(self):
......@@ -346,6 +336,68 @@ class Scan(Op):
scan_utils.hash_listsDictsTuples(self.info) )
def make_thunk(self, node, storage_map, compute_map, no_recycling):
"""
:param node: something previously returned by self.make_node
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to
reuse memory allocated by a previous call.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
"""
node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs]
node_input_compute = [compute_map[r] for r in node.inputs]
node_output_compute = [compute_map[r] for r in node.outputs]
#logger.debug('Compiling node %i of graph' % node_idx)
# If a shared variable is the result of a ViewOp it is a clear
# indication that we need to copy that value after the perform of
# scan is done
slices = ( self.n_mit_mot_outs +
self.n_mit_sot +
self.n_sit_sot +
self.n_nit_sot )
wrapped_inputs = [Param(x, borrow=True) for x in self.inputs ]
wrapped_outputs = [Out(x, borrow=True) for x in
self.outputs[:slices] ]
wrapped_outputs += self.outputs[slices:]
profile = None
if theano.config.profile or type(self.profile) is str:
profile = ScanProfileStats(name = self.name)
elif self.profile:
profile = self.profile
self.fn = function(wrapped_inputs,
wrapped_outputs,
mode = self.mode_instance,
name = self.name,
profile = profile)
p = self.perform
# default arguments are stored in the closure of `rval`
def rval(p=p, i=node_input_storage, o=node_output_storage, n=node):
r = p(n, [x[0] for x in i], o)
for o in node.outputs:
compute_map[o][0] = True
return r
rval.inputs = node_input_storage
rval.outputs = node_output_storage
rval.perform = p
rval.lazy = False
return rval
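The `storage_map` / `compute_map` convention documented and used above can be sketched in a few lines of plain Python (simplified, names assumed): storage cells are one-element lists shared between thunks, and `compute_map` booleans mark which cells hold valid values.

```python
# One-element lists act as mutable storage cells shared by all thunks.
storage_map = {'x': [2.0], 'y': [None]}
compute_map = {'x': [True], 'y': [False]}

def make_thunk(op, in_names, out_names):
    def rval():
        # read inputs from storage, write the result back, mark computed
        ins = [storage_map[n][0] for n in in_names]
        storage_map[out_names[0]][0] = op(*ins)
        for n in out_names:
            compute_map[n][0] = True
    return rval

thunk = make_thunk(lambda v: v * 3, ['x'], ['y'])
thunk()
assert storage_map['y'][0] == 6.0 and compute_map['y'][0] is True
```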
def perform( self, node, args, outs):
"""
The args are packed like this:
......@@ -438,8 +490,12 @@ class Scan(Op):
for idx in xrange(len(other_args)):
input_storage[idx+offset].storage[0] = other_args[idx]
i = 0
cond = True
############## THE MAIN LOOP #########################
#for i in xrange(n_steps):
while (i < n_steps) and cond:
# sequences over which scan iterates
# 3. collect input slices
for idx in xrange(self.n_seqs):
......@@ -496,11 +552,18 @@ class Scan(Op):
offset += self.n_outs+self.n_nit_sot - self.n_mit_mot
for idx in xrange(self.n_shared_outs):
output_storage[idx+offset].storage[0] = None
# If condition add it to the mix
if self.as_while:
pdx = offset + self.n_shared_outs
output_storage[pdx].storage[0] = None
# 5. compute outputs
t0_fn = time.time()
fn()
dt_fn = time.time() - t0_fn
if self.as_while:
pdx = offset + self.n_shared_outs
cond = output_storage[pdx].storage[0] == 0
t_fn += dt_fn
offset_out = 0
# 5.1 Copy over the values for mit_mot outputs
......@@ -558,13 +621,14 @@ class Scan(Op):
itertools.izip(pos, store_steps)
]
i = i+1
# 6. Check if you need to re-order output buffers
begin = self.n_mit_mot
end = self.n_outs + self.n_nit_sot
for idx in xrange(begin, end):
min_tap = self.mintaps[idx]
if ( store_steps[idx] < i-self.mintaps[idx] and
pos[idx] < store_steps[idx] ):
pdx = pos[idx]
......@@ -594,8 +658,8 @@ class Scan(Op):
# backpropagation through time. In such a scenario Scan is
# expected to return 0 for all entries for which the gradient is
# not actually computed
elif store_steps[idx] > i - self.mintaps[idx]:
outs[idx][0][i-self.mintaps[idx]:] = 0
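The buffer reordering handled here can be pictured with NumPy (illustrative values, not Theano code): when scan stores only the last `store` steps of an output it writes into a circular buffer, `pos` ends up pointing at the oldest entry, and rolling by `-pos` restores chronological order.

```python
import numpy as np

# Circular buffer of size 4 after computing steps 3..6: steps 5 and 6
# wrapped around and overwrote the slots of steps 1 and 2.
buf = np.array([5, 6, 3, 4])
pos = 2                        # index of the oldest stored step
ordered = np.roll(buf, -pos)   # rotate so entries are chronological
assert ordered.tolist() == [3, 4, 5, 6]
```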
t_call = time.time() - t0_call
......@@ -603,15 +667,25 @@ class Scan(Op):
# and this little string helps us to find this spot:
# "PROFILE_CODE"
if hasattr(self.fn.maker, 'profile') and self.fn.maker.profile:
profile = self.fn.maker.profile
profile.callcount += 1
profile.nbsteps += n_steps
profile.call_time += t_call
profile.vm_call_time += t_fn
if hasattr(self.fn.fn, 'update_profile'):
self.fn.fn.update_profile(profile)
#/* Old ProfileMode
#if hasattr(self.fn.maker.mode,'fct_call_time'):
# self.fn.maker.mode.fct_call_time[self.fn] += t_fn
# self.fn.maker.mode.fct_call[self.fn] += n_steps
#self.fn.maker.mode.call_time += t_fn
#self.fn.maker.mode.fn_time += t_fn
# Old Profile Mode */
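The accumulation pattern above can be sketched with a minimal stand-in class (fields assumed, mirroring the profiling code — not the real `ScanProfileStats`): a single call can run many inner steps, so call counts and step counts are tracked separately.

```python
# Minimal stand-in for a scan profile record accumulated across calls.
class ScanProfileSketch(object):
    def __init__(self):
        self.callcount = 0
        self.nbsteps = 0
        self.call_time = 0.0

profile = ScanProfileSketch()
# two calls: one ran 10 inner steps, the other ran 3
for n_steps, t_call in [(10, 0.5), (3, 0.2)]:
    profile.callcount += 1
    profile.nbsteps += n_steps
    profile.call_time += t_call

assert (profile.callcount, profile.nbsteps) == (2, 13)
```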
self.t_call = t_call
self.t_fn = t_fn
### Infer Shape
def infer_shape(self, node, input_shapes):
......@@ -644,9 +718,12 @@ class Scan(Op):
out_equivalent = {}
for in_ns, out_ns in zip(inner_non_sequences, node.inputs[offset:]):
out_equivalent[in_ns] = out_ns
if self.as_while:
self_outs = self.outputs[:-1]
else:
self_outs = self.outputs
outs_shape = scan_utils.infer_shape(
outs = self.outputs,
outs = self_outs,
inputs = self.inputs,
input_shapes = inner_ins_shapes)
# Will be used to check if outs_shape can be expressed without using
......@@ -704,18 +781,10 @@ class Scan(Op):
# Note ! We don't want to use the actual same variable as the ones
# used by the original scan, rather create clones of them
rval = scan_utils.reconstruct_graph(self.inputs,
self.outputs,'_grad')
self_inputs = rval[0]
self_outputs = rval[1]
seqs = self_inputs[:self.n_seqs]
......@@ -738,6 +807,7 @@ class Scan(Op):
+ self.n_mit_sot
+ self.n_nit_sot
+ self.n_sit_sot )
# shared variables as well as the condition
old_scan_shared_outs = self_outputs[out_offset:]
arg_offset = ( 1
+ self.n_seqs
......@@ -992,6 +1062,8 @@ class Scan(Op):
info['n_sit_sot'] = 0
info['n_shared_outs'] = n_shared_outs + self.n_shared_outs
info['n_nit_sot'] = n_nit_sot
info['as_while'] = self.as_while
info['profile'] = self.profile
if self.name:
info['name'] = 'grad_of_' + self.name
else:
......@@ -1060,6 +1132,180 @@ class Scan(Op):
gradients += outputs[begin:end]
return gradients
def R_op(self, inputs, eval_points):
# Step 0. Don't work on the original tensor variables
rval = scan_utils.reconstruct_graph(self.inputs,
self.outputs,'_rop')
self_inputs = rval[0]
self_outputs = rval[1]
# Step 1. Compute the R_op of the inner function
inner_eval_points = [scan_utils.safe_new(x,'_evalpoint') for x in self_inputs]
if self.as_while:
rop_self_outputs = self_outputs[:-1]
else:
rop_self_outputs = self_outputs
rop_outs = tensor.Rop(rop_self_outputs, self_inputs, inner_eval_points)
if type(rop_outs) not in (list, tuple):
rop_outs = [rop_outs]
# Step 2. Figure out what corresponds to what in the scan
# When doing the R-op of scan, you end up with two of each type of
# input: for each sequence you also need its eval point, and the same
# goes for each mit_mot, mit_sot, sit_sot or other type of input.
# Interestingly enough, all these eval points behave the same way as
# the inputs to which they correspond.
# The only exceptions are the eval point for the number of sequences
# and the eval point for the number of nit_sot, which should just be
# ignored (?)
info = {}
info['n_seqs'] = self.n_seqs*2
info['n_mit_sot'] = self.n_mit_sot*2
info['n_sit_sot'] = self.n_sit_sot*2
info['n_mit_mot'] = self.n_mit_mot*2
info['n_nit_sot'] = self.n_nit_sot*2
info['n_shared_outs'] = self.n_shared_outs*2
info['gpu'] = False
info['as_while'] = self.as_while
info['profile'] = self.profile
info['truncate_gradient'] = self.truncate_gradient
if self.name:
info['name'] = 'rop_of_'+self.name
else:
info['name'] = None
info['mode'] = self.mode
info['inplace'] = False
info['mit_mot_out_slices'] = self.mit_mot_out_slices*2
new_tap_array = []
b = 0
e = self.n_mit_mot
new_tap_array += self.tap_array[b:e]*2
b = e
e += self.n_mit_sot
new_tap_array += self.tap_array[b:e]*2
b = e
e += self.n_sit_sot
new_tap_array += self.tap_array[b:e]*2
info['tap_array'] = new_tap_array
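Each tap slice is duplicated because every recurrent input gains a matching eval point. With plain Python lists, the `[b:e]*2` idiom repeats the slice rather than scaling its entries; a small sketch with made-up taps:

```python
# Made-up tap array: one mit_mot entry followed by one mit_sot entry.
tap_array = [[-2, -1], [-1]]
n_mit_mot, n_mit_sot = 1, 1

new_tap_array = []
b, e = 0, n_mit_mot
new_tap_array += tap_array[b:e] * 2   # mit_mot taps, then their eval-point copies
b, e = e, e + n_mit_sot
new_tap_array += tap_array[b:e] * 2   # mit_sot taps, likewise doubled
```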
# Sequences ...
b = 1
ib = 0
e = 1 + self.n_seqs
ie = self.n_seqs
scan_seqs = inputs[b:e] + eval_points[b:e]
inner_seqs = self_inputs[ib:ie] + inner_eval_points[ib:ie]
# MIT_MOT sequences ...
b = e
e = e + self.n_mit_mot
ib = ie
ie = ie + int(numpy.sum([len(x) for x in
self.tap_array[:self.n_mit_mot]]))
scan_mit_mot = inputs[b:e] + eval_points[b:e]
inner_mit_mot = self_inputs[ib:ie] + inner_eval_points[ib:ie]
# MIT_SOT sequences ...
b = e
e = e + self.n_mit_sot
ib = ie
ie = ie + int(numpy.sum([len(x) for x in
self.tap_array[self.n_mit_mot:self.n_mit_mot+self.n_mit_sot]]))
scan_mit_sot = inputs[b:e] + eval_points[b:e]
inner_mit_sot = self_inputs[ib:ie] + inner_eval_points[ib:ie]
#SIT_SOT sequences ...
b = e
e = e + self.n_sit_sot
ib = ie
ie = ie + self.n_sit_sot
scan_sit_sot = inputs[b:e] + eval_points[b:e]
inner_sit_sot = self_inputs[ib:ie] + inner_eval_points[ib:ie]
#Shared outs ...
b = e
e = e + self.n_shared_outs
ib = ie
ie = ie + self.n_shared_outs
scan_shared = inputs[b:e] + eval_points[b:e]
inner_shared = self_inputs[ib:ie] + inner_eval_points[ib:ie]
# NIT_SOT sequences
b = e
e = e + self.n_nit_sot
scan_nit_sot = inputs[b:e]*2
# All other arguments
scan_other = inputs[e:] + eval_points[e:]
inner_other = self_inputs[ie:] + inner_eval_points[ie:]
# Outputs
n_mit_mot_outs = int(numpy.sum([len(x) for x in
self.mit_mot_out_slices]))
info['n_mit_mot_outs'] = n_mit_mot_outs
b = 0
e = n_mit_mot_outs
inner_out_mit_mot = self_outputs[b:e] + rop_outs[b:e]
b = e
e = e + self.n_mit_sot
inner_out_mit_sot = self_outputs[b:e] + rop_outs[b:e]
b = e
e = e + self.n_sit_sot
inner_out_sit_sot = self_outputs[b:e] + rop_outs[b:e]
b = e
e = e + self.n_nit_sot
inner_out_nit_sot = self_outputs[b:e] + rop_outs[b:e]
b = e
e = e + self.n_shared_outs
inner_out_shared = self_outputs[b:e] + rop_outs[b:e]
inner_ins = ( inner_seqs +
inner_mit_mot +
inner_mit_sot +
inner_sit_sot +
inner_shared +
inner_other )
inner_outs = ( inner_out_mit_mot +
inner_out_mit_sot +
inner_out_sit_sot +
inner_out_nit_sot +
inner_out_shared)
if self.as_while:
inner_outs += [self_outputs[-1]]
scan_inputs = ( [inputs[0]] +
scan_seqs +
scan_mit_mot +
scan_mit_sot +
scan_sit_sot +
scan_shared +
scan_nit_sot +
scan_other)
local_op = Scan( inner_ins, inner_outs, info )
outputs = local_op(*scan_inputs)
if type(outputs) not in (list, tuple):
outputs = [ outputs ]
# Select only the result of the R_op results
final_outs = []
b = self.n_mit_mot
e = self.n_mit_mot*2
final_outs += outputs[b:e]
b = e + self.n_mit_sot
e = e + self.n_mit_sot*2
final_outs += outputs[b:e]
b = e + self.n_sit_sot
e = e + self.n_sit_sot*2
final_outs += outputs[b:e]
b = e + self.n_nit_sot
e = e + self.n_nit_sot*2
final_outs += outputs[b:e]
b = e + self.n_shared_outs
e = e + self.n_shared_outs*2
final_outs += outputs[b:e]
return final_outs
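The quantity the R-op propagates is a Jacobian-vector product. A self-contained numpy check of that identity for a toy elementwise function (`f` and `rop` are hypothetical stand-ins for an inner function and its R-op, not Theano code):

```python
import numpy

def f(x):
    # toy 'inner function': elementwise square
    return x * x

def rop(x, v):
    # analytic Jacobian-vector product of f at x in direction v:
    # the Jacobian is diag(2 * x), so J v = 2 * x * v
    return 2.0 * x * v

x = numpy.array([1.0, 2.0, 3.0])
v = numpy.array([0.5, -1.0, 2.0])
eps = 1e-6
# finite-difference approximation of the same directional derivative
fd = (f(x + eps * v) - f(x)) / eps
```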
@theano.compile.profilemode.register_profiler_printer
def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
......
......@@ -17,7 +17,7 @@ import numpy
import sys
import theano
from theano import tensor, scalar
from theano.tensor import opt, TensorType, get_constant_value
from theano import gof
from theano.compile import optdb
......@@ -26,7 +26,7 @@ from theano import config
import scan_op
import scan_utils
from scan_utils import clone, equal_computations, find_up, scan_args
from theano.gof.opt import pre_constant_merge, pre_greedy_local_optimizer
# Logging function for sending warning or info
......@@ -75,9 +75,10 @@ def remove_constants_and_unused_inputs_scan(node):
if (isinstance(node.inputs[idx+1], tensor.TensorConstant) and
node.inputs[idx+1].tag.unique_value is not None):
try:
# This works if input is a constant that has all entries
# equal
val = tensor.get_constant_value(node.inputs[idx+1])
givens[op_ins[idx]] = node.inputs[idx+1].clone()[0]
except TypeError:
pass
elif op_ins[idx] in all_ins:
......@@ -99,6 +100,7 @@ def remove_constants_and_unused_inputs_scan(node):
op_outs = scan_utils.clone(op_outs, replace = givens)
nw_info = op.info.copy()
nw_info['n_seqs'] = nw_n_seqs
nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
nw_outs = nwScan.make_node(*nw_outer).outputs
return nw_outs
......@@ -113,147 +115,156 @@ optdb.register( 'scanOp_remove_constants_and_unused_inputs'
, 'scan')
class PushOutNonSeqScan(gof.Optimizer):
def __init__(self):
gof.Optimizer.__init__(self)
def add_requirements(self,env):
env.extend(gof.toolbox.ReplaceValidate())
def apply(self, env):
nodelist = [x for x in env.toposort() if isinstance(x.op,
scan_op.Scan)]
for node in nodelist:
self.process_node(env, node)
def process_node(self, env, node):
# this flag tells if there was any change during the last iterations
changed = True
clean_inputs, clean_outputs = scan_utils.reconstruct_graph(
node.op.inputs, node.op.outputs)
local_env = gof.Env(clean_inputs, clean_outputs)
max_iterations = 2*len(local_env.toposort()) + 3
counts = 0
to_remove = []
to_replace = []
replace_with_in = []
replace_with_out = []
op = node.op
# Construct the list of non_sequences to simplify a few things
st = op.n_seqs
st += int(numpy.sum([len(x) for x in
op.tap_array[:(op.n_mit_mot+op.n_mit_sot)] ]))
st += op.n_sit_sot
st += op.n_shared_outs
non_seqs = clean_inputs[st:]
st = ( op.n_seqs +
op.n_mit_mot +
op.n_mit_sot +
op.n_sit_sot +
op.n_nit_sot +
op.n_shared_outs +1 )
outer_non_seqs = node.inputs[st:]
assert len(non_seqs) == len(outer_non_seqs)
while changed and counts < max_iterations:
counts += 1
changed = False
for nd in local_env.toposort():
if ( numpy.all([ (x in non_seqs) or
(x.owner in to_remove) or
isinstance(x, tensor.Constant)
for x in nd.inputs]) and
# we can do this because the assumption is that a
# viewOp or deepCopyOp will be just at the end of the
# function and not somewhere in the middle ..
not isinstance(nd.op,theano.compile.ViewOp) and
not isinstance(nd.op,theano.compile.DeepCopyOp) and
# and we haven't already looked at this node
not nd in to_remove
):
# We have a candidate node to remove
# Step 1. Reconstruct it on the outside
to_remove.append(nd)
outside_ins = []
for x in nd.inputs:
if x in non_seqs:
outside_ins +=[ outer_non_seqs[non_seqs.index(x)]]
elif x in to_replace:
outside_ins +=[replace_with_out[to_replace.index(x)]]
elif isinstance(x, theano.Constant):
outside_ins +=[x.clone()]
else:
raise Exception(
('Error in `scan_pushout_non_seq_operations`. '
'The optimization tried to move some '
'computation from scan which it is not allowed '
'to move. Report this on the theano-users list.'), x)
nw_outer_node = nd.op.make_node(*outside_ins)
# Step 2. Create variables for replacements
for idx,y in enumerate(nd.outputs):
y_place_holder = scan_utils.safe_new(y,'_replace')
to_replace += [y]
replace_with_in += [y_place_holder]
assert type(y) == type(nw_outer_node.outputs[idx])
replace_with_out += [nw_outer_node.outputs[idx]]
changed = True
if counts >= max_iterations:
raise Exception( ('Error in the `scan_pushout_non_seq_operations`.'
' The optimization exhausted the maximal number '
'of iterations allowed!'))
# We need to check all candidate replacements and choose those that
# make sense for us
# Step 1. which elements of `to_replace` are used by remaining
# components of the inner function
clean_to_replace = []
clean_replace_with_in = []
clean_replace_with_out = []
existent_nodes = [ nd for nd in local_env.toposort()
if nd not in to_remove]
to_keep = []
for nd in existent_nodes:
to_keep += nd.inputs
for idx,out in enumerate(to_replace):
if out in to_keep and out.owner not in existent_nodes:
clean_to_replace += [out]
clean_replace_with_in += [replace_with_in[idx]]
clean_replace_with_out += [replace_with_out[idx]]
if len(clean_to_replace) > 0:
# We can finally put an end to all this madness
givens = {}
nw_outer = []
nw_inner = []
for to_repl, repl_in, repl_out in zip( clean_to_replace,
clean_replace_with_in,
clean_replace_with_out):
if isinstance(repl_out, theano.Constant):
repl_in = repl_out.clone()
else:
nw_inner += [repl_in]
nw_outer += [repl_out]
givens[to_repl] = repl_in
_op_outs = scan_utils.clone(clean_outputs,
replace=givens)
_op_ins = clean_inputs + nw_inner
op_ins, op_outs = scan_utils.reconstruct_graph(_op_ins, _op_outs)
# Reconstruct node
nwScan = scan_op.Scan(op_ins, op_outs, op.info)
nw_node = nwScan.make_node(* (node.inputs + nw_outer))
env.replace_all_validate(zip(node.outputs, nw_node.outputs),
reason = 'scan_push_computation_out')
return True
else:
return False
optdb.register('scanOp_pushout_nonseqs_ops',
PushOutNonSeqScan(),
#opt.out2in( scan_pushout_non_seq_operation),
# ignore_newtrees=True),
1.899,
'fast_run',
'scan')
......@@ -757,6 +768,236 @@ optdb.register( 'scanOp_save_mem'
, 'scan')
class ScanMerge(gof.Optimizer):
""" Graph Optimizer that merges different scan ops """
def add_requirements(self,env):
env.extend(gof.toolbox.ReplaceValidate())
def merge(self, A,B, as_while):
Aargs = scan_args(A.inputs, A.outputs, A.op.inputs, A.op.outputs, A.op.info)
Bargs = scan_args(B.inputs, B.outputs, B.op.inputs, B.op.outputs, B.op.info)
Margs = Aargs.merge(Bargs)
# fixup name
info = Margs.info
info['name'] = A.op.name+'&'+B.op.name
#indicates that we have a stopping condition for scan
if as_while:
Margs_inner_outs = Margs.inner_outputs + Margs.cond
else:
Margs_inner_outs = Margs.inner_outputs
op = scan_op.Scan(Margs.inner_inputs, Margs_inner_outs, info)
outputs = op(*Margs.outer_inputs)
if type(outputs) not in (list, tuple):
outputs = [outputs]
return zip(Margs.outer_outputs, outputs)
def apply(self, env):
nodelist = list(env.toposort())
scan_nodes = filter(lambda s: isinstance(s.op, scan_op.Scan), nodelist)
nscan = dict()
for snode in scan_nodes:
n_steps = snode.inputs[0]
try:
n_steps = int(get_constant_value(n_steps))
except TypeError:
pass
l = nscan.get(n_steps)
if l is None:
nscan[n_steps] = [snode]
else:
l.append(snode)
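The loop above is a plain bucket-by-key pattern: scan nodes are grouped by their (possibly constant-folded) `n_steps`. Sketched standalone, with `(name, n_steps)` tuples standing in for scan nodes:

```python
# Bucket items by a key, mirroring how scan nodes are grouped by n_steps.
nodes = [('a', 10), ('b', 10), ('c', 5)]   # (name, n_steps) stand-ins
nscan = {}
for name, n_steps in nodes:
    l = nscan.get(n_steps)
    if l is None:
        nscan[n_steps] = [name]
    else:
        l.append(name)
```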
for snodes in nscan.values():
if len(snodes) > 1:
# amongst nodes that have the same number of steps
# try to find the ones that can be merged
curnode = snodes[0]
for snode in snodes[1:]:
if (snode.op.truncate_gradient == curnode.op.truncate_gradient and
snode.op.mode == curnode.op.mode and
not find_up(snode, curnode)):
if (not snode.op.as_while and
not curnode.op.as_while):
proposal = self.merge(curnode, snode, False)
env.replace_all_validate(proposal, reason='scan merge')
elif (snode.op.as_while and
curnode.op.as_while):
# check if equal computations
if scan_utils.equal_computations(
[snode.op.outputs[-1]],
[curnode.op.outputs[-1]],
snode.op.inputs,
curnode.op.inputs):
proposal = self.merge(curnode, snode, True)
env.replace_all_validate(proposal, reason =
'scan_merge')
else:
pass
else:
pass
# other merges will be done in other passes
break
# after const merge but before stabilize so that we can have identity
# for equivalent nodes but we still have the chance to hoist stuff out
# of the scan later.
optdb.register('scanOp_merge',
EquilibriumOptimizer([ScanMerge()],
max_use_ratio=11),
1.90,
'fast_run',
'scan')
def has_duplicates(l):
"""returns true if l has any duplicates (according to __eq__)."""
return len(set(l)) < len(l)
def make_equiv(lo, li):
"""builds a dictionary of equivalences between inner inputs based on the equivalence of their corresponding outer inputs."""
seeno = {}
left = []
right = []
for o, i in zip(lo, li):
if o in seeno:
left += [i]
right += [o]
else:
seeno[o] = i
return left, right
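These two helpers are pure Python, so their behavior can be shown directly; the example below restates them with hashable string stand-ins for the outer/inner variables:

```python
def has_duplicates(l):
    """True if l contains any duplicates (according to __eq__/__hash__)."""
    return len(set(l)) < len(l)

def make_equiv(lo, li):
    """Pair up inner inputs whose corresponding outer inputs coincide.

    For every repeated outer input o, `left` collects the duplicate inner
    input and `right` collects the outer input it duplicates.
    """
    seeno = {}
    left, right = [], []
    for o, i in zip(lo, li):
        if o in seeno:
            left += [i]
            right += [o]
        else:
            seeno[o] = i
    return left, right

outer = ['a', 'b', 'a']        # outer inputs: 'a' appears twice
inner = ['ia0', 'ib', 'ia1']   # corresponding inner inputs
left, right = make_equiv(outer, inner)
```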
@gof.local_optimizer([None])
def scan_merge_inouts(node):
if not isinstance(node.op, scan_op.Scan):
return False
a = scan_args(node.inputs, node.outputs,
node.op.inputs, node.op.outputs, node.op.info)
inp_equiv = {}
if has_duplicates(a.outer_in_seqs):
new_outer_seqs = []
new_inner_seqs = []
for out_seq, in_seq in zip(a.outer_in_seqs, a.inner_in_seqs):
if out_seq in new_outer_seqs:
i = new_outer_seqs.index(out_seq)
inp_equiv[in_seq] = new_inner_seqs[i]
else:
new_outer_seqs.append(out_seq)
new_inner_seqs.append(in_seq)
a.outer_in_seqs = new_outer_seqs
a.inner_in_seqs = new_inner_seqs
if has_duplicates(a.outer_in_non_seqs):
new_outer_nseqs = []
new_inner_nseqs = []
for out_nseq, in_nseq in zip(a.outer_in_non_seqs, a.inner_in_non_seqs):
if out_nseq in new_outer_nseqs:
i = new_outer_nseqs.index(out_nseq)
inp_equiv[in_nseq] = new_inner_nseqs[i]
else:
new_outer_nseqs.append(out_nseq)
new_inner_nseqs.append(in_nseq)
a.outer_in_non_seqs = new_outer_nseqs
a.inner_in_non_seqs = new_inner_nseqs
if len(inp_equiv) > 0:
# do the replacement now. The rest will be left to ScanSaveMem
inner_inputs = a.inner_inputs
outer_inputs = a.outer_inputs
info = a.info
if info['as_while']:
a_inner_outs = a.inner_outputs + a.cond
else:
a_inner_outs = a.inner_outputs
inner_outputs = scan_utils.clone(a_inner_outs, replace=inp_equiv)
orig_outputs = a.outer_outputs
op = scan_op.Scan(inner_inputs, inner_outputs, info)
outputs = op(*outer_inputs)
if not isinstance(outputs, (list, tuple)):
outputs = [outputs]
na = scan_args(outer_inputs, outputs, op.inputs, op.outputs, op.info)
else:
na = a
# start again
left = []
right = []
if has_duplicates(na.outer_in_shared):
_left, _right = make_equiv(na.outer_in_shared, na.inner_in_shared)
left += _left
right += _right
if has_duplicates(na.outer_in_sit_sot):
_left, _right = make_equiv(na.outer_in_sit_sot, na.inner_in_sit_sot)
left += _left
right += _right
if has_duplicates(na.outer_in_mit_mot):
seen = {}
for omm, imm, _sl in zip(na.outer_in_mit_mot, na.inner_in_mit_mot, na.mit_mot_in_slices):
sl = tuple(_sl)
if (omm, sl) in seen:
simm = seen[(omm, sl)]
left += imm
right += simm
else:
seen[(omm, sl)] = imm
if has_duplicates(na.outer_in_mit_sot):
seen = {}
for oms, ims, _sl in zip(na.outer_in_mit_sot, na.inner_in_mit_sot, na.mit_sot_in_slices):
sl = tuple(_sl)
if (oms, sl) in seen:
sims = seen[(oms, sl)]
left += ims
right += sims
else:
seen[(oms, sl)] = ims
def map_out(i, o, seen):
for si, so in seen:
if equal_computations([i], [si],left, right):
return so
seen.append((i, o))
return o
seen = []
na.outer_out_nit_sot = [map_out(i, o, seen) for i, o in zip(na.inner_out_nit_sot, na.outer_out_nit_sot)]
seen = []
na.outer_out_sit_sot = [map_out(i, o, seen) for i, o in zip(na.inner_out_sit_sot, na.outer_out_sit_sot)]
seen = []
na.outer_out_mit_sot = [map_out(i, o, seen) for i, o in zip(na.inner_out_mit_sot, na.outer_out_mit_sot)]
seen = []
new_outer_out_mit_mot = []
for imm, omm, osl in zip(na.inner_out_mit_mot, na.outer_out_mit_mot, na.mit_mot_out_slices):
for simm, somm, sosl in seen:
if osl == sosl and equal_computations(imm, simm, left, right):
new_outer_out_mit_mot.append(somm)
break
else:
seen.append((imm, omm, osl))
new_outer_out_mit_mot.append(omm)
na.outer_out_mit_mot = new_outer_out_mit_mot
return na.outer_outputs
optdb.register('scanOp_merge_inouts'
, opt.in2out(scan_merge_inouts,ignore_newtrees=True)
, 1.91
, 'fast_run'
, 'scan')
from theano.sandbox import cuda
if cuda.cuda_available:
......
......@@ -12,13 +12,14 @@ __authors__ = ( "Razvan Pascanu "
__copyright__ = "(c) 2010, Universite de Montreal"
__contact__ = "Razvan Pascanu <r.pascanu@gmail>"
import copy
import logging
import numpy
from theano import config
from theano.compile.pfunc import rebuild_collect_shared
from theano import gof
from theano import tensor, scalar
from theano.tensor.basic import get_constant_value
from theano.sandbox import cuda
......@@ -42,8 +43,16 @@ def safe_new(x, tag = ''):
nw_name = x.name + tag
else:
nw_name = None
# Should it be theano.Constant? What is the difference between the two?
if isinstance(x, tensor.Constant):
return x.clone()
# Note, as_tensor_variable will convert the Scalar into a
# TensorScalar that will require a ScalarFromTensor op,
# making the pushout optimization fail
elif isinstance(x, scalar.ScalarVariable):
nw_x = x.type()
nw_x.name = nw_name
return nw_x
else:
try:
x = tensor.as_tensor_variable(x)
......@@ -69,25 +78,9 @@ class until(object):
order, but since this was not imposed up to now, it could make quite a bit
of code fail).
"""
def __init__(self, condition):
self.condition = tensor.as_tensor_variable(condition)
assert self.condition.ndim == 0
def traverse(out, x,x_copy, d):
......@@ -162,136 +155,92 @@ def clone( output
def get_updates_and_outputs(ls):
"""
This function tries to recognize the updates dictionary, the
list of outputs and the stopping condition returned by the
lambda expression and arrange them in a predefined order
The code that follows tries to be as flexible as possible allowing the
user to return the outputs and updates in any order, and to give the
updates however (s)he wants (as a dictionary or as a list of pairs ..)
Is there a way to compress all this by writing it in a more
pythonic/functional way?
"""
def is_outputs(elem):
if (isinstance(elem, (list,tuple)) and
all([isinstance(x, theano.Variable) for x in elem])):
return True
if isinstance(elem, theano.Variable):
return True
return False
def is_updates(elem):
if isinstance(elem, dict):
return True
# Dictionaries can be given as lists of tuples
if (isinstance(elem, (list, tuple)) and
all([isinstance(x, (list,tuple)) and len(x) ==2
for x in elem])):
return True
return False
def is_condition(elem):
return isinstance(elem, theano.scan_module.until)
def _list(x):
if isinstance(x, (list, tuple)):
return list(x)
else:
return [x]
if is_outputs(ls):
return None, _list(ls), {}
if is_updates(ls):
return None, [], dict(ls)
if not isinstance(ls, (list, tuple)):
raise ValueError(('Scan cannot parse the return value'
' of your lambda expression'))
ls = list(ls)
deprication_msg = ('The return value of the lambda function'
' has been restricted. You always have to return first the'
' outputs (if any), then the updates (if any) and'
' at the end the condition')
error_msg = 'Scan cannot parse the return value of your lambda expression'
if len(ls) == 2:
if is_outputs(ls[0]):
if is_updates(ls[1]):
return (None, _list(ls[0]), dict(ls[1]))
elif is_condition(ls[1]):
return ( ls[1].condition, _list(ls[0]), {})
else:
raise ValueError(error_msg)
elif is_updates(ls[0]):
if is_outputs(ls[1]):
_logger.warning(deprication_msg)
return ( None, _list(ls[1]), dict(ls[0]) )
elif is_condition(ls[1]):
return (ls[1].condition, [], dict(ls[0]))
else:
raise ValueError(error_msg)
else:
raise ValueError(error_msg)
elif len(ls) == 3:
if is_outputs(ls[0]):
if is_updates(ls[1]):
if is_condition(ls[2]):
return (ls[2].condition, _list(ls[0]), dict(ls[1]))
else:
raise ValueError(error_msg)
else:
raise ValueError(error_msg)
elif is_updates(ls[0]):
if is_outputs(ls[1]):
if is_condition(ls[2]):
_logger.warning(deprication_msg)
return (ls[2].condition, _list(ls[1]), dict(ls[0]))
else:
raise ValueError(error_msg)
else:
raise ValueError(error_msg)
else:
raise ValueError(error_msg)
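The dispatch in `get_updates_and_outputs` can be modeled without Theano objects. The sketch below is a stripped-down stand-in (strings for variables, dicts for updates, a hypothetical `Until` class for the condition) that mirrors the `(condition, outputs, updates)` ordering enforced above:

```python
class Until(object):
    """Stand-in for theano.scan_module.until: wraps a stopping condition."""
    def __init__(self, condition):
        self.condition = condition

def is_outputs(elem):
    # a variable, or a list/tuple of variables (strings stand in here)
    return isinstance(elem, str) or (
        isinstance(elem, (list, tuple)) and
        all(isinstance(x, str) for x in elem))

def is_updates(elem):
    # a dict, or a list/tuple of (old, new) pairs
    return isinstance(elem, dict) or (
        isinstance(elem, (list, tuple)) and
        all(isinstance(x, (list, tuple)) and len(x) == 2 for x in elem))

def parse(ls):
    # returns (condition, outputs, updates), mirroring the ordering above
    if is_outputs(ls):
        return None, list(ls) if isinstance(ls, (list, tuple)) else [ls], {}
    if is_updates(ls):
        return None, [], dict(ls)
    if len(ls) == 2 and is_outputs(ls[0]) and is_updates(ls[1]):
        return None, list(ls[0]), dict(ls[1])
    if len(ls) == 2 and is_outputs(ls[0]) and isinstance(ls[1], Until):
        return ls[1].condition, list(ls[0]), {}
    raise ValueError('cannot parse return value')
```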
def isNaN_or_Inf_or_None(x):
......@@ -339,9 +288,6 @@ def equal_computations(xs,ys, in_xs = None, in_ys = None, strict=True):
equivalence of inputs defined by map). Inputs are always assumed
equal if strict is set to False.
'''
if in_xs is None:
in_xs = []
if in_ys is None:
......@@ -356,6 +302,11 @@ def equal_computations(xs,ys, in_xs = None, in_ys = None, strict=True):
if x.owner and y.owner:
if x.owner.outputs.index(x) != y.owner.outputs.index(y):
return False
if len(in_xs) != len(in_ys):
return False
for _x,_y in zip(in_xs, in_ys):
if _x.type != _y.type:
return False
nds_x = gof.graph.io_toposort(in_xs, xs)
nds_y = gof.graph.io_toposort(in_ys, ys)
......@@ -371,14 +322,15 @@ def equal_computations(xs,ys, in_xs = None, in_ys = None, strict=True):
return False
elif (isinstance(dx, tensor.Constant) and
isinstance(dy, tensor.Constant) and
numpy.all(dx.data == dy.data)):
pass
elif strict:
if dx != dy:
return False
if not strict:
if dx.type != dy.type:
return False
else:
if (dx,dy) not in common:
return False
while cont and idx < n_nodes:
nd_x = nds_x[idx]
......@@ -395,7 +347,7 @@ def equal_computations(xs,ys, in_xs = None, in_ys = None, strict=True):
if strict and dx!= dy:
if (isinstance(dx, tensor.Constant) and
isinstance(dy, tensor.Constant) and
numpy.all(dx.data == dy.data)):
pass
else:
cont = False
......@@ -597,6 +549,8 @@ def compress_outs(op, not_required, inputs):
info['inplace'] = op.info['inplace']
info['gpu'] = op.info['gpu']
info['mode'] = op.info['mode']
info['as_while'] = op.info['as_while']
info['profile'] = op.info['profile']
op_inputs = op.inputs[:op.n_seqs]
op_outputs = []
......@@ -705,6 +659,10 @@ def compress_outs(op, not_required, inputs):
# other stuff
op_inputs += op.inputs[i_offset:]
node_inputs += inputs[ni_offset+op.n_shared_outs+op.n_nit_sot:]
if op.as_while:
op_outputs += [op.outputs[o_offset]]
map_old_new[o_offset] = len(op_outputs)-1
#map_old_new[len(op_outputs)-1] = o_offset
return (op_inputs, op_outputs, info, node_inputs, map_old_new)
......@@ -716,16 +674,10 @@ def find_up(l_node, f_node):
l_outs = l_node.outputs
else:
l_outs = l_node
l_ins = gof.graph.inputs(l_outs)
nodes = gof.graph.io_toposort(l_ins, l_outs)
return f_node in nodes
def reconstruct_graph(inputs, outputs, tag = None):
"""
Different interface to clone, that allows you to pass inputs.
@@ -748,11 +700,11 @@ class scan_args(object):
_inner_inputs, _inner_outputs, info):
self.n_steps = outer_inputs[0]
rval = reconstruct_graph(_inner_inputs, _inner_outputs, '_merge')
#if info['as_while']:
# self.cond = [rval[1][-1]]
# inner_outputs = rval[1][:-1]
#else:
inner_outputs = rval[1]
if info['as_while']:
self.cond = [rval[1][-1]]
inner_outputs = rval[1][:-1]
else:
inner_outputs = rval[1]
inner_inputs = rval[0]
p = 1
@@ -852,12 +804,12 @@ class scan_args(object):
self.other_info = dict()
for k in ('truncate_gradient', 'name', 'mode', 'inplace',
'gpu', 'profile'):
'gpu','as_while', 'profile'):
self.other_info[k] = info[k]
inner_inputs = property(lambda self: (self.inner_in_seqs +
flatten(self.inner_in_mit_mot) +
flatten(self.inner_in_mit_sot) +
sum(self.inner_in_mit_mot, []) +
sum(self.inner_in_mit_sot, []) +
self.inner_in_sit_sot +
self.inner_in_shared +
self.inner_in_non_seqs))
@@ -871,7 +823,7 @@ class scan_args(object):
self.outer_in_nit_sot +
self.outer_in_non_seqs))
inner_outputs = property(lambda self: (flatten(self.inner_out_mit_mot) +
inner_outputs = property(lambda self: (sum(self.inner_out_mit_mot, []) +
self.inner_out_mit_sot +
self.inner_out_sit_sot +
self.inner_out_nit_sot +
......
@@ -102,31 +102,18 @@ def reduce( fn
:param name: See ``scan``.
"""
# Makes sure the outputs_info is a list.
if not isinstance(outputs_info, (list,tuple)):
outs_info = [outputs_info]
else:
outs_info = list(outputs_info)
for i,out_info in enumerate(outs_info):
if out_info:
if not isinstance(out_info, dict):
# Specifies that it should return only the last step.
outs_info[i] = dict(
initial = out_info, return_steps = 1)
else:
# Specifies that it should return only the last step.
outs_info[i]['return_steps'] = 1
# NOTE : If the user asks for more than the last step,
# it means they do not understand ``reduce``. We could
# issue a warning in that case
return scan.scan( fn = fn
rval = scan.scan( fn = fn
, sequences = sequences
, outputs_info = outs_info
, outputs_info = outputs_info
, non_sequences = non_sequences
, go_backwards = go_backwards
, truncate_gradient = -1
, mode = mode
, name = name )
if isinstance(rval[0], (list,tuple)):
return [ x[-1] for x in rval[0]], rval[1]
else:
return rval[0][-1], rval[1]
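The rewritten `reduce` above runs the full scan and then keeps only the last step of each output. In plain Python the relationship between the two is roughly the following (the names here are illustrative, not Theano's API):

```python
def scan(fn, sequence, initial):
    # Accumulate every intermediate state, like theano.scan does.
    states = []
    acc = initial
    for x in sequence:
        acc = fn(acc, x)
        states.append(acc)
    return states

def reduce_(fn, sequence, initial):
    # reduce is a scan that keeps only the final state.
    return scan(fn, sequence, initial)[-1]

print(scan(lambda acc, x: acc + x, [1, 2, 3], 0))     # [1, 3, 6]
print(reduce_(lambda acc, x: acc + x, [1, 2, 3], 0))  # 6
```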
# The ``foldl`` view of Scan Op.
......
@@ -9,26 +9,11 @@ from theano import tensor
from theano.tests import unittest_tools as utt
from theano.compile.pfunc import rebuild_collect_shared
import theano.tensor as TT
'''
Questions and notes about scan that should be answered :
* Even though it does not make it publicly known in
the documentation, scan allows you to set both a return_steps
flag and a store_steps flag ( the first one is a soft condition telling
you how many steps to return, the second one determines how much memory
to allocate). There is an optimization as well, that transforms
return_steps to
store_steps. Questions :
- what happens if both flags are set ?
answer: whatever return_steps says is ignored, and store_steps is used
- the optimization works only with return_steps = -1; can it be made
to work with other values ?
answer: 6 Jul 2010 RP: it is a bit hairy to figure out from the
subtensors what exactly you need
* Scan seems to do copies of every input variable. Is that needed?
answer : probably not, but it doesn't hurt either (what we copy is
theano variables, which just carry information about the type / dimension
@@ -313,9 +298,6 @@ class T_Scan(unittest.TestCase):
scan_node = [node for node in topo if isinstance(node.op, theano.scan_module.scan_op.Scan)]
assert len(scan_node) == 1
scan_node = scan_node[0]
#theano.printing.pydotprint(f2, outfile='out1.png', high_contrast=True)
#theano.printing.pydotprint(scan_node.op.fn,
# outfile='inner1.png', high_contrast=True)
topo = f2.maker.env.toposort()
assert sum([isinstance(node.op, theano.sandbox.cuda.HostFromGpu) for node in topo]) == 0
@@ -1960,7 +1942,6 @@ class T_Scan(unittest.TestCase):
self.assertTrue(nb_shape_i == 1)
def test_bug_josh_reported(self):
import theano
import theano.tensor.signal.conv
m1 = theano.tensor.matrix()
m2 = theano.tensor.matrix()
@@ -2090,8 +2071,8 @@ class T_Scan(unittest.TestCase):
mode = theano.compile.mode.FAST_RUN
mode = mode.excluding('inplace')
f1 = theano.function([],o, mode= mode)
inputs, outputs = clone_optimized_graph(f1)
f0 = theano.function([],o, mode= mode)
inputs, outputs = clone_optimized_graph(f0)
scan_nodes = grab_scan_node(outputs[0])
assert scan_nodes is not None
@@ -2173,19 +2154,23 @@ class T_Scan(unittest.TestCase):
n2o_u,_ = theano.scan( lambda i, o,u,h0,W,eu:
(theano.tensor.grad(o[i], u)*eu).sum(),
sequences = tensor.arange(o.shape[0]),
non_sequences = [o,u,h0,W,eu])
non_sequences = [o,u,h0,W,eu],
name = 'jacobU'
)
n2o_h0,_ = theano.scan( lambda i, o,u,h0,W,eh0:
(theano.tensor.grad(o[i], h0)*eh0).sum(),
sequences = tensor.arange(o.shape[0]),
non_sequences = [o,u,h0,W,eh0])
non_sequences = [o,u,h0,W,eh0],
name = 'jacobh')
n2o_W,_ = theano.scan( lambda i, o,u,h0,W,eW:
(theano.tensor.grad(o[i], W)*eW).sum(),
sequences = tensor.arange(o.shape[0]),
non_sequences = [o,u,h0,W,eW])
non_sequences = [o,u,h0,W,eW],
name = 'jacobW')
fn_test = theano.function([u,h0,W,eu,eh0,eW],
@@ -2201,12 +2186,12 @@ class T_Scan(unittest.TestCase):
def test_pushout(self):
W1 = TT.matrix('W1')
W2 = TT.matrix('W2')
h0 = TT.vector('h0')
W1 = tensor.matrix('W1')
W2 = tensor.matrix('W2')
h0 = tensor.vector('h0')
def lambda_fn(h, W1, W2):
return TT.dot(h, W1 + W2)
return tensor.dot(h, W1 + W2)
o, _ = theano.scan(lambda_fn, outputs_info= h0,
non_sequences =[W1,W2],
@@ -2223,14 +2208,14 @@ class T_Scan(unittest.TestCase):
def test_alloc_inputs1(self):
W1 = TT.matrix('W1')
W2 = TT.matrix('W2')
h0 = TT.vector('h0')
W1 = tensor.matrix('W1')
W2 = tensor.matrix('W2')
h0 = tensor.vector('h0')
def lambda_fn(h, W1, W2):
return TT.dot(h, W1 * W2)
return tensor.dot(h, W1 * W2)
o, _ = theano.scan(lambda_fn, outputs_info= h0,
non_sequences =[W1,TT.zeros_like(W2)],
non_sequences =[W1,tensor.zeros_like(W2)],
n_steps = 5)
f = theano.function([h0,W1,W2], o)
@@ -2242,17 +2227,17 @@ class T_Scan(unittest.TestCase):
def test_alloc_inputs2(self):
W1 = TT.matrix()
W2 = TT.matrix()
h0 = TT.vector()
W1 = tensor.matrix()
W2 = tensor.matrix()
h0 = tensor.vector()
def lambda_fn(W1,h, W2):
return W1 * TT.dot(h, W2)
return W1 * tensor.dot(h, W2)
o, _ = theano.scan(lambda_fn,
sequences = TT.zeros_like(W1),
sequences = tensor.zeros_like(W1),
outputs_info= h0,
non_sequences =[TT.zeros_like(W2)],
non_sequences =[tensor.zeros_like(W2)],
n_steps = 5)
f = theano.function([h0,W1,W2], o)
@@ -2266,21 +2251,21 @@ class T_Scan(unittest.TestCase):
def test_alloc_inputs3(self):
_W1 = TT.matrix()
_W2 = TT.matrix()
_h0 = TT.vector()
_W1 = tensor.matrix()
_W2 = tensor.matrix()
_h0 = tensor.vector()
W1 = TT.specify_shape(_W1, (3,3))
W2 = TT.specify_shape(_W2, (3,3))
h0 = TT.specify_shape(_h0, (3,))
W1 = tensor.specify_shape(_W1, (3,3))
W2 = tensor.specify_shape(_W2, (3,3))
h0 = tensor.specify_shape(_h0, (3,))
def lambda_fn(W1,h, W2):
return W1 * TT.dot(h, W2)
return W1 * tensor.dot(h, W2)
o, _ = theano.scan(lambda_fn,
sequences = TT.zeros_like(W1),
sequences = tensor.zeros_like(W1),
outputs_info= h0,
non_sequences =[TT.zeros_like(W2)],
non_sequences =[tensor.zeros_like(W2)],
n_steps = 5)
f = theano.function([_h0,_W1,_W2], o)
@@ -2292,9 +2277,9 @@ class T_Scan(unittest.TestCase):
def test_while0(self):
x = TT.vector('x')
x = tensor.vector('x')
def lambda_fn(x_t):
return x_t+1, theano.until( x_t > 3)
return x_t+1, theano.scan_module.until( x_t > 3)
o, _ = theano.scan(lambda_fn, x)
f = theano.function([x], o)
vx = numpy.zeros((50,))
@@ -2303,9 +2288,9 @@ class T_Scan(unittest.TestCase):
assert numpy.sum(out[24:]) == 0
def test_while1(self):
x = TT.vector('x')
x = tensor.vector('x')
def lambda_fn(x_t):
return x_t+1, theano.until( x_t > 3)
return x_t+1, theano.scan_module.until( x_t > 3)
o, _ = theano.scan(lambda_fn, x)
o2, _ = theano.scan(lambda x_t:x_t + 2,
x)
@@ -2322,11 +2307,11 @@ class T_Scan(unittest.TestCase):
def test_while2(self):
x = TT.vector('x')
x = tensor.vector('x')
def lambda_fn(x_t):
return x_t+1, theano.until( x_t > 3)
return x_t+1, theano.scan_module.until( x_t > 3)
o, _ = theano.scan(lambda_fn, x)
o2, _ = theano.scan(lambda x_t:( x_t + 2, theano.until(x_t>3)),
o2, _ = theano.scan(lambda x_t:( x_t + 2, theano.scan_module.until(x_t>3)),
x)
f = theano.function([x], [o,o2])
@@ -2339,6 +2324,68 @@ class T_Scan(unittest.TestCase):
if isinstance(x.op, theano.scan_module.scan_op.Scan)]
assert len(lssc) == 1
def test_return_steps(self):
rng = numpy.random.RandomState(utt.fetch_seed())
vW_in2 = asarrayX(rng.uniform(size = (2,), low = -5.,high = 5.))
vW = asarrayX(rng.uniform(size = (2,2), low = -5.,high = 5.))
vWout = asarrayX(rng.uniform(size = (2,), low = -5.,high = 5.))
vW_in1 = asarrayX(rng.uniform(size = (2,2), low = -5.,high = 5.))
v_u1 = asarrayX(rng.uniform(size = (8,2), low = -5., high = 5.))
v_u2 = asarrayX(rng.uniform(size = (8,), low = -5.,high = 5.))
v_x0 = asarrayX(rng.uniform(size = (2,), low = -5.,high = 5.))
v_y0 = asarrayX(rng.uniform(size = (3,)))
W_in2 = theano.shared(vW_in2, name='win2')
W = theano.shared(vW, name='w')
W_out = theano.shared(vWout, name = 'wout')
W_in1 = theano.tensor.matrix('win')
u1 = theano.tensor.matrix('u1')
u2 = theano.tensor.vector('u2')
x0 = theano.tensor.vector('x0')
y0 = theano.tensor.vector('y0')
def f_rnn_cmpl(u1_t, u2_t, x_tm1, y_tm1, y_tm3, W_in1):
return [y_tm3+1, theano.dot(u1_t,W_in1) + u2_t * W_in2 + \
theano.dot(x_tm1, W),
y_tm1 + theano.dot(x_tm1, W_out)]
outputs, updates = theano.scan( f_rnn_cmpl
, [ u1
, u2]
, [ dict(store_steps = 3)
, dict(initial = x0, return_steps = 2)
, dict(initial=y0, taps=[-1,-3],
return_steps = 4)]
, W_in1
, n_steps = None
, truncate_gradient = -1
, go_backwards = False)
f4 = theano.function([u1,u2,x0,y0,W_in1], outputs
, updates = updates
, allow_input_downcast = True
)
# compute the values in numpy
v_x = numpy.zeros((8,2),dtype=theano.config.floatX)
v_y = numpy.zeros((8,),dtype=theano.config.floatX)
v_x[0] = numpy.dot(v_u1[0],vW_in1) + v_u2[0]*vW_in2 + \
numpy.dot(v_x0,vW)
v_y[0] = numpy.dot(v_x0,vWout) + v_y0[2]
for i in xrange(1,8):
v_x[i] = numpy.dot(v_u1[i],vW_in1) + v_u2[i]*vW_in2 + \
numpy.dot(v_x[i-1],vW)
v_y[i] = numpy.dot(v_x[i-1], vWout) + v_y[i-1]
(theano_dump, theano_x,theano_y) = f4( v_u1, v_u2, v_x0, v_y0, vW_in1)
assert numpy.allclose(theano_x , v_x[-2:])
assert numpy.allclose(theano_y , v_y[-4:])
def test_speed():
#
......
@@ -2838,7 +2838,7 @@ def extract_constant(x):
if x.owner and isinstance(x.owner.op, ScalarFromTensor):
x = x.owner.inputs[0]
else:
x = tensor.tensor_from_scalar(x)
x = tensor_from_scalar(x)
return x
......
@@ -1245,32 +1245,59 @@ def local_useless_subtensor(node):
shape_of = node.env.shape_feature.shape_of
node_input_idx = 1
for pos, idx in enumerate(node.op.idx_list):
if not isinstance(idx, slice):
# If idx is not a slice, this means we remove this dimension
# from the output, so the subtensor is not useless
return False
if idx.start not in [0,None]:
# If the start of the slice is different from 0, or is a
# variable, then we assume the subtensor is not useless
return False
if idx.step not in [1, None]:
# If we are going backwards, or skipping elements, then this
# is not a useless subtensor
return False
length_pos_data = sys.maxint
length_pos_shape_i = None
try:
length_pos = shape_of[node.inputs[0]][pos]
if isinstance(length_pos, theano.tensor.basic.TensorConstant):
length_pos_data = length_pos.data
else:
length_pos_shape_i = node.inputs[node_input_idx].owner.inputs[0]
try:
length_pos_data = get_constant_value(length_pos)
except TypeError:
pass
if isinstance(idx.stop, theano.scalar.Scalar):
if isinstance(node.inputs[node_input_idx].owner.op,
T.ScalarFromTensor):
length_pos_shape_i = node.inputs[node_input_idx].owner.inputs[0]
else:
length_pos_shape_i = node.inputs[node_input_idx]
assert length_pos_shape_i.type == idx.stop
# We already know that start and step are not variables
# and so they don't appear in the input of the node
node_input_idx += 1
# Catch exception from shape_of
except Exception, e:
length_pos = None
if ( isinstance(idx,slice) and
idx.start in [0,None] and
idx.step in [1,None] and
(idx.stop in [sys.maxint, None, length_pos_data] or
(isinstance(idx.stop, int) and idx.stop>=length_pos_data) or
(isinstance(idx.stop, theano.scalar.Scalar) and
length_pos==length_pos_shape_i)
)):
if isinstance(idx.stop, int):
if idx.stop < length_pos_data:
return False
elif isinstance(idx.stop, theano.scalar.Scalar):
if length_pos_shape_i is None:
return False
if length_pos is None:
return False
if length_pos_shape_i != length_pos:
return False
elif idx.stop is None:
pass
else:
return False
if isinstance(idx, slice):
node_input_idx += sum([isinstance(idx.start, theano.scalar.Scalar),
isinstance(idx.stop, theano.scalar.Scalar),
isinstance(idx.step, theano.scalar.Scalar)])
return [node.inputs[0]]
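The optimization above decides whether a `Subtensor` leaves its input unchanged. Stripped of Theano's symbolic-shape handling, the core per-dimension test looks roughly like this sketch, which covers only the case where the dimension's length is a known constant:

```python
def is_noop_slice(idx, dim_length):
    """True when indexing with `idx` leaves a dimension of size
    `dim_length` unchanged: start at 0, step by 1, stop at or
    past the end of the dimension."""
    if not isinstance(idx, slice):
        return False          # an integer index drops the dimension
    if idx.start not in (0, None):
        return False          # slice does not start at the beginning
    if idx.step not in (1, None):
        return False          # reversed or strided slice
    return idx.stop is None or (
        isinstance(idx.stop, int) and idx.stop >= dim_length)

assert is_noop_slice(slice(None), 5)         # x[:]
assert is_noop_slice(slice(0, 7, 1), 5)      # stop past the end
assert not is_noop_slice(slice(1, None), 5)  # drops the first element
assert not is_noop_slice(2, 5)               # integer index
```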
......
@@ -136,6 +136,7 @@ def safe_make_node(op, *inputs):
return node[0].owner
else:
return node.owner
def makeTester(name, op, expected, checks = {}, good = {}, bad_build = {},
bad_runtime = {}, grad = {}, mode = None, grad_rtol=None,
eps = 1e-10, skip = False):
@@ -146,7 +147,7 @@ def makeTester(name, op, expected, checks = {}, good = {}, bad_build = {},
class Checker(unittest.TestCase):
op = _op
op = staticmethod(_op)
expected = staticmethod(_expected)
checks = _checks
good = _good
@@ -999,6 +1000,52 @@ SecondSameRankTester = makeTester(
mode=get_default_mode().excluding('local_fill_to_alloc')
)
### Alloc
AllocTester = makeBroadcastTester(
name = 'AllocTester',
op = alloc,
expected = (lambda x, *shp: numpy.zeros(shp, dtype=x.dtype) + x),
good = dict(
correct02 = (rand(), numpy.int32(4), numpy.int32(7)),
correct12 = (rand(7), numpy.int32(4), numpy.int32(7)),
correct13 = (rand(7), numpy.int32(2), numpy.int32(4), numpy.int32(7)),
correct23 = (rand(4,7), numpy.int32(2), numpy.int32(4), numpy.int32(7)),
),
bad_runtime = dict(
bad_shape12 = (rand(7), numpy.int32(7), numpy.int32(5)),
too_big32 = (rand(6,2,4), numpy.int32(6), numpy.int32(2)),
too_big32b = (rand(6,2,4), numpy.int32(2), numpy.int32(4)),
),
)
# Since not all inputs of Alloc are differentiable, we need different testers
s1, s2, s3 = randint_ranged(1, 13, (3,))
# alloc a scalar into a vector
Alloc01GradTester = makeBroadcastTester(
name = 'Alloc01GradTester',
#op = (lambda self, x: alloc(x, s1)),
op = (lambda x: alloc(x, s1)),
expected = (lambda x: numpy.zeros((s1,), dtype=x.dtype) + x),
grad = dict(
x1 = (rand(),),
x2 = (rand(),),
x3 = (rand(),),
),
)
# alloc a vector into a tensor3
Alloc13GradTester = makeBroadcastTester(
name = 'Alloc13GradTester',
#op = (lambda self, x: alloc(x, s1, s2, s3)),
op = (lambda x: alloc(x, s1, s2, s3)),
expected = (lambda x: numpy.zeros((s1, s2, s3), dtype=x.dtype) + x),
grad = dict(
x1 = (rand(s3),),
x2 = (rand(s3),),
x3 = (rand(s3),),
),
)
def test_eye():
def check(dtype, N, M_=None, k=0):
# Theano does not accept None as a tensor.
......
@@ -13,30 +13,33 @@ ops without:
Prod
MulwithoutZeros
ProdWithoutZeros
CAReduce(for max,... done for MaxAndArgmax op)
list of ops that support R-op:
* with test
* SpecifyShape
* MaxAndArgmax
* Subtensor
* IncSubtensor set_subtensor too
* Alloc
* Dot
* Elemwise
* Sum
* Softmax
* Shape
* Join
* without test
* Split
* ARange
* ScalarFromTensor
* Shape
* SpecifyShape
* MaxAndArgmax
* Subtensor
* IncSubtensor
* Rebroadcast
* Join
* Reshape
* Flatten
* AdvancedSubtensor1
* AdvancedIncSubtensor1
* AdvancedIncSubtensor
* Dot
* DimShuffle
* Elemwise
* Sum
* Softmax
* Scan
@@ -183,11 +186,17 @@ class test_RopLop(unittest.TestCase):
self.in_shape)
def test_max_argmax(self):
def test_max(self):
## If we call max directly, we get a CAReduce object
## and it does not have R_op implemented!
#self.check_mat_rop_lop(TT.max(self.mx, axis=[0,1])[0],
# ())
self.check_mat_rop_lop(TT.max(self.mx, axis=0),
(self.mat_in_shape[1],))
self.check_mat_rop_lop(TT.max(self.mx, axis=1),
(self.mat_in_shape[0],))
def test_max_argmax(self):
def test_argmax(self):
self.check_nondiff_rop(TT.argmax(self.mx,axis=1))
def test_subtensor(self):
@@ -201,7 +210,7 @@ class test_RopLop(unittest.TestCase):
self.check_rop_lop(out, self.in_shape)
def test_incsubtensor1(self):
def test_incsubtensor2(self):
tv = numpy.asarray( self.rng.uniform(size=(10,)),
theano.config.floatX)
t = theano.shared(tv)
@@ -217,7 +226,7 @@ class test_RopLop(unittest.TestCase):
self.check_rop_lop(out, self.in_shape)
def test_setsubtensor1(self):
def test_setsubtensor2(self):
tv = numpy.asarray( self.rng.uniform(size=(10,)),
theano.config.floatX)
t = theano.shared(tv)
......
@@ -4,15 +4,31 @@ This is a REALLY PARTIAL TEST.
I did them to help debug stuff.
"""
import logging
import StringIO
import theano
import theano.tensor as tensor
def test_pydotprint_cond_highlight():
assert len(theano.theano_logger.handlers) == 1
x = tensor.dvector()
f = theano.function([x], x*2)
f([1,2,3,4])
theano.printing.pydotprint(f, cond_highlight = True)
s = StringIO.StringIO()
new_handler = logging.StreamHandler(s)
new_handler.setLevel(logging.DEBUG)
orig_handler = theano.theano_logger.handlers[0]
theano.theano_logger.removeHandler(orig_handler)
theano.theano_logger.addHandler(new_handler)
try:
theano.printing.pydotprint(f, cond_highlight = True)
finally:
theano.theano_logger.addHandler(orig_handler)
theano.theano_logger.removeHandler(new_handler)
assert s.getvalue() == 'pydotprint: cond_highlight is set but there is no IfElse node in the graph\n'
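The rewritten test captures the logger's output by temporarily attaching a handler that writes into an in-memory buffer, then restoring the original handler. The same pattern works for any `logging` logger; a generic Python 3 sketch (the logger name is arbitrary, and `io.StringIO` stands in for the Python 2 `StringIO.StringIO` used above):

```python
import io
import logging

def capture_log(logger, emit):
    # Temporarily route the logger into an in-memory buffer so the
    # emitted message can be asserted on, then remove the handler.
    buf = io.StringIO()
    handler = logging.StreamHandler(buf)
    logger.addHandler(handler)
    try:
        emit()
    finally:
        logger.removeHandler(handler)
    return buf.getvalue()

log = logging.getLogger('demo')
out = capture_log(log, lambda: log.warning('no IfElse node in the graph'))
assert out == 'no IfElse node in the graph\n'
```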