Commit ae91b627 authored by Frédéric Bastien

Merge pull request #1459 from nouiz/presentation

Presentation
......@@ -15,7 +15,7 @@ Day 1
* Show of hands - what is your background?
* Python & Numpy in a nutshell
* Python & NumPy in a nutshell
* Theano basics
......
.. _gpundarray:
.. _cifar2013_gpundarray:
**********
GpuNdArray
......
......@@ -18,7 +18,7 @@ What does it do?
* symbolic differentiation.
It complements the Python numeric/scientific software stack (e.g. numpy, scipy,
It complements the Python numeric/scientific software stack (e.g. NumPy, SciPy,
scikits, matplotlib, PIL.)
Design and feature set has been driven by machine learning research
......
......@@ -13,7 +13,7 @@ Background Questionaire
* What did you do with it?
* Who has used Python? numpy? scipy? matplotlib?
* Who has used Python? NumPy? SciPy? matplotlib?
* Who has used iPython?
......@@ -72,14 +72,14 @@ Python in one slide
# PYTHON SYNTAX EXAMPLE
#######################
a = 1 # no type declaration required!
b = (1,2,3) # tuple of three int literals
c = [1,2,3] # list of three int literals
b = (1, 2, 3) # tuple of three int literals
c = [1, 2, 3] # list of three int literals
d = {'a': 5, b: None} # dictionary of two elements
# N.B. string literal, None
print d['a'] # square brackets index
# -> 5
print d[(1,2,3)] # new tuple == b, retrieves None
print d[(1, 2, 3)] # new tuple == b, retrieves None
# -> None
print d[6]
# raises KeyError Exception
......@@ -116,18 +116,18 @@ Python in one slide
print Bar(99).hello() # Creating an instance of Bar
# -> 99
Numpy in one slide
NumPy in one slide
------------------
* Python floats are full-fledged objects on the heap
* Not suitable for high-performance computing!
* Numpy provides a N-dimensional numeric array in Python
* NumPy provides a N-dimensional numeric array in Python
* Perfect for high-performance computing.
* Numpy provides
* NumPy provides
* elementwise computations
......@@ -135,7 +135,7 @@ Numpy in one slide
* pseudorandom numbers from many distributions
* Scipy provides lots more, including
* SciPy provides lots more, including
* more linear algebra
......@@ -148,29 +148,29 @@ Numpy in one slide
.. code-block:: python
##############################
# Properties of Numpy arrays
# Properties of NumPy arrays
# that you really need to know
##############################
import numpy as np # import can rename
a = np.random.rand(3,4,5) # random generators
a = np.random.rand(3, 4, 5) # random generators
a32 = a.astype('float32') # arrays are strongly typed
a.ndim # int: 3
a.shape # tuple: (3,4,5)
a.shape # tuple: (3, 4, 5)
a.size # int: 60
a.dtype # np.dtype object: 'float64'
a32.dtype # np.dtype object: 'float32'
Arrays can be combined with numeric operators, standard mathematical
functions. Numpy has great `documentation <http://docs.scipy.org/doc/numpy/reference/>`_.
functions. NumPy has great `documentation <http://docs.scipy.org/doc/numpy/reference/>`_.
Training an MNIST-ready classification neural network in pure numpy might look like this:
Training an MNIST-ready classification neural network in pure NumPy might look like this:
.. code-block:: python
#########################
# Numpy for Training a
# NumPy for Training a
# Neural Network on MNIST
#########################
......@@ -186,23 +186,23 @@ Training an MNIST-ready classification neural network in pure numpy might look l
batchsize = 100
for i in xrange(1000):
x_i = x[i*batchsize:(i+1)*batchsize]
y_i = y[i*batchsize:(i+1)*batchsize]
x_i = x[i * batchsize: (i + 1) * batchsize]
y_i = y[i * batchsize: (i + 1) * batchsize]
hidin = np.dot(x_i, w) + b
hidout = np.tanh(hidin)
outin = np.dot(hidout, v) + c
outout = (np.tanh(outin)+1)/2.0
outout = (np.tanh(outin) + 1) / 2.0
g_outout = outout - y_i
err = 0.5 * np.sum(g_outout**2)
err = 0.5 * np.sum(g_outout ** 2)
g_outin = g_outout * outout * (1.0 - outout)
g_hidout = np.dot(g_outin, v.T)
g_hidin = g_hidout * (1 - hidout**2)
g_hidin = g_hidout * (1 - hidout ** 2)
b -= lr * np.sum(g_hidin, axis=0)
c -= lr * np.sum(g_outin, axis=0)
......@@ -215,9 +215,9 @@ What's missing?
* Non-lazy evaluation (required by Python) hurts performance
* Numpy is bound to the CPU
* NumPy is bound to the CPU
* Numpy lacks symbolic or automatic differentiation
* NumPy lacks symbolic or automatic differentiation
Now let's have a look at the same algorithm in Theano, which runs 15 times faster if
you have a GPU (I'm skipping some dtype details, which we'll come back to).
......@@ -229,40 +229,42 @@ you have GPU (I'm skipping some dtype-details which we'll come back to).
# Neural Network on MNIST
#########################
import theano as T
import theano.tensor as TT
import numpy as np
import theano
import theano.tensor as tensor
x = np.load('data_x.npy')
y = np.load('data_y.npy')
# symbol declarations
sx = TT.matrix()
sy = TT.matrix()
w = T.shared(np.random.normal(avg=0, std=.1,
size=(784, 500)))
b = T.shared(np.zeros(500))
v = T.shared(np.zeros((500, 10)))
c = T.shared(np.zeros(10))
sx = tensor.matrix()
sy = tensor.matrix()
w = theano.shared(np.random.normal(avg=0, std=.1,
size=(784, 500)))
b = theano.shared(np.zeros(500))
v = theano.shared(np.zeros((500, 10)))
c = theano.shared(np.zeros(10))
# symbolic expression-building
hid = TT.tanh(TT.dot(sx, w) + b)
out = TT.tanh(TT.dot(hid, v) + c)
err = 0.5 * TT.sum(out - sy)**2
gw, gb, gv, gc = TT.grad(err, [w,b,v,c])
hid = tensor.tanh(tensor.dot(sx, w) + b)
out = tensor.tanh(tensor.dot(hid, v) + c)
err = 0.5 * tensor.sum(out - sy) ** 2
gw, gb, gv, gc = tensor.grad(err, [w, b, v, c])
# compile a fast training function
train = T.function([sx, sy], err,
train = theano.function([sx, sy], err,
updates={
w:w - lr * gw,
b:b - lr * gb,
v:v - lr * gv,
c:c - lr * gc})
w: w - lr * gw,
b: b - lr * gb,
v: v - lr * gv,
c: c - lr * gc})
# now do the computations
batchsize = 100
for i in xrange(1000):
x_i = x[i*batchsize:(i+1)*batchsize]
y_i = y[i*batchsize:(i+1)*batchsize]
x_i = x[i * batchsize: (i + 1) * batchsize]
y_i = y[i * batchsize: (i + 1) * batchsize]
err_i = train(x_i, y_i)
......@@ -286,7 +288,7 @@ Theano in one slide
* Expression substitution optimizations automatically draw
on many backend technologies for best performance.
* FFTW, MKL, ATLAS, Scipy, Cython, CUDA
* FFTW, MKL, ATLAS, SciPy, Cython, CUDA
* Slower fallbacks always available
......
......@@ -75,7 +75,7 @@ Exercise 6
- Modify and execute it to work for a matrix of 20 x 10
.. _pyCUDA_theano:
.. _cifar2011_pyCUDA_theano:
Theano + PyCUDA
---------------
......
......@@ -345,20 +345,11 @@ Differentiation details
* We are working on the missing optimizations to be able to compute efficiently the full Jacobian and Hessian and Jacobian x vector
.. _cifar2011_benchmark:
Benchmarks
----------
Example:
* Multi-layer perceptron
* Convolutional Neural Networks
* Misc Elemwise operations
Competitors: NumPy + SciPy, MATLAB, EBLearn, Torch5, numexpr
* EBLearn, Torch5: specialized libraries written by practitioners specifically for these tasks
* numexpr: similar to Theano, 'virtual machine' for elemwise expressions
**Multi-Layer Perceptron**:
......
.. _crei2013_advanced_theano:
***************
Advanced Theano
***************
Profiling
---------
- To replace the default mode with this mode, use the Theano flags ``profile=True``
- To enable the memory profiling use the flags ``profile_memory=True``
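The flags can also be set per run through the ``THEANO_FLAGS`` environment variable (``logreg.py`` here is just a stand-in script name):

```shell
# profile run time; add profile_memory=True to profile memory too
THEANO_FLAGS=profile=True python logreg.py
THEANO_FLAGS=profile=True,profile_memory=True python logreg.py
```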
Theano output:
.. literalinclude:: logreg_profile.txt
Compilation pipeline
--------------------
.. image:: ../hpcs2011_tutorial/pics/pipeline.png
:width: 400 px
Inplace optimization
--------------------
- Two types of inplace operations:
- An op that returns a view of its inputs (e.g. reshape, inplace transpose)
- An op that writes its output into its inputs' memory space
- This allows some memory optimization
- An Op must tell Theano whether it works inplace
- Inplace Ops add constraints on the order of execution
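The two kinds can be sketched in plain NumPy (Theano's inplace machinery is its own, but the memory behaviour is the same idea):

```python
import numpy as np

a = np.arange(6).reshape(2, 3)

# Kind 1: an op that returns a view -- no data is copied,
# so a write through the view lands in the original array
v = a.reshape(3, 2)
v[0, 0] = 99
assert a[0, 0] == 99

# Kind 2: an op that writes its output into its input's
# memory space (here via NumPy's out= argument)
b = np.ones(4)
np.multiply(b, 2, out=b)
assert (b == 2).all()
```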
Conditions
----------
**IfElse**
- Builds a condition over symbolic variables.
- The IfElse Op takes a boolean condition and two variables to compute as input.
- While the Switch Op evaluates both 'output' variables, the IfElse Op is lazy and only
evaluates one variable, with respect to the condition.
**IfElse Example: Comparison with Switch**
.. literalinclude:: ifelse_switch.py
The IfElse Op spends about half the time of Switch, since it computes only
one variable instead of both.
$ python ifelse_switch.py
time spent evaluating both values 0.230000 sec
time spent evaluating one value 0.120000 sec
Note that IfElse condition is a boolean while Switch condition is a tensor, so
Switch is more general.
It is actually important to use ``linker='vm'`` or ``linker='cvm'``,
otherwise IfElse will compute both variables and take the same computation
time as the Switch Op. The linker is not currently set to 'cvm' by default,
but it will be in the near future.
Loops
-----
**Scan**
- General form of **recurrence**, which can be used for looping.
- **Reduction** and **map** (loop over the leading dimensions) are special cases of Scan
- You 'scan' a function along some input sequence, producing an output at each time-step
- The function can see the **previous K time-steps** of its own output
- ``sum()`` could be computed by scanning the z + x(i) function over a list, given an initial state of ``z=0``.
- Often a for-loop can be expressed as a ``scan()`` operation, and ``scan`` is the closest that Theano comes to looping.
- The advantages of using ``scan`` over for loops:
- The number of iterations can be part of the symbolic graph
- Minimizes GPU transfers, if a GPU is involved
- Computes gradients through sequential steps
- Slightly faster than using a for loop in Python with a compiled Theano function
- Can lower the overall memory usage by detecting the actual amount of memory needed
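As a plain-Python sketch of the recurrence (``theano.scan`` itself is more general, and the real examples follow below), scanning ``z + x(i)`` with ``z=0`` reproduces ``sum()``; ``scan_like`` is a hypothetical helper, not part of Theano:

```python
def scan_like(fn, sequence, initial):
    # Plain-Python sketch of a one-step scan recurrence:
    # apply fn along the sequence, feeding the previous
    # output back in, and collect every step's output.
    outputs = []
    state = initial
    for item in sequence:
        state = fn(item, state)
        outputs.append(state)
    return outputs

# sum() as a scan of z + x(i) with initial state z = 0
steps = scan_like(lambda x_i, z: z + x_i, [1, 2, 3, 4], 0)
# steps == [1, 3, 6, 10]; the last entry is the total
```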
**Scan Example: Computing pow(A,k)**
.. literalinclude:: scan_pow.py
**Scan Example: Calculating a Polynomial**
.. literalinclude:: scan_poly.py
Exercise 4
-----------
- Run both examples
- Modify and execute the polynomial example to have the reduction done by scan
Exercise 5
-----------
- In the last exercises, do you see a speed up with the GPU?
- Where does it come from? (Use ProfileMode)
- Is there something we can do to speed up the GPU version?
Printing/Drawing Theano graphs
------------------------------
- Pretty Printing
``theano.printing.pprint(variable)``
>>> theano.printing.pprint(prediction)
gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))),TensorConstant{0.5})
- Debug Print
``theano.printing.debugprint({fct, variable, list of variables})``
>>> theano.printing.debugprint(prediction)
Elemwise{gt,no_inplace} [@181772236] ''
|Elemwise{true_div,no_inplace} [@181746668] ''
| |InplaceDimShuffle{x} [@181746412] ''
| | |TensorConstant{1} [@181745836]
| |Elemwise{add,no_inplace} [@181745644] ''
| | |InplaceDimShuffle{x} [@181745420] ''
| | | |TensorConstant{1} [@181744844]
| | |Elemwise{exp,no_inplace} [@181744652] ''
| | | |Elemwise{sub,no_inplace} [@181744012] ''
| | | | |Elemwise{neg,no_inplace} [@181730764] ''
| | | | | |dot [@181729676] ''
| | | | | | |x [@181563948]
| | | | | | |w [@181729964]
| | | | |InplaceDimShuffle{x} [@181743788] ''
| | | | | |b [@181730156]
|InplaceDimShuffle{x} [@181771788] ''
| |TensorConstant{0.5} [@181771148]
>>> theano.printing.debugprint(predict)
Elemwise{Composite{neg,{sub,{{scalar_sigmoid,GT},neg}}}} [@183160204] '' 2
|dot [@183018796] '' 1
| |x [@183000780]
| |w [@183000812]
|InplaceDimShuffle{x} [@183133580] '' 0
| |b [@183000876]
|TensorConstant{[ 0.5]} [@183084108]
- Picture Printing of Graphs
>>> theano.printing.pydotprint_variables(prediction)
.. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_prediction.png
:width: 800 px
All ``pydotprint*`` functions require graphviz and pydot
>>> theano.printing.pydotprint(predict)
.. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_predic.png
:width: 800 px
>>> theano.printing.pydotprint(train) # This is a small train example!
.. image:: ../hpcs2011_tutorial/pics/logreg_pydotprint_train.png
:width: 1500 px
Debugging
---------
- Run with the Theano flag ``compute_test_value = {'off', 'ignore', 'warn', 'raise'}``
- Runs the code as the graph is created
- Allows you to find bugs earlier (e.g. a shape mismatch)
- Makes it easier to identify where the problem lies in *your* code
- Uses the values of constants and shared variables directly
- For pure symbolic variables, use ``x.tag.test_value = numpy.random.rand(5, 10)``
- Run with the flag ``mode=FAST_COMPILE``
- Few optimizations
- Run Python code (better error messages and can be debugged interactively in the Python debugger)
- Run with the flag ``mode=DebugMode``
- 100-1000x slower
- Test all optimization steps from the original graph to the final graph
- Checks many things that Ops should/shouldn't do
- Executes both the Python and C code versions
Known limitations
-----------------
- Compilation phase distinct from execution phase
- Use ``a_tensor_variable.eval()`` to make this less visible
- Compilation time can be significant
- Amortize it by applying functions to big inputs, or by reusing functions
- Execution overhead
- We have worked on this, but more work needed
- So a certain amount of computation per call is needed for it to be worthwhile
- Compilation time superlinear in the size of the graph.
- Hundreds of nodes is fine
- Disabling a few optimizations can speed up compilation
- Usually too many nodes indicates a problem with the graph
.. _crei2013_gpundarray:
**********
GpuNdArray
**********
Why a common GPU ndarray?
-------------------------
- Currently there are at least 4 different GPU array data structures in use by Python packages
- CudaNdarray (Theano), GPUArray (PyCUDA), CUDAMatrix (cudamat), GPUArray (PyOpenCL), ...
- There are even more if we include other languages
- All of them are a subset of the functionality of ``numpy.ndarray`` on the GPU
- Lots of duplicated effort
- GPU code is harder/slower to write **correctly** and **fast** than CPU/Python code
- Lack of a common array API makes it harder to port/reuse code
- Also harder to find/distribute code
- Divides development work
Design Goals
------------
- Make it VERY similar to ``numpy.ndarray``
- Be compatible with both CUDA and OpenCL
- Have the base object accessible from C to allow collaboration with more projects, across high-level languages
- We want people from C, C++, Ruby, R, ... to all use the same base GPU N-dimensional array
Final Note
----------
- Under development
- Will be the next GPU array container for Theano (*this summer!*)
- Probably also for PyCUDA, PyOpenCL
- Mailing list: http://lists.tiker.net/listinfo/gpundarray
import time
import numpy
import theano
from theano import tensor as tt
from theano.ifelse import ifelse
a, b = tt.scalars('a', 'b')
x, y = tt.matrices('x', 'y')
z_switch = tt.switch(tt.lt(a, b), tt.mean(x), tt.mean(y))
z_lazy = ifelse(tt.lt(a, b), tt.mean(x), tt.mean(y))
f_switch = theano.function([a, b, x, y], z_switch)
f_lazyifelse = theano.function([a, b, x, y], z_lazy)
val1 = 0.
val2 = 1.
big_mat1 = numpy.ones((10000, 1000))
big_mat2 = numpy.ones((10000, 1000))
n_times = 10
tic = time.clock()
for i in xrange(n_times):
f_switch(val1, val2, big_mat1, big_mat2)
print 'time spent evaluating both values %f sec' % (time.clock() - tic)
tic = time.clock()
for i in xrange(n_times):
f_lazyifelse(val1, val2, big_mat1, big_mat2)
print 'time spent evaluating one value %f sec' % (time.clock() - tic)
.. _crei2013_index:
===========================
Theano Tutorial @ CREI 2013
===========================
July 19, 2013, Sherbrooke, Québec, Canada.
Theano is Python software for evaluating complicated array expressions.
What does it do?
* aggressive expression optimizations,
* automatic GPU use,
* symbolic differentiation and R op.
It complements the Python numeric/scientific software stack (e.g. NumPy, SciPy,
scikits, matplotlib, PIL.)
Its design and feature set have been driven by machine learning research
at the University of
Montreal (the groups of Yoshua Bengio, Pascal Vincent, Aaron Courville and Roland Memisevic).
The result is a very good library for doing research in deep
learning and neural network training, and a flexible framework for
many other models and algorithms in machine learning more generally.
It has proven to be useful for implementing:
- linear and nonlinear neural network classifiers
- convolutional models
- Energy models: RBM, DBN, GRBM, ssRBM, AIS
- Auto-encoders: DAE, CAE
- GP regression
- sparse coding
- recurrent neural networks, echo state, (HMM?)
- online and batch learning and optimization
- Even SVM!
As people's needs change this list will grow, but Theano is built
around vector, matrix, and tensor expressions; there is little reason
to use it for calculations on other data structures. There is
also some sparse matrix support.
Contents
--------
The structured part of these lab sessions will be a walk-through of the following
material. Interleaved with this structured part will be blocks of time for
individual or group work. The idea is that you can try out Theano and get help
from gurus on hand if you get stuck.
.. toctree::
introduction
theano
advanced_theano
gpundarray
/tutorial/extending_theano
.. _crei2013_Introduction:
************
Introduction
************
Background Questionnaire
-----------------------
* Who has used Theano before?
* What did you do with it?
* Who has used Python? NumPy? SciPy? matplotlib?
* Who has used IPython?
* Who has used it as a distributed computing engine?
* Who has done C/C++ programming?
* Who has organized computation around a particular physical memory layout?
* Who has used a multidimensional array of >2 dimensions?
* Who has written a Python module in C before?
* Who has written a program to *generate* Python modules in C?
* Who has used a templating engine?
* Who has programmed a GPU before?
* Using OpenGL / shaders?
* Using CUDA (runtime? / driver?)
* Using PyCUDA?
* Using OpenCL / PyOpenCL?
* Using cudamat / gnumpy?
* Other?
* Who has used Cython?
Python in one slide
-------------------
* General-purpose high-level OO interpreted language
* Emphasizes code readability
* Comprehensive standard library
* Dynamic type and memory management
* Built-in types: int, float, str, list, dict, tuple, object
* Slow execution
* Popular in web-dev and scientific communities
.. code-block:: python
#######################
# PYTHON SYNTAX EXAMPLE
#######################
a = 1 # no type declaration required!
b = (1, 2, 3) # tuple of three int literals
c = [1, 2, 3] # list of three int literals
d = {'a': 5, b: None} # dictionary of two elements
# N.B. string literal, None
print d['a'] # square brackets index
# -> 5
print d[(1, 2, 3)] # new tuple == b, retrieves None
# -> None
print d[6]
# raises KeyError Exception
x, y, z = 10, 100, 100 # multiple assignment from tuple
x, y, z = b # unpacking a sequence
b_squared = [b_i**2 for b_i in b] # list comprehension
def foo(b, c=3): # function w default param c
return a + b + c # note scoping, indentation
foo(5) # calling a function
# -> 1 + 5 + 3 == 9 # N.B. scoping
foo(b=6, c=2) # calling with named args
# -> 1 + 6 + 2 == 9
print b[1:3] # slicing syntax
class Foo(object): # Defining a class
def __init__(self):
self.a = 5
def hello(self):
return self.a
f = Foo() # Creating a class instance
print f.hello() # Calling methods of objects
# -> 5
class Bar(Foo): # Defining a subclass
def __init__(self, a):
self.a = a
print Bar(99).hello() # Creating an instance of Bar
# -> 99
NumPy in one slide
------------------
* Python floats are full-fledged objects on the heap
* Not suitable for high-performance computing!
* NumPy provides an N-dimensional numeric array in Python
* Perfect for high-performance computing.
* Slices return views (no copy)
* NumPy provides
* elementwise computations
* linear algebra, Fourier transforms
* pseudorandom numbers from many distributions
* SciPy provides lots more, including
* more linear algebra
* solvers and optimization algorithms
* MATLAB-compatible I/O
* I/O and signal processing for images and audio
.. code-block:: python
##############################
# Properties of NumPy arrays
# that you really need to know
##############################
import numpy as np # import can rename
a = np.random.rand(3, 4, 5) # random generators
a32 = a.astype('float32') # arrays are strongly typed
a.ndim # int: 3
a.shape # tuple: (3, 4, 5)
a.size # int: 60
a.dtype # np.dtype object: 'float64'
a32.dtype # np.dtype object: 'float32'
assert a[1, 1, 1] != 10 # almost surely not 10 yet
a[1, 1, 1] = 10 # indexed assignment writes into
assert a[1, 1, 1] == 10 # the original array
Arrays can be combined with numeric operators, standard mathematical
functions. NumPy has great `documentation <http://docs.scipy.org/doc/numpy/reference/>`_.
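The earlier bullet that slices return views (no copy) deserves a quick concrete check:

```python
import numpy as np

m = np.zeros((3, 4))
row = m[1]        # basic slicing returns a view, not a copy
row[:] = 7        # so writing through the view...
assert (m[1] == 7).all()   # ...changes the original array

c = m[1].copy()   # an explicit copy is independent
c[:] = 0
assert (m[1] == 7).all() and (c == 0).all()
```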
Training an MNIST-ready classification neural network in pure NumPy might look like this:
.. code-block:: python
#########################
# NumPy for Training a
# Neural Network on MNIST
#########################
x = np.load('data_x.npy')
y = np.load('data_y.npy')
w = np.random.normal(
loc=0,
scale=.1,
size=(784, 500))
b = np.zeros((500,))
v = np.zeros((500, 10))
c = np.zeros((10,))
lr = 0.01 # learning rate (value assumed; not on the slide)
batchsize = 100
for i in xrange(1000):
x_i = x[i * batchsize: (i + 1) * batchsize]
y_i = y[i * batchsize: (i + 1) * batchsize]
hidin = np.dot(x_i, w) + b
hidout = np.tanh(hidin)
outin = np.dot(hidout, v) + c
outout = (np.tanh(outin) + 1) / 2.0
g_outout = outout - y_i
err = 0.5 * np.sum(g_outout ** 2)
g_outin = g_outout * outout * (1.0 - outout)
g_hidout = np.dot(g_outin, v.T)
g_hidin = g_hidout * (1 - hidout ** 2)
b -= lr * np.sum(g_hidin, axis=0)
c -= lr * np.sum(g_outin, axis=0)
w -= lr * np.dot(x_i.T, g_hidin)
v -= lr * np.dot(hidout.T, g_outin)
What's missing?
---------------
* Non-lazy evaluation (required by Python) hurts performance
* NumPy is bound to the CPU
* NumPy lacks symbolic or automatic differentiation
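Without automatic differentiation, the plain-NumPy workaround is a hand-derived gradient (as in the backprop loop above), typically checked against finite differences; ``numeric_grad`` below is a hypothetical helper, not part of NumPy:

```python
import numpy as np

def numeric_grad(f, x, eps=1e-6):
    # Centred finite differences: approximate df/dx_i one
    # coordinate at a time -- slow, but needs no derivation.
    g = np.zeros_like(x)
    for i in range(x.size):
        d = np.zeros_like(x)
        d.flat[i] = eps
        g.flat[i] = (f(x + d) - f(x - d)) / (2 * eps)
    return g

f = lambda z: 0.5 * np.sum(z ** 2)   # analytic gradient is z
g = numeric_grad(f, np.array([1.0, -2.0, 3.0]))
# g is close to [1, -2, 3]
```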
Now let's have a look at the same algorithm in Theano, which runs 15 times faster if
you have a GPU (I'm skipping some dtype details, which we'll come back to).
.. code-block:: python
#########################
# Theano for Training a
# Neural Network on MNIST
#########################
import numpy as np
import theano
import theano.tensor as tensor
x = np.load('data_x.npy')
y = np.load('data_y.npy')
# symbol declarations
sx = tensor.matrix()
sy = tensor.matrix()
w = theano.shared(np.random.normal(loc=0, scale=.1,
size=(784, 500)))
b = theano.shared(np.zeros(500))
v = theano.shared(np.zeros((500, 10)))
c = theano.shared(np.zeros(10))
# symbolic expression-building
hid = tensor.tanh(tensor.dot(sx, w) + b)
out = tensor.tanh(tensor.dot(hid, v) + c)
err = 0.5 * tensor.sum((out - sy) ** 2)
gw, gb, gv, gc = tensor.grad(err, [w, b, v, c])
lr = 0.01 # learning rate (value assumed; not on the slide)
# compile a fast training function
train = theano.function([sx, sy], err,
updates={
w: w - lr * gw,
b: b - lr * gb,
v: v - lr * gv,
c: c - lr * gc})
# now do the computations
batchsize = 100
for i in xrange(1000):
x_i = x[i * batchsize: (i + 1) * batchsize]
y_i = y[i * batchsize: (i + 1) * batchsize]
err_i = train(x_i, y_i)
Theano in one slide
-------------------
* High-level domain-specific language tailored to numeric computation
* Compiles most common expressions to C for CPU and GPU.
* Limited expressivity means lots of opportunities for expression-level optimizations
* No function call -> global optimization
* Strongly typed -> compiles to machine instructions
* Array oriented -> parallelizable across cores
* Support for looping and branching in expressions
* Expression substitution optimizations automatically draw
on many backend technologies for best performance.
* FFTW, MKL, ATLAS, SciPy, Cython, CUDA
* Slower fallbacks always available
* Automatic differentiation and R op
* Sparse matrices
Project status
--------------
* Mature: Theano has been developed and used since January 2008 (5.5 years)
* Has driven over 87 research papers
* Good user documentation
* Active mailing list with participants from outside our lab
* Core technology for a funded Silicon-Valley startup
* Many contributors (some from outside our lab)
* Used to teach IFT6266 for many years
* Used for research at Google and Yahoo.
* Downloads (January 2011 - June 8 2011):
* PyPI (16 July 2013): 60k total, 159 last day, 823 last week
* Github (`bleeding edge` repository): unknown
Why scripting for GPUs?
-----------------------
They *complement each other*:
* GPUs are everything that scripting/high level languages are not
* Highly parallel
* Very architecture-sensitive
* Built for maximum FP/memory throughput
* So hard to program that meta-programming is easier.
* CPU: largely restricted to control
* Optimized for sequential code and low latency (rather than high throughput)
* Tasks (1000/sec)
* Scripting fast enough
Best of both: scripted CPU invokes JIT-compiled kernels on GPU.
How Fast are GPUs?
------------------
* Theory
* Intel Core i7 980 XE (107Gf/s float64) 6 cores
* NVIDIA C2050 (515 Gf/s float64, 1Tf/s float32) 480 cores
* NVIDIA GTX580 (1.5Tf/s float32) 512 cores
* GPUs are faster, cheaper, more power-efficient
* Practice (our experience)
* Depends on algorithm and implementation!
* Reported speed improvements over CPU in the literature vary *widely* (0.01x to 1000x)
* Matrix-matrix multiply speedup: usually about 10-20x.
* Convolution speedup: usually about 15x.
* Elemwise speedup: slower or up to 100x (depending on operation and layout)
* Sum: can be faster or slower depending on layout.
* Benchmarking is delicate work...
* How to control quality of implementation?
* How much time was spent optimizing CPU vs GPU code?
* Theano's up-to-100x GPU speedups are measured against a single CPU core
* Theano can be linked with multi-core capable BLAS (GEMM and GEMV)
* If you see speedup > 100x, the benchmark is probably not fair.
import numpy
import theano
import theano.tensor as tt
rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2))
training_steps = 10000
# Declare Theano symbolic variables
x = tt.matrix("x")
y = tt.vector("y")
w = theano.shared(rng.randn(feats), name="w")
b = theano.shared(0., name="b")
print "Initial model:"
print w.get_value(), b.get_value()
# Construct Theano expression graph
p_1 = 1 / (1 + tt.exp(-tt.dot(x, w) - b)) # Probability that target = 1
prediction = p_1 > 0.5 # The prediction thresholded
xent = -y * tt.log(p_1) - (1 - y) * tt.log(1 - p_1) # Cross-entropy loss
cost = xent.mean() + 0.01 * (w ** 2).sum() # The cost to minimize
gw, gb = tt.grad(cost, [w, b])
# Compile
train = theano.function(
inputs=[x, y],
outputs=[prediction, xent],
updates=[(w, w - 0.1 * gw),
(b, b - 0.1 * gb)],
name='train')
predict = theano.function(inputs=[x], outputs=prediction,
name='predict')
# Train
for i in range(training_steps):
pred, err = train(D[0], D[1])
print "Final model:"
print w.get_value(), b.get_value()
print "target values for D:", D[1]
print "prediction on D:", predict(D[0])
Function profiling
==================
Message: train
Time in 10000 calls to Function.__call__: 7.171231e+00s
Time in Function.fn.__call__: 6.686692e+00s (93.243%)
Time in thunks: 6.511275e+00s (90.797%)
Total compile time: 6.550491e-01s
Theano Optimizer time: 5.976810e-01s
Theano validate time: 1.260662e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.649593e-02s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
87.0% 87.0% 5.665s 2.83e-04s C 20000 2 <class 'theano.tensor.blas_c.CGemv'>
11.5% 98.4% 0.746s 7.46e-06s C 100000 10 <class 'theano.tensor.elemwise.Elemwise'>
0.7% 99.1% 0.045s 2.27e-06s C 20000 2 <class 'theano.tensor.basic.Alloc'>
0.5% 99.6% 0.030s 1.01e-06s C 30000 3 <class 'theano.tensor.elemwise.DimShuffle'>
0.2% 99.8% 0.013s 1.34e-06s C 10000 1 <class 'theano.tensor.elemwise.Sum'>
0.2% 100.0% 0.012s 6.00e-07s C 20000 2 <class 'theano.tensor.opt.Shape_i'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
87.0% 87.0% 5.665s 2.83e-04s C 20000 2 CGemv{inplace}
6.9% 93.9% 0.452s 4.52e-05s C 10000 1 Elemwise{Composite{[Composite{[Composite{[sub(mul(i0, i1), neg(i2))]}(
1.8% 95.7% 0.116s 1.16e-05s C 10000 1 Elemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i
1.7% 97.4% 0.109s 1.09e-05s C 10000 1 Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0,
0.7% 98.1% 0.045s 2.27e-06s C 20000 2 Alloc
0.3% 98.4% 0.020s 1.02e-06s C 20000 2 InplaceDimShuffle{x}
0.2% 98.6% 0.015s 1.50e-06s C 10000 1 Elemwise{sub,no_inplace}
0.2% 98.8% 0.014s 1.42e-06s C 10000 1 Elemwise{gt,no_inplace}
0.2% 99.1% 0.013s 1.34e-06s C 10000 1 Sum
0.2% 99.3% 0.013s 1.29e-06s C 10000 1 Elemwise{neg,no_inplace}
0.2% 99.4% 0.012s 6.00e-07s C 20000 2 Shape_i{0}
0.2% 99.6% 0.010s 9.84e-07s C 10000 1 InplaceDimShuffle{1,0}
0.1% 99.7% 0.010s 9.58e-07s C 10000 1 Elemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)]
0.1% 99.8% 0.007s 6.95e-07s C 10000 1 Elemwise{Cast{float64}}
0.1% 99.9% 0.005s 5.46e-07s C 10000 1 Elemwise{inv,no_inplace}
0.1% 100.0% 0.005s 4.88e-07s C 10000 1 Elemwise{Composite{[sub(i0, mul(i1, i2))]}}[(0, 0)]
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
51.0% 51.0% 3.319s 3.32e-04s 10000 7 CGemv{inplace}(Alloc.0, TensorConstant{1.0}, x, w, TensorConstant{0.0})
36.0% 87.0% 2.345s 2.35e-04s 10000 18 CGemv{inplace}(w, TensorConstant{-0.1}, x.T, Elemwise{Composite{[Composite{[Compo
6.9% 93.9% 0.452s 4.52e-05s 10000 13 Elemwise{Composite{[Composite{[Composite{[sub(mul(i0, i1), neg(i2))]}(i0, scalar_
1.8% 95.7% 0.116s 1.16e-05s 10000 16 Elemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i2))]}(i0, n
1.7% 97.4% 0.109s 1.09e-05s 10000 14 Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 0)](Elemwis
0.5% 97.9% 0.031s 3.13e-06s 10000 12 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
0.2% 98.1% 0.015s 1.50e-06s 10000 4 Elemwise{sub,no_inplace}(TensorConstant{(1,) of 1.0}, y)
0.2% 98.3% 0.014s 1.42e-06s 10000 15 Elemwise{gt,no_inplace}(Elemwise{ScalarSigmoid{output_types_preference=transfer_t
0.2% 98.5% 0.014s 1.40e-06s 10000 5 Alloc(TensorConstant{0.0}, Shape_i{0}.0)
0.2% 98.7% 0.013s 1.34e-06s 10000 17 Sum(Elemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i2))]}(i
0.2% 98.9% 0.013s 1.33e-06s 10000 0 InplaceDimShuffle{x}(b)
0.2% 99.1% 0.013s 1.29e-06s 10000 11 Elemwise{neg,no_inplace}(Elemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)].0)
0.2% 99.3% 0.010s 9.84e-07s 10000 2 InplaceDimShuffle{1,0}(x)
0.1% 99.4% 0.010s 9.58e-07s 10000 9 Elemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)](CGemv{inplace}.0, InplaceDimShuff
0.1% 99.6% 0.007s 7.11e-07s 10000 6 InplaceDimShuffle{x}(Shape_i{0}.0)
0.1% 99.7% 0.007s 6.95e-07s 10000 8 Elemwise{Cast{float64}}(InplaceDimShuffle{x}.0)
0.1% 99.8% 0.006s 6.18e-07s 10000 1 Shape_i{0}(x)
0.1% 99.8% 0.006s 5.82e-07s 10000 3 Shape_i{0}(y)
0.1% 99.9% 0.005s 5.46e-07s 10000 10 Elemwise{inv,no_inplace}(Elemwise{Cast{float64}}.0)
0.1% 100.0% 0.005s 4.88e-07s 10000 19 Elemwise{Composite{[sub(i0, mul(i1, i2))]}}[(0, 0)](b, TensorConstant{0.1}, Sum.0
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
Function profiling
==================
Message: predict
Time in 1 calls to Function.__call__: 4.870892e-04s
Time in Function.fn.__call__: 4.608631e-04s (94.616%)
Time in thunks: 4.491806e-04s (92.217%)
Total compile time: 7.993293e-02s
Theano Optimizer time: 7.383800e-02s
Theano validate time: 2.010584e-03s
Theano Linker time (includes C, CUDA code generation/compiling): 4.319906e-03s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
94.2% 94.2% 0.000s 4.23e-04s C 1 1 <class 'theano.tensor.blas_c.CGemv'>
4.0% 98.2% 0.000s 1.81e-05s C 1 1 <class 'theano.tensor.elemwise.Elemwise'>
0.7% 98.9% 0.000s 3.10e-06s C 1 1 <class 'theano.tensor.basic.Alloc'>
0.6% 99.5% 0.000s 2.86e-06s C 1 1 <class 'theano.tensor.elemwise.DimShuffle'>
0.5% 100.0% 0.000s 2.15e-06s C 1 1 <class 'theano.tensor.opt.Shape_i'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
94.2% 94.2% 0.000s 4.23e-04s C 1 1 CGemv{inplace}
4.0% 98.2% 0.000s 1.81e-05s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[GT(scalar_sigmoid
0.7% 98.9% 0.000s 3.10e-06s C 1 1 Alloc
0.6% 99.5% 0.000s 2.86e-06s C 1 1 InplaceDimShuffle{x}
0.5% 100.0% 0.000s 2.15e-06s C 1 1 Shape_i{0}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
94.2% 94.2% 0.000s 4.23e-04s 1 3 CGemv{inplace}(Alloc.0, TensorConstant{1.0}, x, w, TensorConstant{0.0})
4.0% 98.2% 0.000s 1.81e-05s 1 4 Elemwise{Composite{[Composite{[Composite{[Composite{[GT(scalar_sigmoid(i0), i1)]}
0.7% 98.9% 0.000s 3.10e-06s 1 2 Alloc(TensorConstant{0.0}, Shape_i{0}.0)
0.6% 99.5% 0.000s 2.86e-06s 1 0 InplaceDimShuffle{x}(b)
0.5% 100.0% 0.000s 2.15e-06s 1 1 Shape_i{0}(x)
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
Function profiling
==================
Message: Sum of all printed profiles at exit
Time in 10001 calls to Function.__call__: 7.171718e+00s
Time in Function.fn.__call__: 6.687153e+00s (93.243%)
Time in thunks: 6.511724e+00s (90.797%)
Total compile time: 7.349820e-01s
Theano Optimizer time: 6.715190e-01s
Theano validate time: 1.461720e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 3.081584e-02s
[...]
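The `<% time>` and `<sum %>` columns in these profiles are simply each entry's thunk time as a fraction of total thunk time, plus a running total. A minimal sketch, reusing the timings from the `predict` Class table above:

```python
# Recompute the "% time" and "sum %" columns from per-class thunk times
# (values copied from the "predict" profile above).
thunk_times = [
    ('CGemv', 4.23e-04),
    ('Elemwise', 1.81e-05),
    ('Alloc', 3.10e-06),
    ('DimShuffle', 2.86e-06),
    ('Shape_i', 2.15e-06),
]

total = sum(t for _, t in thunk_times)
cumulative = 0.0
for name, t in thunk_times:  # already sorted by time, as the profiler sorts
    pct = 100.0 * t / total
    cumulative += pct
    print('%5.1f%% %6.1f%% %s' % (pct, cumulative, name))
```

The first row comes out at 94.2% and the cumulative column ends at 100.0%, matching the table.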
import numpy
import theano
import theano.tensor as tt
coefficients = tt.vector("coefficients")
x = tt.scalar("x")
max_coefficients_supported = 10000
# Generate the components of the polynomial
full_range = tt.arange(max_coefficients_supported)
components, updates = theano.scan(fn=lambda coeff, power, free_var:
coeff * (free_var ** power),
outputs_info=None,
sequences=[coefficients, full_range],
non_sequences=x)
polynomial = components.sum()
calculate_polynomial = theano.function(inputs=[coefficients, x],
outputs=polynomial)
test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
print calculate_polynomial(test_coeff, 3)
# 19.0
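As a cross-check, the same polynomial evaluation can be written in plain NumPy with no scan involved; this mirrors the per-step computation `coeff * (free_var ** power)` that scan performs:

```python
import numpy

# Each scan step computes coefficients[i] * x**i; summing gives the polynomial.
coefficients = numpy.asarray([1, 0, 2], dtype=numpy.float32)
x = 3
powers = numpy.arange(len(coefficients))
print((coefficients * x ** powers).sum())
# 19.0
```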
import theano
import theano.tensor as tt
k = tt.iscalar("k")
A = tt.vector("A")
def inner_fct(prior_result, A):
return prior_result * A
# Symbolic description of the result
result, updates = theano.scan(fn=inner_fct,
outputs_info=tt.ones_like(A),
non_sequences=A, n_steps=k)
# Scan has provided us with A**1 through A**k. Keep only the last
# value. Scan notices this and does not waste memory saving them.
final_result = result[-1]
power = theano.function(inputs=[A, k],
outputs=final_result,
updates=updates)
print power(range(10), 2)
#[ 0. 1. 4. 9. 16. 25. 36. 49. 64. 81.]
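What scan does here can be sketched in plain NumPy: start from `ones_like(A)`, multiply by `A` at each of the `k` steps, and keep only the last result:

```python
import numpy

def power_sketch(A, k):
    # Mirrors the scan loop: result = prior_result * A, repeated k times.
    result = numpy.ones_like(A)
    for _ in range(k):
        result = result * A
    return result

print(power_sketch(numpy.arange(10, dtype='float64'), 2))
```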
Diff collapsed.
......@@ -194,16 +194,19 @@ List of Implemented Operations
- Dot Product
- :class:`Dot <theano.sparse.basic.Dot>` and
:func:`dot <theano.sparse.basic.dot>`.
- The grad implemented is regular.
- No C code for perform and no C code for grad.
- Return a dense for perform and a dense for grad.
- :class:`StructuredDot <theano.sparse.basic.StructuredDot>`
and :func:`structured_dot <theano.sparse.basic.structured_dot>`.
- The grad implemented is structured.
- C code for perform and grad.
- Return a dense for perform and a sparse for grad.
- :class:`TrueDot <theano.sparse.basic.TrueDot>` and
:func:`true_dot <theano.sparse.basic.true_dot>`.
- The grad implemented is regular.
- No C code for perform and no C code for grad.
- Return a Sparse for perform and a Sparse for grad.
......@@ -211,11 +214,13 @@ List of Implemented Operations
grad to be dense if the second input of the op is dense.
- :class:`SamplingDot <theano.sparse.basic.SamplingDot>` and
``sampling_dot``.
- The grad implemented is structured for `p`.
- Sample of the dot and sample of the gradient.
- C code for perform but not for grad.
- Return sparse for perform and grad.
- :class:`Usmm <theano.sparse.basic.Usmm>` and ``usmm``.
- This op is the equivalent of gemm for sparse dot.
- There is no grad implemented for this op.
- There is an optimization that transforms a
......
......@@ -195,9 +195,10 @@ with NumPy arrays may be found here: :ref:`tensor creation<libdoc_tensor_creatio
program will use 32- or 64-bit integers (``i`` prefix vs. the ``l`` prefix)
and floats (``f`` prefix vs. the ``d`` prefix).
-------------------------------------------
**Exercise**
Exercise
========
.. code-block:: python
......
......@@ -84,10 +84,10 @@ The full documentation can be found in the library: :ref:`Scan <lib_scan>`.
# 19.0
-------------------------------------------
**Exercise**
Exercise
========
Run both examples.
......
......@@ -36,9 +36,9 @@ variables, type this from the command-line:
For more detail, see :ref:`Configuration <libdoc_config>` in the library.
-------------------------------------------
**Exercise**
Exercise
========
Consider the logistic regression:
......
......@@ -320,9 +320,9 @@ To change the value of a ``shared`` variable, e.g. to provide new data to proces
use ``shared_variable.set_value(new_value)``. For a lot more detail about this,
see :ref:`aliasing`.
-------------------------------------------
**Exercise**
Exercise
========
Consider again the logistic regression:
......@@ -551,15 +551,14 @@ you feel competent enough, you may try yourself on the corresponding exercises.
assert numpy.allclose(dest, a*b)
print dest
-------------------------------------------
**Exercise**
Exercise
========
Run the preceding example.
Modify and execute to work for a matrix of shape (20, 10).
-------------------------------------------
.. _pyCUDA_theano:
......@@ -615,9 +614,9 @@ Use this code to test it:
>>> assert numpy.allclose(f(xv), xv*2)
>>> print numpy.asarray(f(xv))
-------------------------------------------
**Exercise**
Exercise
========
Run the preceding example.
......
......@@ -405,7 +405,7 @@ def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
"""
:param ten4: a list of lists of images
ten4 is of shape (list 1 dim, list 2 dim,
row, col)
row, col)
:type ten4: A 4d tensor-like.
:param neib_shape: (r,c) where r is the height of the neighborhood
in rows and c is the width of the neighborhood
......@@ -416,13 +416,21 @@ def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
columns. When None, this is the same as
neib_shape (patches are disjoint)
:type neib_step: A 1d tensor-like of 2 values.
:param mode: Possible values:
'valid': Requires an input that is a multiple of the
pooling factor (in each direction)
'ignore_borders': Same as valid, but will ignore the borders
if the shape(s) of the input
is not a multiple of the pooling factor(s)
'wrap_centered' : ?? TODO comment
:param mode:
Possible values:
``valid``
Requires an input that is a multiple of the
pooling factor (in each direction)
``ignore_borders``
Same as valid, but will ignore the borders
if the shape(s) of the input
is not a multiple of the pooling factor(s)
``wrap_centered``
?? TODO comment
:type mode: str
:return:
Reshapes the input as a 2D tensor where each row is an
......
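A pure-NumPy sketch of what `images2neibs` does for a single 2D image in `'valid'` mode with disjoint patches (`neib_step` equal to `neib_shape`); a minimal illustration, not the real implementation:

```python
import numpy

def neibs_sketch(img, neib_shape):
    # Each output row is one flattened (r, c) neighborhood of the image.
    r, c = neib_shape
    H, W = img.shape
    assert H % r == 0 and W % c == 0  # 'valid' mode: exact multiples only
    rows = []
    for i in range(0, H, r):
        for j in range(0, W, c):
            rows.append(img[i:i + r, j:j + c].ravel())
    return numpy.array(rows)

print(neibs_sketch(numpy.arange(16).reshape(4, 4), (2, 2)))
```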