advance the presentation for crei.

8e7dd8cd · Frederic Bastien · 2f506f14 · 8e7dd8cd · 8e7dd8cd · 8e7dd8cd
--- a/doc/cifarSC2011/gpundarray.txt
+++ b/doc/cifarSC2011/gpundarray.txt

-.. _gpundarray:
+.. _cifar2013_gpundarray:

 **********
 GpuNdArray

--- a/doc/crei2013/advanced_theano.txt
+++ b/doc/crei2013/advanced_theano.txt

-.. _advanced_theano:
+.. _crei2013_advanced_theano:

 ***************
 Advanced Theano
 ***************

+
+Profiling
+---------
+
+- To enable profiling, use the Theano flag ``profile=True``
+
+- To enable memory profiling, use the flag ``profile_memory=True``
+
+Theano output:
+
+.. literalinclude:: logreg_profile.txt
+
+Compilation pipeline
+--------------------
+
+.. image:: ../hpcs2011_tutorial/pics/pipeline.png
+   :width: 400 px
+
+
+Inplace optimization
+--------------------
+
+- 2 types of inplace operations:
+
+  - An op that returns a view on its inputs (e.g. reshape, inplace transpose)
+  - An op that writes its output into the memory space of its inputs
+
+- This allows some memory optimization
+- An Op must tell Theano if it works inplace
+- Inplace Ops add constraints to the order of execution
+
+
 Conditions
 ----------
 **IfElse**
@@ -16,46 +48,14 @@ Conditions

 **IfElse Example: Comparison with Switch**

-.. code-block:: python
-
-  from theano import tensor as T
-  from theano.ifelse import ifelse
-  import theano, time, numpy
-
-  a,b = T.scalars('a','b')
-  x,y = T.matrices('x','y')
-  
-  z_switch = T.switch(T.lt(a,b), T.mean(x), T.mean(y))
-  z_lazy = ifelse(T.lt(a,b), T.mean(x), T.mean(y))
-
-  f_switch = theano.function([a,b,x,y], z_switch, 
-                      mode=theano.Mode(linker='vm'))
-  f_lazyifelse = theano.function([a,b,x,y], z_lazy,
-                      mode=theano.Mode(linker='vm'))
-
-  val1 = 0.
-  val2 = 1.
-  big_mat1 = numpy.ones((10000,1000))
-  big_mat2 = numpy.ones((10000,1000))
-
-  n_times = 10
-
-  tic = time.clock()
-  for i in xrange(n_times):
-      f_switch(val1, val2, big_mat1, big_mat2)
-  print 'time spent evaluating both values %f sec'%(time.clock()-tic)
-
-  tic = time.clock()
-  for i in xrange(n_times):
-      f_lazyifelse(val1, val2, big_mat1, big_mat2)
-  print 'time spent evaluating one value %f sec'%(time.clock()-tic)
+.. literalinclude:: ifelse_switch.py

 IfElse Op spend less time (about an half) than Switch since it computes only
 one variable instead of both.

 >>> python ifelse_switch.py
-time spent evaluating both values 0.6700 sec
-time spent evaluating one value 0.3500 sec
+time spent evaluating both values 0.230000 sec
+time spent evaluating one value 0.120000 sec

 Note that IfElse condition is a boolean while Switch condition is a tensor, so
 Switch is more general.
@@ -86,56 +86,12 @@ Loops

 **Scan Example: Computing pow(A,k)**

-.. code-block:: python
-
-  import theano
-  import theano.tensor as T
-
-  k = T.iscalar("k"); A = T.vector("A")
-
-  def inner_fct(prior_result, A): return prior_result * A
-  # Symbolic description of the result
-  result, updates = theano.scan(fn=inner_fct,
-                              outputs_info=T.ones_like(A),
-                              non_sequences=A, n_steps=k)
-
-  # Scan has provided us with A**1 through A**k.  Keep only the last
-  # value. Scan notices this and does not waste memory saving them.
-  final_result = result[-1]
-  
-  power = theano.function(inputs=[A,k], outputs=final_result,
-                        updates=updates)
-  
-  print power(range(10),2)
-  #[  0.   1.   4.   9.  16.  25.  36.  49.  64.  81.]
+.. literalinclude:: scan_pow.py


 **Scan Example: Calculating a Polynomial**

-.. code-block:: python
-
-  import theano
-  import theano.tensor as T
-
-  coefficients = theano.tensor.vector("coefficients")
-  x = T.scalar("x"); max_coefficients_supported = 10000
-
-  # Generate the components of the polynomial
-  full_range=theano.tensor.arange(max_coefficients_supported)
-  components, updates = theano.scan(fn=lambda coeff, power, free_var:
-                                     coeff * (free_var ** power),
-                                  outputs_info=None,
-                                  sequences=[coefficients, full_range],
-                                  non_sequences=x)
-  polynomial = components.sum()
-  calculate_polynomial = theano.function(inputs=[coefficients, x],
-                                       outputs=polynomial)
-
-  test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
-  print calculate_polynomial(test_coeff, 3)
-  # 19.0
-
-
+.. literalinclude:: scan_poly.py

 Exercise 4
 -----------
@@ -144,114 +100,6 @@ Exercise 4
 - Modify and execute the polynomial example to have the reduction done by scan


-
-Compilation pipeline
--------------------
-
-.. image:: ../hpcs2011_tutorial/pics/pipeline.png
-   :width: 400 px
-
-Inplace optimization
--------------------
-
- 2 type of inplace operations:
-
-  - An op that return a view on its inputs (e.g. reshape, inplace transpose)
-  - An op that write the output on the inputs memory space
-
- This allows some memory optimization
- The Op must tell Theano if they work inplace
- Inplace Op add constraints to the order of execution
-
-
-Profiling
---------
-
- To replace the default mode with this mode, use the Theano flags ``mode=ProfileMode``
-
- To enable the memory profiling use the flags ``ProfileMode.profile_memory=True``
-
-Theano output:
-
-.. code-block:: python
-
-    """
-    Time since import 33.456s
-    Theano compile time: 1.023s (3.1% since import)
-      Optimization time: 0.789s
-      Linker time: 0.221s
-    Theano fct call 30.878s (92.3% since import)
-     Theano Op time 29.411s 87.9%(since import) 95.3%(of fct call)
-     Theano function overhead in ProfileMode 1.466s 4.4%(since import)
-                                                  4.7%(of fct call)
-    10001 Theano fct call, 0.003s per call
-    Rest of the time since import 1.555s 4.6%
-
-    Theano fct summary:
-    <% total fct time> <total time> <time per call> <nb call> <fct name>
-     100.0% 30.877s 3.09e-03s 10000 train
-      0.0% 0.000s 4.06e-04s 1 predict
-
-    Single Op-wise summary:
-    <% of local_time spent on this kind of Op> <cumulative %>
-        <self seconds> <cumulative seconds> <time per call> <nb_call>
-        <nb_op> <nb_apply> <Op name>
-       87.3%   87.3%  25.672s  25.672s  2.57e-03s   10000  1  1 <Gemv>
-        9.7% s  97.0%  2.843s  28.515s  2.84e-04s   10001  1  2 <Dot>
-        2.4%   99.3%  0.691s  29.206s  7.68e-06s * 90001 10 10 <Elemwise>
-        0.4%   99.7%  0.127s  29.334s  1.27e-05s   10000  1  1 <Alloc>
-        0.2%   99.9%  0.053s  29.386s  1.75e-06s * 30001  2  4 <DimShuffle>
-        0.0%  100.0%  0.014s  29.400s  1.40e-06s * 10000  1  1 <Sum>
-        0.0%  100.0%  0.011s  29.411s  1.10e-06s * 10000  1  1 <Shape_i>
-    (*) Op is running a c implementation
-
-    Op-wise summary:
-    <% of local_time spent on this kind of Op> <cumulative %>
-        <self seconds> <cumulative seconds> <time per call>
-        <nb_call> <nb apply> <Op name>
-       87.3%   87.3%  25.672s  25.672s  2.57e-03s   10000  1 Gemv{inplace}
-        9.7%   97.0%  2.843s  28.515s  2.84e-04s   10001  2 dot
-        1.3%   98.2%  0.378s  28.893s  3.78e-05s * 10000  1 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}
-        0.4%   98.7%  0.127s  29.021s  1.27e-05s   10000  1 Alloc
-        0.3%   99.0%  0.092s  29.112s  9.16e-06s * 10000  1 Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)]
-        0.1%   99.3%  0.033s  29.265s  1.66e-06s * 20001  3 InplaceDimShuffle{x}
-       ... (remaining 11 Apply account for 0.7%(0.00s) of the runtime)
-    (*) Op is running a c implementation
-
-    Apply-wise summary:
-    <% of local_time spent at this position> <cumulative %%>
-        <apply time> <cumulative seconds> <time per call>
-        <nb_call> <Apply position> <Apply Op name>
-       87.3%   87.3%  25.672s  25.672s 2.57e-03s  10000  15 Gemv{inplace}(w, TensorConstant{-0.01}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, TensorConstant{0.9998})
-        9.7%   97.0%  2.843s  28.515s 2.84e-04s  10000   1 dot(x, w)
-        1.3%   98.2%  0.378s  28.893s 3.78e-05s  10000   9 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}(y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0)
-        0.4%   98.7%  0.127s  29.020s 1.27e-05s  10000  10 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
-        0.3%   99.0%  0.092s  29.112s 9.16e-06s  10000  13 Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0,0)](Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)].0, Alloc.0, y, Elemwise{Composite{neg,sub}}[(0,0)].0, Elemwise{sub,no_inplace}.0, InplaceDimShuffle{x}.0)
-        0.3%   99.3%  0.080s  29.192s 7.99e-06s  10000  11 Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)](Elemwise{neg,no_inplace}.0)
-       ... (remaining 14 Apply instances account for
-           0.7%(0.00s) of the runtime)
-
-    Profile of Theano functions memory:
-    (This check only the output of each apply node. It don't check the temporary memory used by the op in the apply node.)
-    Theano fct: train
-        Max without gc, inplace and view (KB) 2481
-        Max FAST_RUN_NO_GC (KB) 16
-        Max FAST_RUN (KB) 16
-        Memory saved by view (KB) 2450
-        Memory saved by inplace (KB) 15
-        Memory saved by GC (KB) 0
-        <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)>
-            <created/inplace/view> <Apply node>
-        <created/inplace/view> is taked from the op declaration, not ...
-             2508800B  [2508800] v InplaceDimShuffle{1,0}(x)
-                6272B  [6272] i Gemv{inplace}(w, ...)
-                3200B  [3200] c Elemwise{Composite{...}}(y, ...)
-
-    Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list).
-    Test them first, as they are not guaranteed to always provide a speedup.
-      - Try the Theano flag floatX=float32
-    """
-
 Exercise 5
 -----------

@@ -363,6 +211,6 @@ Known limitations

 - Compilation time superlinear in the size of the graph.

-  - A few hundreds nodes is fine
+  - Hundreds of nodes is fine
  - Disabling a few optimizations can speed up compilation
  - Usually too many nodes indicates a problem with the graph
--- a/doc/crei2013/gpundarray.txt
+++ b/doc/crei2013/gpundarray.txt
+
+.. _crei2013_gpundarray:
+
+**********
+GpuNdArray
+**********
+
+Why a common GPU ndarray?
+
+- Currently there are at least 4 different GPU array data structures in use by Python packages
+
+  - CudaNdarray (Theano), GPUArray (PyCUDA), CUDAMatrix (cudamat), GPUArray (PyOpenCL), ...
+  - There are even more if we include other languages
+
+- All of them implement a subset of the functionality of ``numpy.ndarray`` on the GPU
+- Lots of duplicated effort
+
+  - GPU code is harder/slower to do **correctly** and **fast** than on the CPU/Python
+
+- Lack of a common array API makes it harder to port/reuse code
+- Also harder to find/distribute code
+- Divides development work
+
+
+Design Goals
+
+- Make it VERY similar to ``numpy.ndarray``
+- Be compatible with both CUDA and OpenCL
+- Have the base object accessible from C to allow collaboration with more projects, across high-level languages
+
+  - We want people from C, C++, Ruby, R, ... to all use the same base GPU N-dimensional array
+
+
+Final GpuNdArray Note
+
+- Under development
+- Will be the next GPU array container for Theano (this summer!)
+- Probably also for PyCUDA, PyOpenCL
+- Mailing list: http://lists.tiker.net/listinfo/gpundarray
+
--- a/doc/crei2013/ifelse_switch.py
+++ b/doc/crei2013/ifelse_switch.py
+import time
+
+import numpy
+
+import theano
+from theano import tensor as tt
+from theano.ifelse import ifelse
+
+a, b = tt.scalars('a', 'b')
+x, y = tt.matrices('x', 'y')
+
+z_switch = tt.switch(tt.lt(a, b), tt.mean(x), tt.mean(y))
+z_lazy = ifelse(tt.lt(a, b), tt.mean(x), tt.mean(y))
+
+f_switch = theano.function([a, b, x, y], z_switch)
+f_lazyifelse = theano.function([a, b, x, y], z_lazy)
+
+val1 = 0.
+val2 = 1.
+big_mat1 = numpy.ones((10000, 1000))
+big_mat2 = numpy.ones((10000, 1000))
+
+n_times = 10
+
+tic = time.clock()
+for i in xrange(n_times):
+    f_switch(val1, val2, big_mat1, big_mat2)
+print 'time spent evaluating both values %f sec' % (time.clock() - tic)
+
+tic = time.clock()
+for i in xrange(n_times):
+    f_lazyifelse(val1, val2, big_mat1, big_mat2)
+print 'time spent evaluating one value %f sec' % (time.clock() - tic)
\ No newline at end of file
--- a/doc/crei2013/index.txt
+++ b/doc/crei2013/index.txt
@@ -68,6 +68,4 @@ from gurus on hand if you get stuck.
    theano
    advanced_theano
    /tutorial/extending_theano
-    pyCUDA
    gpundarray
-
--- a/doc/crei2013/introduction.txt
+++ b/doc/crei2013/introduction.txt

-.. _cifarSS2011_Introduction:
+.. _crei2013_Introduction:


 ************

--- a/doc/crei2013/logreg.py
+++ b/doc/crei2013/logreg.py
+import numpy
+import theano
+import theano.tensor as tt
+rng = numpy.random
+
+N = 400
+feats = 784
+D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2))
+training_steps = 10000
+
+# Declare Theano symbolic variables
+x = tt.matrix("x")
+y = tt.vector("y")
+w = theano.shared(rng.randn(feats), name="w")
+b = theano.shared(0., name="b")
+print "Initial model:"
+print w.get_value(), b.get_value()
+
+# Construct Theano expression graph
+p_1 = 1 / (1 + tt.exp(-tt.dot(x, w) - b))   # Probability that target = 1
+prediction = p_1 > 0.5                      # The prediction thresholded
+xent = -y * tt.log(p_1) - (1 - y) * tt.log(1 - p_1)  # Cross-entropy loss
+cost = xent.mean() + 0.01 * (w ** 2).sum()  # The cost to minimize
+gw, gb = tt.grad(cost, [w, b])
+
+# Compile
+train = theano.function(
+    inputs=[x, y],
+    outputs=[prediction, xent],
+    updates=[(w, w - 0.1 * gw),
+             (b, b - 0.1 * gb)],
+    name='train')
+
+predict = theano.function(inputs=[x], outputs=prediction,
+                          name='predict')
+
+# Train
+for i in range(training_steps):
+    pred, err = train(D[0], D[1])
+
+print "Final model:"
+print w.get_value(), b.get_value()
+print "target values for D:", D[1]
+print "prediction on D:", predict(D[0])
--- a/doc/crei2013/logreg_profile.txt
+++ b/doc/crei2013/logreg_profile.txt
+Function profiling
+==================
+  Message: train
+  Time in 10000 calls to Function.__call__: 7.171231e+00s
+  Time in Function.fn.__call__: 6.686692e+00s (93.243%)
+  Time in thunks: 6.511275e+00s (90.797%)
+  Total compile time: 6.550491e-01s
+    Theano Optimizer time: 5.976810e-01s
+       Theano validate time: 1.260662e-02s
+    Theano Linker time (includes C, CUDA code generation/compiling): 2.649593e-02s
+
+Class
+---
+<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
+  87.0%    87.0%       5.665s       2.83e-04s     C     20000        2   <class 'theano.tensor.blas_c.CGemv'>
+  11.5%    98.4%       0.746s       7.46e-06s     C     100000       10   <class 'theano.tensor.elemwise.Elemwise'>
+   0.7%    99.1%       0.045s       2.27e-06s     C     20000        2   <class 'theano.tensor.basic.Alloc'>
+   0.5%    99.6%       0.030s       1.01e-06s     C     30000        3   <class 'theano.tensor.elemwise.DimShuffle'>
+   0.2%    99.8%       0.013s       1.34e-06s     C     10000        1   <class 'theano.tensor.elemwise.Sum'>
+   0.2%   100.0%       0.012s       6.00e-07s     C     20000        2   <class 'theano.tensor.opt.Shape_i'>
+   ... (remaining 0 Classes account for   0.00%(0.00s) of the runtime)
+
+Ops
+---
+<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
+  87.0%    87.0%       5.665s       2.83e-04s     C     20000        2   CGemv{inplace}
+   6.9%    93.9%       0.452s       4.52e-05s     C     10000        1   Elemwise{Composite{[Composite{[Composite{[sub(mul(i0, i1), neg(i2))]}(
+   1.8%    95.7%       0.116s       1.16e-05s     C     10000        1   Elemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i
+   1.7%    97.4%       0.109s       1.09e-05s     C     10000        1   Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 
+   0.7%    98.1%       0.045s       2.27e-06s     C     20000        2   Alloc
+   0.3%    98.4%       0.020s       1.02e-06s     C     20000        2   InplaceDimShuffle{x}
+   0.2%    98.6%       0.015s       1.50e-06s     C     10000        1   Elemwise{sub,no_inplace}
+   0.2%    98.8%       0.014s       1.42e-06s     C     10000        1   Elemwise{gt,no_inplace}
+   0.2%    99.1%       0.013s       1.34e-06s     C     10000        1   Sum
+   0.2%    99.3%       0.013s       1.29e-06s     C     10000        1   Elemwise{neg,no_inplace}
+   0.2%    99.4%       0.012s       6.00e-07s     C     20000        2   Shape_i{0}
+   0.2%    99.6%       0.010s       9.84e-07s     C     10000        1   InplaceDimShuffle{1,0}
+   0.1%    99.7%       0.010s       9.58e-07s     C     10000        1   Elemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)]
+   0.1%    99.8%       0.007s       6.95e-07s     C     10000        1   Elemwise{Cast{float64}}
+   0.1%    99.9%       0.005s       5.46e-07s     C     10000        1   Elemwise{inv,no_inplace}
+   0.1%   100.0%       0.005s       4.88e-07s     C     10000        1   Elemwise{Composite{[sub(i0, mul(i1, i2))]}}[(0, 0)]
+   ... (remaining 0 Ops account for   0.00%(0.00s) of the runtime)
+
+Apply
+------
+<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
+  51.0%    51.0%       3.319s       3.32e-04s   10000     7 CGemv{inplace}(Alloc.0, TensorConstant{1.0}, x, w, TensorConstant{0.0})
+  36.0%    87.0%       2.345s       2.35e-04s   10000    18 CGemv{inplace}(w, TensorConstant{-0.1}, x.T, Elemwise{Composite{[Composite{[Compo
+   6.9%    93.9%       0.452s       4.52e-05s   10000    13 Elemwise{Composite{[Composite{[Composite{[sub(mul(i0, i1), neg(i2))]}(i0, scalar_
+   1.8%    95.7%       0.116s       1.16e-05s   10000    16 Elemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i2))]}(i0, n
+   1.7%    97.4%       0.109s       1.09e-05s   10000    14 Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 0)](Elemwis
+   0.5%    97.9%       0.031s       3.13e-06s   10000    12 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
+   0.2%    98.1%       0.015s       1.50e-06s   10000     4 Elemwise{sub,no_inplace}(TensorConstant{(1,) of 1.0}, y)
+   0.2%    98.3%       0.014s       1.42e-06s   10000    15 Elemwise{gt,no_inplace}(Elemwise{ScalarSigmoid{output_types_preference=transfer_t
+   0.2%    98.5%       0.014s       1.40e-06s   10000     5 Alloc(TensorConstant{0.0}, Shape_i{0}.0)
+   0.2%    98.7%       0.013s       1.34e-06s   10000    17 Sum(Elemwise{Composite{[Composite{[Composite{[Composite{[mul(i0, add(i1, i2))]}(i
+   0.2%    98.9%       0.013s       1.33e-06s   10000     0 InplaceDimShuffle{x}(b)
+   0.2%    99.1%       0.013s       1.29e-06s   10000    11 Elemwise{neg,no_inplace}(Elemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)].0)
+   0.2%    99.3%       0.010s       9.84e-07s   10000     2 InplaceDimShuffle{1,0}(x)
+   0.1%    99.4%       0.010s       9.58e-07s   10000     9 Elemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)](CGemv{inplace}.0, InplaceDimShuff
+   0.1%    99.6%       0.007s       7.11e-07s   10000     6 InplaceDimShuffle{x}(Shape_i{0}.0)
+   0.1%    99.7%       0.007s       6.95e-07s   10000     8 Elemwise{Cast{float64}}(InplaceDimShuffle{x}.0)
+   0.1%    99.8%       0.006s       6.18e-07s   10000     1 Shape_i{0}(x)
+   0.1%    99.8%       0.006s       5.82e-07s   10000     3 Shape_i{0}(y)
+   0.1%    99.9%       0.005s       5.46e-07s   10000    10 Elemwise{inv,no_inplace}(Elemwise{Cast{float64}}.0)
+   0.1%   100.0%       0.005s       4.88e-07s   10000    19 Elemwise{Composite{[sub(i0, mul(i1, i2))]}}[(0, 0)](b, TensorConstant{0.1}, Sum.0
+   ... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
+
+Function profiling
+==================
+  Message: predict
+  Time in 1 calls to Function.__call__: 4.870892e-04s
+  Time in Function.fn.__call__: 4.608631e-04s (94.616%)
+  Time in thunks: 4.491806e-04s (92.217%)
+  Total compile time: 7.993293e-02s
+    Theano Optimizer time: 7.383800e-02s
+       Theano validate time: 2.010584e-03s
+    Theano Linker time (includes C, CUDA code generation/compiling): 4.319906e-03s
+
+Class
+---
+<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
+  94.2%    94.2%       0.000s       4.23e-04s     C        1        1   <class 'theano.tensor.blas_c.CGemv'>
+   4.0%    98.2%       0.000s       1.81e-05s     C        1        1   <class 'theano.tensor.elemwise.Elemwise'>
+   0.7%    98.9%       0.000s       3.10e-06s     C        1        1   <class 'theano.tensor.basic.Alloc'>
+   0.6%    99.5%       0.000s       2.86e-06s     C        1        1   <class 'theano.tensor.elemwise.DimShuffle'>
+   0.5%   100.0%       0.000s       2.15e-06s     C        1        1   <class 'theano.tensor.opt.Shape_i'>
+   ... (remaining 0 Classes account for   0.00%(0.00s) of the runtime)
+
+Ops
+---
+<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
+  94.2%    94.2%       0.000s       4.23e-04s     C        1        1   CGemv{inplace}
+   4.0%    98.2%       0.000s       1.81e-05s     C        1        1   Elemwise{Composite{[Composite{[Composite{[Composite{[GT(scalar_sigmoid
+   0.7%    98.9%       0.000s       3.10e-06s     C        1        1   Alloc
+   0.6%    99.5%       0.000s       2.86e-06s     C        1        1   InplaceDimShuffle{x}
+   0.5%   100.0%       0.000s       2.15e-06s     C        1        1   Shape_i{0}
+   ... (remaining 0 Ops account for   0.00%(0.00s) of the runtime)
+
+Apply
+------
+<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
+  94.2%    94.2%       0.000s       4.23e-04s      1     3 CGemv{inplace}(Alloc.0, TensorConstant{1.0}, x, w, TensorConstant{0.0})
+   4.0%    98.2%       0.000s       1.81e-05s      1     4 Elemwise{Composite{[Composite{[Composite{[Composite{[GT(scalar_sigmoid(i0), i1)]}
+   0.7%    98.9%       0.000s       3.10e-06s      1     2 Alloc(TensorConstant{0.0}, Shape_i{0}.0)
+   0.6%    99.5%       0.000s       2.86e-06s      1     0 InplaceDimShuffle{x}(b)
+   0.5%   100.0%       0.000s       2.15e-06s      1     1 Shape_i{0}(x)
+   ... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
+
+Function profiling
+==================
+  Message: Sum of all printed profiles at exit
+  Time in 10001 calls to Function.__call__: 7.171718e+00s
+  Time in Function.fn.__call__: 6.687153e+00s (93.243%)
+  Time in thunks: 6.511724e+00s (90.797%)
+  Total compile time: 7.349820e-01s
+    Theano Optimizer time: 6.715190e-01s
+       Theano validate time: 1.461720e-02s
+    Theano Linker time (includes C, CUDA code generation/compiling): 3.081584e-02s
+
+  [...]
--- a/doc/crei2013/scan_poly.py
+++ b/doc/crei2013/scan_poly.py
+import numpy
+
+import theano
+import theano.tensor as tt
+
+coefficients = theano.tensor.vector("coefficients")
+x = tt.scalar("x")
+max_coefficients_supported = 10000
+
+# Generate the components of the polynomial
+full_range = theano.tensor.arange(max_coefficients_supported)
+components, updates = theano.scan(fn=lambda coeff, power, free_var:
+                                  coeff * (free_var ** power),
+                                  outputs_info=None,
+                                  sequences=[coefficients, full_range],
+                                  non_sequences=x)
+polynomial = components.sum()
+calculate_polynomial = theano.function(inputs=[coefficients, x],
+                                       outputs=polynomial)
+
+test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
+print calculate_polynomial(test_coeff, 3)
+# 19.0
--- a/doc/crei2013/scan_pow.py
+++ b/doc/crei2013/scan_pow.py
+import theano
+import theano.tensor as tt
+
+k = tt.iscalar("k")
+A = tt.vector("A")
+
+
+def inner_fct(prior_result, A):
+    return prior_result * A
+# Symbolic description of the result
+result, updates = theano.scan(fn=inner_fct,
+                              outputs_info=tt.ones_like(A),
+                              non_sequences=A, n_steps=k)
+
+# Scan has provided us with A**1 through A**k.  Keep only the last
+# value. Scan notices this and does not waste memory saving them.
+final_result = result[-1]
+
+power = theano.function(inputs=[A, k],
+                        outputs=final_result,
+                        updates=updates)
+
+print power(range(10), 2)
+#[  0.   1.   4.   9.  16.  25.  36.  49.  64.  81.]
--- a/doc/crei2013/theano.txt
+++ b/doc/crei2013/theano.txt

-.. _theano:
+.. _crei2013_theano:

 ******
 Theano
@@ -92,49 +92,7 @@ Real example
 * Speed optimizations
 * Stability optimizations

-.. code-block:: python
-
-  import numpy
-  import theano
-  import theano.tensor as tt
-  rng = numpy.random
-  
-  N = 400
-  feats = 784
-  D = (rng.randn(N, feats), rng.randint(size=N,low=0, high=2))
-  training_steps = 10000
-  
-  # Declare Theano symbolic variables
-  x = tt.matrix("x")
-  y = tt.vector("y")
-  w = theano.shared(rng.randn(feats), name="w")
-  b = theano.shared(0., name="b")
-  print "Initial model:"
-  print w.get_value(), b.get_value()
-
-  # Construct Theano expression graph
-  p_1 = 1 / (1 + tt.exp(-tt.dot(x, w) - b))   # Probability that target = 1
-  prediction = p_1 > 0.5                    # The prediction thresholded
-  xent = -y*tt.log(p_1) - (1-y)*tt.log(1-p_1) # Cross-entropy loss function
-  cost = xent.mean() + 0.01 * (w**2).sum()  # The cost to minimize
-  gw,gb = tt.grad(cost, [w, b])
-
-  # Compile
-  train = theano.function(
-            inputs=[x,y],
-            outputs=[prediction, xent],
-            updates={w: w - 0.1 * gw,
-                     b: b - 0.1 * gb})
-  predict = theano.function(inputs=[x], outputs=prediction)
-
-  # Train
-  for i in range(training_steps):
-      pred, err = train(D[0], D[1])
-
-  print "Final model:"
-  print w.get_value(), b.get_value()
-  print "target values for D:", D[1]
-  print "prediction on D:", predict(D[0])
+.. literalinclude:: logreg.py


 **Optimizations:**
@@ -159,15 +117,15 @@ Where are those optimization applied?
  xent = -y * tt.log(p_1) - (1 - y) * tt.log(1 - p_1)
  # Log(1-sigmoid(var)) -> -sigmoid(var)
  prediction = p_1 > 0.5
-  cost = xent.mean() + 0.01 * (w**2).sum()
+  cost = xent.mean() + 0.01 * (w ** 2).sum()
  gw,gb = tt.grad(cost, [w, b])

  train = theano.function(
-            inputs=[x,y],
+            inputs=[x, y],
            outputs=[prediction, xent],
            # w - 0.1 * gw: GEMV with the dot in the grad
-            updates={w: w - 0.1 * gw,
-                     b: b - 0.1 * gb})
+            updates=[(w, w - 0.1 * gw),
+                     (b, b - 0.1 * gb)])


 Theano flags
@@ -350,6 +308,7 @@ Differentiation details


 TODO: update the benchmark
+
 Benchmarks
 ----------