提交 2ac55778 authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged -- no conflict

......@@ -6,7 +6,7 @@ rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats).astype(theano.config.floatX), rng.randint(size=N,low=0, high=2).astype(theano.config.floatX))
training_steps = 10
training_steps = 10000
# Declare Theano symbolic variables
x = T.matrix("x")
......@@ -15,8 +15,8 @@ w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w")
b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
print "Initial model:"
print w.get_value(), b.get_value()
#print "Initial model:"
#print w.get_value(), b.get_value()
# Construct Theano expression graph
......@@ -30,15 +30,25 @@ gw,gb = T.grad(cost, [w,b])
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
updates={w:w-0.1*gw, b:b-0.1*gb},
updates={w:w-0.01*gw, b:b-0.01*gb},
name = "train")
predict = theano.function(inputs=[x], outputs=prediction,
name = "predict")
if any( [x.op.__class__.__name__=='Gemv' for x in train.maker.env.toposort()]):
print 'Used the cpu'
elif any( [x.op.__class__.__name__=='GpuGemm' for x in train.maker.env.toposort()]):
print 'Used the gpu'
else:
print 'ERROR, not able to tell if theano used the cpu or the gpu'
print train.maker.env.toposort()
for i in range(training_steps):
pred, err = train(D[0], D[1])
print "Final model:"
print w.get_value(), b.get_value()
#print "Final model:"
#print w.get_value(), b.get_value()
print "target values for D"
print D[1]
......
......@@ -171,10 +171,10 @@ HPCS 2011, Montr\'eal
\item Real example
% More info on T.grad
% Where are the optimization in the example?
% Exercises 2
% Exercises 2: logreg\_example.py
\item Theano Flags
\item GPU
% Exercises 3
% Exercises 3: logreg\_example.py on the gpu
\item Symbolic Variables
\item Differentiation Details
\item Benchmarks % MLP, Convolution, Elemwise
......@@ -193,10 +193,11 @@ HPCS 2011, Montr\'eal
\item Compilation Pipeline
\item Inplace Optimization
\item Profiling
%exercises 4
%exercises 4: ProfileMode on logreg\_example, CPU vs GPU
\item Drawing/Printing Theano Graph
\item Debugging
\item Scan (For-Loop generalization)
%exercises 5: about scan
\item Known Limitations
\end{itemize} %& \includegraphics[width=1.in]{pics/theano_logo.png}
\begin{tabular}{lcr}
......@@ -213,7 +214,7 @@ HPCS 2011, Montr\'eal
\begin{itemize}
\item Introduction
\item Example
% PyCUDA Exercices
% Exercises 6: pycuda_simple.py
\end{itemize}
\item CUDA Overview
\item Extending Theano
......@@ -221,8 +222,9 @@ HPCS 2011, Montr\'eal
\item Theano Graph
\item Op Contract
\item Op Example
% Exercises 7: double.py
\item Theano + PyCUDA
% Theano+PyCUDA Exercises
% Exercises 8: pycuda_double_op.py
\end{itemize}
\item GpuNdArray
\item Conclusion
......@@ -522,7 +524,7 @@ rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats), rng.randint(size=N,low=0, high=2))
training_steps = 10
training_steps = 10000
\end{Verbatim}
\end{frame}
......@@ -657,7 +659,7 @@ Theano can be configured with flags. They can be defined in two ways
python logreg_example.py
\end{Verbatim}
\vfill
Modify and execute the example to run on CPU with floatX=float32
Modify and execute the example in the file logreg\_example.py to run on CPU with floatX=float32
* You will need to use: theano.config.floatX and ndarray.astype("str")
\end{frame}
......@@ -715,7 +717,6 @@ Computers in the class
\begin{itemize}
\item Modify and execute the code to run with floatX=float32 on GPU
\item Run the code on the GPU
\item Time with: \texttt{time python file.py}
\end{itemize}
\end{frame}
......@@ -752,7 +753,7 @@ Computers in the class
\end{itemize}
\vfill
\begin{itemize}
\item Broadcastability must be specified when creating the variable.
\item Broadcastability must be specified when creating the variable
\item The only shortcuts with broadcastable dimensions are: {\bf T.row} and {\bf T.col}
\item For all others: T.tensor(dtype, broadcastable={\bf ([False or True])*nd})
\end{itemize}
......@@ -849,16 +850,16 @@ To replace the default mode with this mode, use the Theano flags \texttt{mode=Pr
To enable the memory profiling use the flags \texttt{ProfileMode.profile\_memory=True}
\begin{Verbatim}
Time since import 2.697s
Theano compile time: 1.046s (38.8% since import)
Optimization time: 0.804s
Linker time: 0.230s
Theano fct call 0.028s (1.0% since import)
Theano Op time 0.026s 1.0%(since import) 93.7%(of fct call)
Theano function overhead in ProfileMode 0.002s 0.1%(since import)
6.3%(of fct call)
11 Theano fct call, 0.003s per call
Rest of the time since import 1.623s 60.2%
Time since import 33.456s
Theano compile time: 1.023s (3.1% since import)
Optimization time: 0.789s
Linker time: 0.221s
Theano fct call 30.878s (92.3% since import)
Theano Op time 29.411s 87.9%(since import) 95.3%(of fct call)
Theano function overhead in ProfileMode 1.466s 4.4%(since import)
4.7%(of fct call)
10001 Theano fct call, 0.003s per call
Rest of the time since import 1.555s 4.6%
\end{Verbatim}
\end{frame}
......@@ -869,8 +870,8 @@ Theano outputs:
\begin{Verbatim}
Theano fct summary:
<% total fct time> <total time> <time per call> <nb call> <fct name>
97.2% 0.027s 2.70e-03s 10 train
2.8% 0.001s 7.84e-04s 1 predict
100.0% 30.877s 3.09e-03s 10000 train
0.0% 0.000s 4.06e-04s 1 predict
\end{Verbatim}
\end{frame}
......@@ -883,13 +884,13 @@ Single Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call> <nb_call>
<nb_op> <nb_apply> <Op name>
82.0% 82.0% 0.021s 0.021s 2.13e-03s 10 1 1 <Gemv>
14.1% 96.1% 0.004s 0.025s 3.33e-04s 11 1 2 <Dot>
2.9% 98.9% 0.001s 0.026s 8.24e-06s * 91 10 10 <Elemwise>
0.6% 99.6% 0.000s 0.026s 1.69e-05s 10 1 1 <Alloc>
0.3% 99.9% 0.000s 0.026s 2.43e-06s * 31 2 4 <DimShuffle>
0.1% 100.0% 0.000s 0.026s 1.91e-06s * 10 1 1 <Sum>
0.0% 100.0% 0.000s 0.026s 1.19e-06s * 10 1 1 <Shape_i>
87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 1 1 <Gemv>
9.7% 97.0% 2.843s 28.515s 2.84e-04s 10001 1 2 <Dot>
2.4% 99.3% 0.691s 29.206s 7.68e-06s * 90001 10 10 <Elemwise>
0.4% 99.7% 0.127s 29.334s 1.27e-05s 10000 1 1 <Alloc>
0.2% 99.9% 0.053s 29.386s 1.75e-06s * 30001 2 4 <DimShuffle>
0.0% 100.0% 0.014s 29.400s 1.40e-06s * 10000 1 1 <Sum>
0.0% 100.0% 0.011s 29.411s 1.10e-06s * 10000 1 1 <Shape_i>
(*) Op is running a c implementation
\end{Verbatim}
\end{frame}
......@@ -903,15 +904,15 @@ Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call>
<nb_call> <nb apply> <Op name>
82.0% 82.0% 0.021s 0.021s 2.13e-03s 10 1 Gemv{inplace}
14.1% 96.1% 0.004s 0.025s 3.33e-04s 11 2 dot
1.4% 97.5% 0.000s 0.025s 3.63e-05s * 10 1 Elemwise{Composite{
87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 1 Gemv{inplace}
9.7% 97.0% 2.843s 28.515s 2.84e-04s 10001 2 dot
1.3% 98.2% 0.378s 28.893s 3.78e-05s * 10000 1 Elemwise{Composite{
scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}
0.6% 98.1% 0.000s 0.026s 1.69e-05s 10 1 Alloc
0.4% 98.5% 0.000s 0.026s 1.02e-05s * 10 1 Elemwise{Composite{
0.4% 98.7% 0.127s 29.021s 1.27e-05s 10000 1 Alloc
0.3% 99.0% 0.092s 29.112s 9.16e-06s * 10000 1 Elemwise{Composite{
exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)]
0.2% 99.0% 0.000s 0.026s 2.40e-06s * 21 3 InplaceDimShuffle{x}
... (remaining 11 Apply account for 1.3%(0.00s) of the runtime)
0.1% 99.3% 0.033s 29.265s 1.66e-06s * 20001 3 InplaceDimShuffle{x}
... (remaining 11 Apply account for 0.7%(0.00s) of the runtime)
(*) Op is running a c implementation
\end{Verbatim}
\end{frame}
......@@ -925,15 +926,15 @@ Apply-wise summary:
<% of local_time spent at this position> <cumulative %%>
<apply time> <cumulative seconds> <time per call>
<nb_call> <Apply position> <Apply Op name>
82.0% 82.0% 0.021s 0.021s 2.13e-03s 10 15 Gemv{inplace}(
w, TensorConstant{-0.1}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, TensorConstant{0.998})
11.5% 93.4% 0.003s 0.024s 2.99e-04s 10 1 dot(x, w)
2.6% 96.1% 0.001s 0.025s 6.81e-04s 1 1 dot(x, w)
1.4% 97.5% 0.000s 0.025s 3.63e-05s 10 9 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}(y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0)
0.6% 98.1% 0.000s 0.026s 1.69e-05s 10 10 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
0.4% 98.5% 0.000s 0.026s 1.02e-05s 10 13 Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)](Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)].0, Alloc.0, y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, InplaceDimShuffle{x}.0)
87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 15 Gemv{inplace}(
w, TensorConstant{-0.01}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, TensorConstant{0.9998})
9.7% 97.0% 2.843s 28.515s 2.84e-04s 10000 1 dot(x, w)
1.3% 98.2% 0.378s 28.893s 3.78e-05s 10000 9 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}(y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0)
0.4% 98.7% 0.127s 29.020s 1.27e-05s 10000 10 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
0.3% 99.0% 0.092s 29.112s 9.16e-06s 10000 13 Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)](Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)].0, Alloc.0, y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, InplaceDimShuffle{x}.0)
0.3% 99.3% 0.080s 29.192s 7.99e-06s 10000 11 Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)](Elemwise{neg,no_inplace}.0)
... (remaining 14 Apply instances account for
1.5%(0.00s) of the runtime)
0.7%(0.00s) of the runtime)
\end{Verbatim}
\end{frame}
......@@ -979,6 +980,7 @@ Test them first, as they are not guaranteed to always provide a speedup.
\begin{itemize}
\item In the last exercises, do you see a speed up with the GPU?
\item Where does it come from? (Use ProfileMode)
\item Is there something we can do to speed up the GPU version?
\end{itemize}
\end{frame}
......@@ -1167,7 +1169,8 @@ print calculate_polynomial(test_coeff, 3)
\frame{
\frametitle{Exercises 5}
\begin{itemize}
\item Scan: modify the polynomial example to have the reduction done by scan
\item Run the example in the file scan\_pow.py and scan\_poly.py
\item Modify and execute the polynomial example to have the reduction done by scan
\end{itemize}
}
......@@ -1335,9 +1338,9 @@ multiply_them(
}
\begin{frame}
\frametitle{PyCUDA Exercises}
\frametitle{Exercises 6}
\begin{itemize}
\item Run the example
\item Run the example in the file pycuda\_simple.py
\item Modify and execute it to work for a matrix of 20 $\times$ 10
\end{itemize}
\end{frame}
......@@ -1429,6 +1432,18 @@ print out
\end{Verbatim}
\end{frame}
\begin{frame}
\frametitle{Exercises 7}
\begin{itemize}
\item Run the code in the file double\_op.py.
\item Modify and execute to compute: $x * y$
\item Modify and execute the example to return 2 outputs: $x + y$ and $x - y$
\begin{itemize}
\item Our current elemwise fusion generates computations with only 1 output
\end{itemize}
\end{itemize}
\end{frame}
\subsection{Theano+PyCUDA}
\begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example}
......@@ -1501,8 +1516,9 @@ print numpy.asarray(f(xv))
\end{frame}
\begin{frame}
\frametitle{Theano + PyCUDA Exercises}
\frametitle{Exercises 8}
\begin{itemize}
\item Run the example in the file pycuda\_double\_op.py
\item Modify and execute the example to multiply two matrices: $x * y$
\item Modify and execute the example to return 2 outputs: $x + y$ and $x - y$
\begin{itemize}
......
# pycuda_simple.py -- minimal PyCUDA example: elementwise multiply of two
# vectors on the GPU, then check the result against NumPy on the host.
# NOTE: Python 2 tutorial code (print statement); requires a CUDA-capable GPU.
import pycuda.autoinit
import pycuda.driver as drv
import numpy
from pycuda.compiler import SourceModule
# Compile a CUDA kernel from source at runtime. Each thread handles one
# element, indexed by its position within the block.
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
# Retrieve a Python-callable handle to the compiled kernel.
multiply_them = mod.get_function("multiply_them")
# Host-side inputs; float32 matches the kernel's `float` parameters.
a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)
# drv.In/drv.Out copy the arrays to/from the device around the launch.
# One block of 400 threads covers the 400 elements (one thread per element).
multiply_them(
drv.Out(dest), drv.In(a), drv.In(b),
block=(400,1,1), grid=(1,1))
# Verify the GPU result matches the host computation.
assert numpy.allclose(dest, a*b)
print dest
# scan_poly.py -- evaluate a polynomial with theano.scan: given coefficient
# vector c and scalar x, compute sum_i c[i] * x**i.
# NOTE: Python 2 tutorial code (print statement).
import numpy
import theano
import theano.tensor as T
# Symbolic inputs: the coefficient vector and the free variable x.
coefficients = theano.tensor.vector("coefficients")
x = T.scalar("x"); max_coefficients_supported = 10000
# Generate the components of the polynomial
# full_range supplies the exponent for each coefficient; scan stops at the
# shorter sequence (coefficients), so the large upper bound is harmless.
full_range=theano.tensor.arange(max_coefficients_supported)
# One scan step per coefficient: compute coeff * x**power. outputs_info=None
# means this is a map (no recurrence); x is passed unchanged each step.
components, updates = theano.scan(fn=lambda coeff, power, free_var:
coeff * (free_var ** power),
outputs_info=None,
sequences=[coefficients, full_range],
non_sequences=x)
# Reduce the per-term results to the polynomial's value.
polynomial = components.sum()
calculate_polynomial = theano.function(inputs=[coefficients, x],
outputs=polynomial)
# 1 + 0*x + 2*x**2 at x=3 -> 1 + 18 = 19.
test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
print calculate_polynomial(test_coeff, 3)
# 19.0
# scan_pow.py -- compute elementwise A**k with theano.scan by multiplying
# the running result by A for k steps.
# NOTE: Python 2 tutorial code (print statement).
import theano
import theano.tensor as T
# Symbolic inputs: integer exponent k and vector A.
k = T.iscalar("k"); A = T.vector("A")
# Recurrence step: multiply the previous result by A.
def inner_fct(prior_result, A): return prior_result * A
# Symbolic description of the result
# outputs_info=ones_like(A) seeds the recurrence at A**0; n_steps=k iterates
# the step k times, yielding A**1 ... A**k.
result, updates = theano.scan(fn=inner_fct,
outputs_info=T.ones_like(A),
non_sequences=A, n_steps=k)
# Scan has provided us with A**1 through A**k. Keep only the last
# value. Scan notices this and does not waste memory saving them.
final_result = result[-1]
power = theano.function(inputs=[A,k], outputs=final_result,
updates=updates)
# [0..9] squared elementwise: [0, 1, 4, ..., 81].
print power(range(10),2)
......@@ -21,7 +21,9 @@ since 2007. But it is also approachable enough to be used in the classroom
:scale: 75%
:align: left
**NEW!** You can watch a quick (20 minute) introduction to Theano given as a talk at `SciPy 2010 <http://conference.scipy.org/scipy2010/>`_ via streaming (or downloaded) video:
**NEW!** `HPCS 2011 Tutorial <http://www.iro.umontreal.ca/~lisa/pointeurs/tutorial_hpcs2011_fixed.pdf>`_. It includes a few fixes discovered while giving the tutorial.
You can watch a quick (20 minute) introduction to Theano given as a talk at `SciPy 2010 <http://conference.scipy.org/scipy2010/>`_ via streaming (or downloaded) video:
`Transparent GPU Computing With Theano`_.
James Bergstra, SciPy 2010, June 30, 2010.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论