提交 401c2a4c authored 作者: Frederic Bastien's avatar Frederic Bastien

Started to reorder the slide.

上级 50740a0c
......@@ -101,10 +101,59 @@ HPCS 2011, Montr\'eal
}
\section{Overview}
\subsection{Motivation}
\frame{
\frametitle{Theano Goal}
\begin{itemize}
\item Tries to be the {\bf holy grail} in computing: {\it easy to code} and {\it fast to execute}!
\item Only for mathematical expressions
\item So you won't have:
\begin{itemize}
\item Function call inside a theano function
\item Structure, enum
\item Dynamic types (Theano is fully typed)
\item Goto
\item ...
\item And it won't make coffee!
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Faster on CPU and GPU}
\includegraphics[width=3.in]{pics/mlp.pdf}
}
\frame{
\frametitle{Project Status}
Why you can rely on Theano:
\begin{itemize}
\item Theano has been developed and used since January 2008 (3.5 yrs old)
\item Core technology for a funded Silicon-Valley startup
\item Driven over 40 research papers in the last few years
\item Good user documentation
\item Active mailing list with participants from outside our lab
\item Many contributors (some from outside our lab)
\vfill
\item Used to teach IFT6266 for two years
\item Used by everyone in our lab (\textasciitilde 30 people)
\item Deep Learning Tutorials
\item Unofficial RPMs for Mandriva
\item Downloads (June 8 2011, since last January):
\begin{itemize}
\item Pypi 780
\item MLOSS: 483
\item Assembla (``bleeding edge'' repository): unknown
\end{itemize}
\end{itemize}
}
\subsection{Overview}
\frame{
\frametitle{Overview 1}
\begin{itemize}
\item {\bf Exercises as we go}
\item Introduction
\begin{itemize}
\item Why Scripting for GPUs?
......@@ -133,16 +182,15 @@ HPCS 2011, Montr\'eal
\item Profiling
\item Printing
\item Debugging
\item Scan (For-Loop generalization)
\item break?
\item GPU
\item Exercises/break
\item Scan (For-Loop generalization)
\end{itemize} %& \includegraphics[width=1.in]{pics/theano_logo.png}
\item PyCUDA
\begin{itemize}
\item Introduction
\item Example
\item PyCUDA + Theano
\item Exercises
\end{itemize} %& \includegraphics[width=.6in]{pics/pycuda-logo-crop.pdf}
\item GpuNdArray
\item Conclusion
......@@ -382,30 +430,6 @@ HPCS 2011, Montr\'eal
\end{itemize}
}
\frame{
\frametitle{Project Status}
Why you can rely on Theano:
\begin{itemize}
\item Theano has been developed and used since January 2008 (3.5 yrs old)
\item Core technology for a funded Silicon-Valley startup
\item Used to teach IFT6266 for two years
\item Used by everyone in our lab (\textasciitilde 30 people)
\item Driven over 40 research papers over the last few years
\item Active mailing list with participants from outside our lab
\item Good user documentation
\item Some(lots?) of users beyond our lab.
\item Many contributors (some from outside our lab)
\item Deep Learning Tutorials
\item Unofficial RPMs for Mandriva
\item Downloads (June 8 2011, since last January):
\begin{itemize}
\item Pypi 780
\item MLOSS: 483
\item Assembla (``bleeding edge'' repository): unknown
\end{itemize}
\end{itemize}
}
\newcommand\codeHighlight[1]{\textcolor[rgb]{1,0,0}{\textbf{#1}}}
\subsection{Simple Example}
......@@ -433,6 +457,38 @@ print f([0,1,2]) {\color{gray} # prints `array([0,2,1026])`}
\end{itemize}
}
\frame{
\frametitle{GPU for Exercises}
\begin{itemize}
\item Intel Core i7 980 XE (107Gf/s float64, 1050\$, 6 cores/12 threads)
\item NVIDIA C2050 (515 Gf/s float64, 1Tf/s float32, 2400\$, 480 cores), compute capability 2.0
\item NVIDIA GTX580 (1.5Tf/s float32, 500\$, 512 cores), compute capability 2.0
\end{itemize}
Computer in the class
\begin{itemize}
\item Intel Xeon X3450 (?56? flops/s, 383\$, 4 cores)
\item NVIDIA Quadro FX 580 (71GF/s single, 140\$, 32 cores), compute capability 1.1, ``professional card''
\end{itemize}
%Device 0: "Quadro FX 580"
% Total amount of global memory: 536150016 bytes
% Multiprocessors x Cores/MP = Cores: 4 (MP) x 8 (Cores/MP) = 32 (Cores)
% Clock rate: 1.12 GHz
% Run time limit on kernels: Yes
% Compute mode: Default (multiple host
%threads can use this device simultaneously)
}
\begin{frame}[fragile]
\frametitle{Exercises 1}
\begin{Verbatim}
source /groups/h/hpc2011/bin/GPU.csh
hg clone http://hg.assembla.com/theano Theano
cd Theano/doc/hpcs2011_tutorial
python simple_example.py
\end{Verbatim}
\end{frame}
\subsection{Real Example}
\frame{
\frametitle{A Real Example: Logistic Regression}
......@@ -576,6 +632,16 @@ train = theano.function(
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Exercises 2}
\begin{Verbatim}
python logreg_example.py
\end{Verbatim}
\vfill
Now modify the code to run with floatX=float32
\end{frame}
\subsection{Symbolic Variables}
\frame{
\frametitle{Creating symbolic variables}
\begin{itemize}
......@@ -614,6 +680,41 @@ train = theano.function(
\end{itemize}
}
\subsection{GPU}
\frame{
\frametitle{GPU}
\begin{itemize}
\item Only 32 bit floats are supported (being worked on)
\item Only 1 GPU per process
\item Use the Theano flag \texttt{device=gpu} to tell Theano to use the GPU device
\begin{itemize}
\item Use \texttt{device=gpu{0, 1, ...}} to specify which GPU if you have more than one
\item Shared variables with float32 dtype are by default moved to the GPU memory space
\end{itemize}
\item Use the Theano flag \texttt{floatX=float32}
\begin{itemize}
\item Be sure to use \texttt{floatX} (\texttt{theano.config.floatX}) in your code
\item Cast inputs before putting them into a shared variable
\item Cast ``problem'': int32 with float32 $\to$ float64
\begin{itemize}
\item A new casting mechanism is being developed
\item Insert manual cast in your code or use [u]int{8,16}
\item Insert manual cast around the mean operator (which involves a division by the length, which is an int64!)
\end{itemize}
\end{itemize}
\end{itemize}
}
\begin{frame}
\frametitle{Exercises 3}
\begin{itemize}
\item Now modify the code to run with floatX=float32 on the GPU
\item Run the code on the GPU
\item Time with: \texttt{time python file.py}
\end{itemize}
\end{frame}
\subsection{Benchmarks}
\frame{
\frametitle{Benchmarks}
......@@ -661,7 +762,15 @@ Convolutional Network: 256x256 images convolved with 6 7x7 filters, downsampled
}
\section{Advanced Theano}
\subsection{Miscellaneous}
\subsection{Pipeline}
\frame{
\frametitle{Compilation Pipeline}
\begin{center}
\includegraphics[width=2.7in]{pics/pipeline.pdf}
\end{center}
}
\subsection{Theano Flags}
\frame{
\frametitle{Theano Flags}
Theano can be configured with flags. They can be defined in two ways
......@@ -671,14 +780,6 @@ Theano can be configured with flags. They can be defined in two ways
\end{itemize}
}
\subsection{Pipeline}
\frame{
\frametitle{Compilation Pipeline}
\begin{center}
\includegraphics[width=2.7in]{pics/pipeline.pdf}
\end{center}
}
\subsection{Profiling}
\begin{frame}[fragile]
\frametitle{Profile Mode}
......@@ -811,6 +912,16 @@ Test them first, as they are not guaranteed to always provide a speedup.
\end{Verbatim}
\end{frame}
\begin{frame}
\frametitle{Exercises 4}
\begin{itemize}
\item In the last exercises, do you see a speed up with the GPU?
\item Where does it come from? (Use ProfileMode)
\end{itemize}
\end{frame}
\subsection{Printing}
\begin{frame}[fragile]
\frametitle{Text Printing of Your Theano Graph: Pretty Printing}
......@@ -1015,64 +1126,9 @@ print calculate_polynomial(test_coeff, 3)
\end{Verbatim}
\end{frame}
\subsection{GPU}
\frame{
\frametitle{GPU}
\begin{itemize}
\item Only 32 bit floats are supported (being worked on)
\item Only 1 GPU per process
\item Use the Theano flag \texttt{device=gpu} to tell to use the GPU device
\begin{itemize}
\item Use \texttt{device=gpu{0, 1, ...}} to specify which GPU if you have more than one
\item Shared variables with float32 dtype are by default moved to the GPU memory space
\end{itemize}
\item Use the Theano flag \texttt{floatX=float32}
\begin{itemize}
\item Be sure to use \texttt{floatX} (\texttt{theano.config.floatX}) in your code
\item Cast inputs before putting them into a shared variable
\item Cast "problem": int32 with float32 $\to$ float64
\begin{itemize}
\item A new casting mechanism is being developed
\item Insert manual cast in your code or use [u]int{8,16}
\item Insert manual cast around the mean operator (which involves a division by the length, which is an int64!)
\end{itemize}
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{GPU for Exercises}
\begin{itemize}
\item Intel Core i7 980 XE(107Gf/s float64) (1050\$)
\item NVIDIA C2050(515 Gf/s float64, 1Tf/s float32), compute capability 2.0 (2400\$) 6 cores/12 threads
\item NVIDIA GTX580(1.5Tf/s float32), compute capability 2.0 (500\$) 512 cores
\end{itemize}
Computer in the class
\begin{itemize}
\item Intel Xeon X3450(?TODO) (383\$)
\item NVIDIA Quadro FX 580(71GF/s single), compute capability 1.1 (140\$ But 'profesionnal card'), 32 cores
\end{itemize}
%Device 0: "Quadro FX 580"
% Total amount of global memory: 536150016 bytes
% Multiprocessors x Cores/MP = Cores: 4 (MP) x 8 (Cores/MP) = 32 (Cores)
% Clock rate: 1.12 GHz
% Run time limit on kernels: Yes
% Compute mode: Default (multiple host
%threads can use this device simultaneously)
}
\frame{
\frametitle{Theano Exercises}
TODO
source /groups/h/hpc2011/bin/GPU.csh
hg clone http://hg.assembla.com/theano Theano
\frametitle{Exercises 5}
\begin{itemize}
\item Run the simple example
\item Run the real example
\item Modify your version to run in float32 with \texttt{floatX}.
\item Run your version on the CPU and GPU
\item Do you see a speed up with the GPU? Where does it come from? (Try to profile it)
\item Scan: modify the polynomial example to have the reduction done by scan
\end{itemize}
}
......@@ -1146,6 +1202,13 @@ multiply_them(
\end{Verbatim}
\end{frame}
\begin{frame}
\frametitle{PyCUDA Exercises}
\begin{itemize}
\item Run the example
\item Modify it to work for a 200 $\times$ 200 matrix
\end{itemize}
\end{frame}
\frame{
\frametitle{GpuArray}
......@@ -1231,6 +1294,15 @@ print out
\end{Verbatim}
\end{frame}
\begin{frame}
\frametitle{PyCUDA Exercises}
\begin{itemize}
\item Run the example
\item Modify it to multiply two matrices (rename it to MulMatrix)
\item Modify it to multiply two inputs with an arbitrary number of dimensions
\end{itemize}
\end{frame}
\subsection{Theano+PyCUDA}
\begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example}
......@@ -1305,9 +1377,12 @@ print numpy.asarray(f(xv))
\begin{frame}
\frametitle{Theano + PyCUDA Exercises}
\begin{itemize}
\item Elemwise add: $x + y$
\item Elemwise with 2 outputs: $x + y$ and $x - y$
\item Elemwise with stride
\item Modify the example to multiply two matrices: $x * y$
\item Modify the example to return 2 outputs: $x + y$ and $x - y$
\begin{itemize}
\item Our current elemwise fusion generates computations with only 1 output
\end{itemize}
\item Modify the example to support strides (don't force the input to be C contiguous)
\end{itemize}
\end{frame}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论