Commit 4707121a authored by Frederic Bastien

Small refactoring to move the GPU example earlier.

Parent 97cfb787
@@ -172,12 +172,14 @@ HPCS 2011, Montr\'eal
% More info on T.grad
% Where are the optimizations in the example?
% Exercises 2
\item Theano Flags
\item GPU
% Exercises 3
\item Symbolic Variables
\item Differentiation Details
\item Benchmarks % MLP, Convolution, Elemwise
\end{itemize}
\item break?
\end{itemize}
}
@@ -190,12 +192,10 @@ HPCS 2011, Montr\'eal
\begin{itemize}
\item Compilation Pipeline
\item Inplace Optimization
\item Profiling
%exercises 4
\item Drawing/Printing Theano Graph
\item Debugging
\item Scan (For-Loop generalization)
\item Known Limitations
\end{itemize} %& \includegraphics[width=1.in]{pics/theano_logo.png}
@@ -488,29 +488,6 @@ print f([0,1,2]) {\color{gray} # prints `array([0,2,1026])`}
\end{itemize}
}
\begin{frame}[fragile]
\frametitle{Exercises 1}
\begin{Verbatim}
...
@@ -580,21 +557,6 @@ cost = xent.mean() + 0.01*(w**2).sum() {\color{gray}# The cost to minimize}
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{A Real Example: Logistic Regression}
\begin{Verbatim}[commandchars=\\\{\}]
...
@@ -679,6 +641,16 @@ train = theano.function(
\end{itemize}
\end{frame}
\subsection{Theano Flags}
\frame{
\frametitle{Theano Flags}
Theano can be configured with flags, which can be set in two ways:
\begin{itemize}
\item With an environment variable: \texttt{THEANO\_FLAGS="mode=ProfileMode,ProfileMode.profile\_memory=True"}
\item With a configuration file that defaults to \textasciitilde/.theanorc
\end{itemize}
}
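The \texttt{THEANO\_FLAGS} setting shown above can equivalently live in the configuration file; a minimal \texttt{\textasciitilde/.theanorc} sketch with the same two options (dotted flag names become sections):

```ini
# ~/.theanorc -- same settings as the THEANO_FLAGS example above
[global]
mode = ProfileMode

[ProfileMode]
profile_memory = True
```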
\begin{frame}[fragile]
\frametitle{Exercises 2}
\begin{Verbatim}
Modify and execute the example to run on CPU with floatX=float32
* You will need to use: theano.config.floatX and ndarray.astype("str")
\end{Verbatim}
\end{frame}
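The \texttt{ndarray.astype} pattern the exercise asks for can be sketched with NumPy alone; \texttt{theano.config.floatX} normally supplies the dtype string, which is assumed here to be \texttt{"float32"}:

```python
# Casting input data to the configured float width before handing it to Theano.
import numpy as np

floatX = "float32"            # in Theano code: theano.config.floatX

data = np.arange(4)           # default integer dtype
data32 = data.astype(floatX)  # cast once, up front
print(data32.dtype)           # float32
```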
\subsection{GPU}
\frame{
\frametitle{GPU}
\begin{itemize}
\item Only 32 bit floats are supported (being worked on)
\item Only 1 GPU per process
\item Use the Theano flag \texttt{device=gpu} to tell Theano to use the GPU device
\begin{itemize}
\item Use \texttt{device=gpu0}, \texttt{gpu1}, \ldots{} to select a specific GPU if you have more than one
\item Shared variables with float32 dtype are by default moved to the GPU memory space
\end{itemize}
\item Use the Theano flag \texttt{floatX=float32}
\begin{itemize}
\item Be sure to use \texttt{floatX} (\texttt{theano.config.floatX}) in your code
\item Cast inputs before putting them into a shared variable
\item Cast ``problem'': int32 combined with float32 $\to$ float64
\begin{itemize}
\item A new casting mechanism is being developed
\item Insert manual cast in your code or use [u]int{8,16}
\item Insert manual cast around the mean operator (which involves a division by the length, which is an int64!)
\end{itemize}
\end{itemize}
\end{itemize}
}
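The silent upcast warned about above can be seen without a GPU: NumPy arrays follow the same promotion rule, so int32 combined with float32 yields float64, while a manual cast (or a small \texttt{[u]int} type) keeps the result in float32:

```python
# Demonstrating the int32-with-float32 upcast using NumPy's promotion rules.
import numpy as np

i32 = np.arange(3, dtype=np.int32)
f32 = np.ones(3, dtype=np.float32)

print((i32 * f32).dtype)                     # float64: the silent upcast
print((i32.astype(np.float32) * f32).dtype)  # float32: manual cast fixes it
print((i32.astype(np.uint8) * f32).dtype)    # float32: [u]int{8,16} stays small
```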
\frame{
\frametitle{GPU for Exercises}
\begin{itemize}
\item Intel Core i7 980 XE (107 GFLOP/s float64, 1050\$, 6 cores/12 threads)
\item NVIDIA C2050 (515 GFLOP/s float64, 1 TFLOP/s float32, 2400\$, 480 cores), compute capability 2.0
\item NVIDIA GTX 580 (1.5 TFLOP/s float32, 500\$, 512 cores), compute capability 2.0
\end{itemize}
Computers in the classroom:
\begin{itemize}
\item Intel Xeon X3450 (?56? flop/s, 383\$, 4 cores)
\item NVIDIA Quadro FX 580 (71 GFLOP/s float32, 140\$, 32 cores), compute capability 1.1, a ``professional'' card
% BLAS on the cpu took 48s, 4s on the GPU
\end{itemize}
%Device 0: "Quadro FX 580"
% Total amount of global memory: 536150016 bytes
% Multiprocessors x Cores/MP = Cores: 4 (MP) x 8 (Cores/MP) = 32 (Cores)
% Clock rate: 1.12 GHz
% Run time limit on kernels: Yes
% Compute mode: Default (multiple host
%threads can use this device simultaneously)
}
\begin{frame}
\frametitle{Exercises 3}
\begin{itemize}
\item Modify and execute the code to run with floatX=float32 on GPU
\item Run the code on the GPU
\item Time with: \texttt{time python file.py}
\end{itemize}
\end{frame}
\subsection{Symbolic Variables}
\frame{
\frametitle{Creating symbolic variables}
...
@@ -728,38 +758,18 @@ Modify and execute the example to run on CPU with floatX=float32
\end{itemize}
}
\subsection{Differentiation Details}
\begin{frame}[fragile]
\frametitle{Differentiation Details}
\begin{Verbatim}[commandchars=\\\{\}]
{\color{gray}gw,gb = T.grad(cost, [w,b])}
\end{Verbatim}
\begin{itemize}
\item T.grad works symbolically: it takes and returns a Theano variable
\item T.grad can be compared to a macro: it can be applied multiple times
\item T.grad takes scalar costs only
\item A simple recipe allows computing vector $\times$ Jacobian and vector $\times$ Hessian efficiently
\item We are working on the missing optimizations to compute the full Jacobian, the Hessian, and Jacobian $\times$ vector efficiently
\end{itemize}
\end{frame}
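The ``simple recipe'' mentioned in the bullets is the identity $v^{\top} J = \nabla_x\, (v \cdot f(x))$: a vector-Jacobian product is just the gradient of a scalar cost, so no full Jacobian is ever built. A NumPy finite-difference check on a toy linear map (whose Jacobian is exactly the matrix $A$) illustrates the identity; this is not Theano's \texttt{T.grad} machinery, only the mathematical fact it exploits:

```python
# v^T J computed as the gradient of the scalar v . f(x), checked against v^T A
# for the toy map f(x) = A x, whose Jacobian is exactly A.
import numpy as np

rng = np.random.default_rng(1)
A = rng.standard_normal((3, 4))
x = rng.standard_normal(4)
v = rng.standard_normal(3)

def f(x):
    return A @ x               # toy map; Jacobian J = A

def scalar_cost(x):
    return v @ f(x)            # the scalar whose gradient equals v^T J

eps = 1e-6                     # central differences on each coordinate
grad = np.array([(scalar_cost(x + eps * e) - scalar_cost(x - eps * e)) / (2 * eps)
                 for e in np.eye(4)])

print(np.allclose(grad, v @ A))  # True: matches v^T A without forming J
```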
@@ -832,16 +842,6 @@ Convolutional Network: 256x256 images convolved with 6 7x7 filters, downsampled
\end{itemize}
}
\subsection{Profiling}
\begin{frame}[fragile]
\frametitle{Profile Mode}
...