提交 401c2a4c authored 作者: Frederic Bastien's avatar Frederic Bastien

Started to reorder the slide.

上级 50740a0c
......@@ -101,10 +101,59 @@ HPCS 2011, Montr\'eal
}
\section{Overview}
\subsection{Motivation}
\frame{
\frametitle{Theano Goal}
\begin{itemize}
\item Tries to be the {\bf holy grail} in computing: {\it easy to code} and {\it fast to execute}!
\item Only for mathematical expressions
\item So you won't have:
\begin{itemize}
\item Function call inside a theano function
\item Structure, enum
\item Dynamic types (Theano is fully typed)
\item Goto
\item ...
\item And it won't make coffee!
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Faster on CPU and GPU}
\includegraphics[width=3.in]{pics/mlp.pdf}
}
\frame{
\frametitle{Project Status}
Why you can rely on Theano:
\begin{itemize}
\item Theano has been developed and used since January 2008 (3.5 yrs old)
\item Core technology for a funded Silicon-Valley startup
\item Driven over 40 research papers in the last few years
\item Good user documentation
\item Active mailing list with participants from outside our lab
\item Many contributors (some from outside our lab)
\vfill
\item Used to teach IFT6266 for two years
\item Used by everyone in our lab (\textasciitilde 30 people)
\item Deep Learning Tutorials
\item Unofficial RPMs for Mandriva
\item Downloads (June 8 2011, since last January):
\begin{itemize}
\item Pypi 780
\item MLOSS: 483
\item Assembla (``bleeding edge'' repository): unknown
\end{itemize}
\end{itemize}
}
\subsection{Overview}
\frame{
\frametitle{Overview 1}
\begin{itemize}
\item {\bf Exercises as we go}
\item Introduction
\begin{itemize}
\item Why Scripting for GPUs?
......@@ -133,16 +182,15 @@ HPCS 2011, Montr\'eal
\item Profiling
\item Printing
\item Debugging
\item Scan (For-Loop generalization)
\item break?
\item GPU
\item Exercises/break
\item Scan (For-Loop generalization)
\end{itemize} %& \includegraphics[width=1.in]{pics/theano_logo.png}
\item PyCUDA
\begin{itemize}
\item Introduction
\item Example
\item PyCUDA + Theano
\item Exercises
\end{itemize} %& \includegraphics[width=.6in]{pics/pycuda-logo-crop.pdf}
\item GpuNdArray
\item Conclusion
......@@ -382,30 +430,6 @@ HPCS 2011, Montr\'eal
\end{itemize}
}
\frame{
\frametitle{Project Status}
Why you can rely on Theano:
\begin{itemize}
\item Theano has been developed and used since January 2008 (3.5 yrs old)
\item Core technology for a funded Silicon-Valley startup
\item Used to teach IFT6266 for two years
\item Used by everyone in our lab (\textasciitilde 30 people)
\item Driven over 40 research papers over the last few years
\item Active mailing list with participants from outside our lab
\item Good user documentation
\item Some(lots?) of users beyond our lab.
\item Many contributors (some from outside our lab)
\item Deep Learning Tutorials
\item Unofficial RPMs for Mandriva
\item Downloads (June 8 2011, since last January):
\begin{itemize}
\item Pypi 780
\item MLOSS: 483
\item Assembla (``bleeding edge'' repository): unknown
\end{itemize}
\end{itemize}
}
\newcommand\codeHighlight[1]{\textcolor[rgb]{1,0,0}{\textbf{#1}}}
\subsection{Simple Example}
......@@ -433,6 +457,38 @@ print f([0,1,2]) {\color{gray} # prints `array([0,2,1026])`}
\end{itemize}
}
\frame{
\frametitle{GPU for Exercises}
\begin{itemize}
\item Intel Core i7 980 XE (107Gf/s float64, 1050\$, 6 cores/12 threads)
\item NVIDIA C2050 (515 Gf/s float64, 1Tf/s float32, 2400\$, 480 cores), compute capability 2.0
\item NVIDIA GTX580 (1.5Tf/s float32, 500\$, 512 cores), compute capability 2.0
\end{itemize}
Computer in the class
\begin{itemize}
\item Intel Xeon X3450 (?56? flops/s, 383\$, 4 cores)
\item NVIDIA Quadro FX 580 (71GF/s single, 140\$, 32 cores), compute capability 1.1, ``professional card''
\end{itemize}
%Device 0: "Quadro FX 580"
% Total amount of global memory: 536150016 bytes
% Multiprocessors x Cores/MP = Cores: 4 (MP) x 8 (Cores/MP) = 32 (Cores)
% Clock rate: 1.12 GHz
% Run time limit on kernels: Yes
% Compute mode: Default (multiple host
%threads can use this device simultaneously)
}
\begin{frame}[fragile]
\frametitle{Exercises 1}
\begin{Verbatim}
source /groups/h/hpc2011/bin/GPU.csh
hg clone http://hg.assembla.com/theano Theano
cd Theano/doc/hpcs2011_tutorial
python simple_example.py
\end{Verbatim}
\end{frame}
\subsection{Real Example}
\frame{
\frametitle{A Real Example: Logistic Regression}
......@@ -576,6 +632,16 @@ train = theano.function(
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Exercises 2}
\begin{Verbatim}
python logreg_example.py
\end{Verbatim}
\vfill
Now modify the code to run with floatX=float32
\end{frame}
\subsection{Symbolic Variables}
\frame{
\frametitle{Creating symbolic variables}
\begin{itemize}
......@@ -614,6 +680,41 @@ train = theano.function(
\end{itemize}
}
\subsection{GPU}
\frame{
\frametitle{GPU}
\begin{itemize}
\item Only 32 bit floats are supported (being worked on)
\item Only 1 GPU per process
\item Use the Theano flag \texttt{device=gpu} to tell Theano to use the GPU device
\begin{itemize}
\item Use \texttt{device=gpu{0, 1, ...}} to specify which GPU if you have more than one
\item Shared variables with float32 dtype are by default moved to the GPU memory space
\end{itemize}
\item Use the Theano flag \texttt{floatX=float32}
\begin{itemize}
\item Be sure to use \texttt{floatX} (\texttt{theano.config.floatX}) in your code
\item Cast inputs before putting them into a shared variable
\item Cast ``problem'': int32 with float32 $\to$ float64
\begin{itemize}
\item A new casting mechanism is being developed
\item Insert manual cast in your code or use [u]int{8,16}
\item Insert manual cast around the mean operator (which involves a division by the length, which is an int64!)
\end{itemize}
\end{itemize}
\end{itemize}
}
\begin{frame}
\frametitle{Exercises 3}
\begin{itemize}
\item Now modify the code to run with floatX=float32 on the GPU
\item Run the code on the GPU
\item Time with: \texttt{time python file.py}
\end{itemize}
\end{frame}
\subsection{Benchmarks}
\frame{
\frametitle{Benchmarks}
......@@ -661,7 +762,15 @@ Convolutional Network: 256x256 images convolved with 6 7x7 filters, downsampled
}
\section{Advanced Theano}
\subsection{Miscellaneous}
\subsection{Pipeline}
\frame{
\frametitle{Compilation Pipeline}
\begin{center}
\includegraphics[width=2.7in]{pics/pipeline.pdf}
\end{center}
}
\subsection{Theano Flags}
\frame{
\frametitle{Theano Flags}
Theano can be configured with flags. They can be defined in two ways
......@@ -671,14 +780,6 @@ Theano can be configured with flags. They can be defined in two ways
\end{itemize}
}
\subsection{Pipeline}
\frame{
\frametitle{Compilation Pipeline}
\begin{center}
\includegraphics[width=2.7in]{pics/pipeline.pdf}
\end{center}
}
\subsection{Profiling}
\begin{frame}[fragile]
\frametitle{Profile Mode}
......@@ -811,6 +912,16 @@ Test them first, as they are not guaranteed to always provide a speedup.
\end{Verbatim}
\end{frame}
\begin{frame}
\frametitle{Exercises 4}
\begin{itemize}
\item In the last exercises, do you see a speed up with the GPU?
\item Where does it come from? (Use ProfileMode)
\end{itemize}
\end{frame}
\subsection{Printing}
\begin{frame}[fragile]
\frametitle{Text Printing of Your Theano Graph: Pretty Printing}
......@@ -1015,64 +1126,9 @@ print calculate_polynomial(test_coeff, 3)
\end{Verbatim}
\end{frame}
\subsection{GPU}
\frame{
\frametitle{GPU}
\begin{itemize}
\item Only 32 bit floats are supported (being worked on)
\item Only 1 GPU per process
\item Use the Theano flag \texttt{device=gpu} to tell to use the GPU device
\begin{itemize}
\item Use \texttt{device=gpu{0, 1, ...}} to specify which GPU if you have more than one
\item Shared variables with float32 dtype are by default moved to the GPU memory space
\end{itemize}
\item Use the Theano flag \texttt{floatX=float32}
\begin{itemize}
\item Be sure to use \texttt{floatX} (\texttt{theano.config.floatX}) in your code
\item Cast inputs before putting them into a shared variable
\item Cast "problem": int32 with float32 $\to$ float64
\begin{itemize}
\item A new casting mechanism is being developed
\item Insert manual cast in your code or use [u]int{8,16}
\item Insert manual cast around the mean operator (which involves a division by the length, which is an int64!)
\end{itemize}
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{GPU for Exercises}
\begin{itemize}
\item Intel Core i7 980 XE(107Gf/s float64) (1050\$)
\item NVIDIA C2050(515 Gf/s float64, 1Tf/s float32), compute capability 2.0 (2400\$) 6 cores/12 threads
\item NVIDIA GTX580(1.5Tf/s float32), compute capability 2.0 (500\$) 512 cores
\end{itemize}
Computer in the class
\begin{itemize}
\item Intel Xeon X3450(?TODO) (383\$)
\item NVIDIA Quadro FX 580(71GF/s single), compute capability 1.1 (140\$ But 'profesionnal card'), 32 cores
\end{itemize}
%Device 0: "Quadro FX 580"
% Total amount of global memory: 536150016 bytes
% Multiprocessors x Cores/MP = Cores: 4 (MP) x 8 (Cores/MP) = 32 (Cores)
% Clock rate: 1.12 GHz
% Run time limit on kernels: Yes
% Compute mode: Default (multiple host
%threads can use this device simultaneously)
}
\frame{
\frametitle{Theano Exercises}
TODO
source /groups/h/hpc2011/bin/GPU.csh
hg clone http://hg.assembla.com/theano Theano
\frametitle{Exercises 5}
\begin{itemize}
\item Run the simple example
\item Run the real example
\item Modify your version to run in float32 with \texttt{floatX}.
\item Run your version on the CPU and GPU
\item Do you see a speed up with the GPU? Where does it come from? (Try to profile it)
\item Scan: modify the polynomial example to have the reduction done by scan
\end{itemize}
}
......@@ -1146,6 +1202,13 @@ multiply_them(
\end{Verbatim}
\end{frame}
\begin{frame}
\frametitle{PyCUDA Exercises}
\begin{itemize}
\item Run the example
\item Modify it to work for a 200 $\times$ 200 matrix
\end{itemize}
\end{frame}
\frame{
\frametitle{GpuArray}
......@@ -1231,6 +1294,15 @@ print out
\end{Verbatim}
\end{frame}
\begin{frame}
\frametitle{PyCUDA Exercises}
\begin{itemize}
\item Run the example
\item Modify it to multiply two matrices (rename it to MulMatrix)
\item Modify it to multiply two inputs with an arbitrary number of dimensions
\end{itemize}
\end{frame}
\subsection{Theano+PyCUDA}
\begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example}
......@@ -1305,9 +1377,12 @@ print numpy.asarray(f(xv))
\begin{frame}
\frametitle{Theano + PyCUDA Exercises}
\begin{itemize}
\item Elemwise add: $x + y$
\item Elemwise with 2 outputs: $x + y$ and $x - y$
\item Elemwise with stride
\item Modify the example to multiply two matrices: $x * y$
\item Modify the example to return 2 outputs: $x + y$ and $x - y$
\begin{itemize}
\item Our current elemwise fusion generates computations with only 1 output
\end{itemize}
\item Modify the example to support strides (don't force the input to be C contiguous)
\end{itemize}
\end{frame}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论