Commit 0d06d18f authored by Frederic Bastien

First committed version of the Theano Tutorial at HPCS 2011.

Parent: c75bcf17
# Build the slide deck: run pdflatex on the presentation source.
# NOTE: the recipe line must be indented with a TAB, otherwise make
# aborts with "missing separator" — the scraped source had lost it.
all:
	pdflatex presentation.tex
import numpy
import theano
class DoubleOp(theano.Op):
    """Toy Theano Op that returns its input multiplied elementwise by two."""

    def __eq__(self, other):
        # This Op carries no parameters, so two instances are
        # interchangeable whenever they share the same type.
        return type(self) == type(other)

    def __hash__(self):
        # Must stay consistent with __eq__ above.
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x):
        # Accept anything convertible to a tensor variable; the single
        # output has the same type as the (converted) input.
        x = theano.tensor.as_tensor_variable(x)
        return theano.Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage):
        # Pure-Python implementation: store the doubled value in the
        # one output storage cell.
        (x,) = inputs
        out = output_storage[0]
        out[0] = x * 2
x = theano.tensor.matrix()
f = theano.function([x], DoubleOp()(x))
inp = numpy.random.rand(5,5)
out = f(inp)
assert numpy.allclose(inp*2, out)
print inp
print out
import numpy
import theano
import theano.tensor as T
rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats).astype(theano.config.floatX), rng.randint(size=N,low=0, high=2).astype(theano.config.floatX))
training_steps = 10
# Declare Theano symbolic variables
x = T.matrix("x")
y = T.vector("y")
w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w")
b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
print "Initial model:"
print w.get_value(), b.get_value()
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b)) # Probabily of having a one
prediction = p_1 > 0.5 # The prediction that is done: 0 or 1
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1) # Cross-entropy
cost = xent.mean() + 0.01*(w**2).sum() # The cost to optimize
gw,gb = T.grad(cost, [w,b])
# Compile expressions to functions
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
updates={w:w-0.1*gw, b:b-0.1*gb},
name = "train")
predict = theano.function(inputs=[x], outputs=prediction,
name = "predict")
for i in range(training_steps):
pred, err = train(D[0], D[1])
print "Final model:"
print w.get_value(), b.get_value()
print "target values for D"
print D[1]
print "prediction on D"
print predict(D[0])
# Print the graph used in the slides
theano.printing.pydotprint(predict,
outfile="pics/logreg_pydotprint_predic.png",
var_with_name_simple=True)
theano.printing.pydotprint_variables(prediction,
outfile="pics/logreg_pydotprint_prediction.png",
var_with_name_simple=True)
theano.printing.pydotprint(train,
outfile="pics/logreg_pydotprint_train.png",
var_with_name_simple=True)
\documentclass[a4paper,9pt]{beamer}
\usetheme{Malmoe} % Now it's a beamer presentation with the lisa theme!
\setbeamertemplate{footline}[page number]
\usecolortheme{beaver}
\usepackage{url}
\usepackage{ragged2e}
\usepackage{multirow}
\usepackage{fancyvrb}
%\usepackage{color}
\def\imagetop#1{\vtop{\null\hbox{#1}}}
\logo{\includegraphics[width=.8in]{pics/UdeM_NoirBleu_logo_Marie_crop.pdf}}
% Standard LaTeX stuff - note the optional abbreviated title being provided
\title[GPU Programming made Easy]{GPU Programming made Easy}
\author[LISA lab]{Fr\'ed\'eric Bastien, Machine Learning Laboratory}
\date{
James Bergstra, Olivier Breuleux, Frederic Bastien, Pascal Lamblin, Razvan Pascanu, Guillaume Desjardins, Joseph Turian, David Warde-Farley, Olivier Delalleau, Arnaud Bergeron, Josh Bleecher Snyder, Ian Goodfellow, Fran\c{c}ois Savard, Xavier Glorot, Douglas Eck, Dumitru Erhan, Michael Mandel, Philippe Hamel, Simon Lemieux, Thierry Bertin-Mahieux, Yoshua Bengio
Presented on June 13\textsuperscript{th} 2011\\
HPCS 2011, Montr\'eal
}
\begin{document}
\frame{\titlepage}
\section{Overview}
\subsection{Overview}
\frame{
\frametitle{Overview 1}
\begin{itemize}
\item Introduction
\begin{itemize}
\item Why Scripting for GPUs?
\item Theano vs PyCUDA
\item Python in 1 slide
\item Numpy in 1 slide
\end{itemize}
\item Theano
\begin{itemize}
\item Intro
\item Simple example
\item Real example
\item Benchmarks
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Overview 2}
\begin{itemize}
\item Advanced Theano
\begin{itemize}
\item Compilation Pipeline
\item Profiling
\item Printing
\item Debugging
\item Scan(for-Loop generalization)
\item GPU
\item Exercises/break
\end{itemize}
\item PyCUDA
\begin{itemize}
\item Intro
\item Example
\item PyCUDA + Theano
\item Exercises
\end{itemize}
\item GpuNdArray
\item Conclusion
\end{itemize}
\begin{tabular}{lcr}
\imagetop{\includegraphics[width=.4in]{pics/lisabook_logo_text2.png}} &
%% \imagetop{\includegraphics[width=.2in]{pics/white.png}}&
\imagetop{\includegraphics[width=1.in]{pics/theano_logo.png}}&
\imagetop{\includegraphics[width=.6in]{pics/pycuda-logo-crop.pdf}}
\end{tabular}
}
\frame{
\frametitle{Won't cover}
\begin{itemize}
\item How to write GPU code
\item How to optimize GPU code
\end{itemize}
}
\section{Introduction}
\subsection{Introduction}
\frame{
\frametitle{Why GPU}
\begin{itemize}
\item Faster, cheaper, more efficient power usage
\item How much faster? I saw numbers from 100x slower to 1000x faster.
\begin{itemize}
\item It depends on the algorithms
\item How the benchmark is done
\begin{itemize}
\item Quality of implementation
\item How much time spent optimizing CPU vs GPU code
\end{itemize}
\item Theory:
\begin{itemize}
\item Intel Core i7 980 XE(107Gf/s float64) 6 cores
\item NVIDIA C2050(515 Gf/s float64, 1Tf/s float32) 480 cores
\item NVIDIA GTX580(1.5Tf/s float32) 512 cores
\end{itemize}
\end{itemize}
\item With Theano, up to 100x can be seen as we don't do multiple core on cpu (except for call to gemm)
\item If you see 1000x, it means the benchmark is not fair
\end{itemize}
}
\frame{
\frametitle{Why Scripting for GPUs}
They {\bf Complement each other}
\begin{itemize}
\item GPUs are everything that scripting/high level languages are not
\begin{itemize}
\item Highly parallel
\item Very architecture-sensitive
\item Built for maximum FP/memory throughput
\end{itemize}
\item CPU: largely restricted to control
\begin{itemize}
\item Optimized for sequential code
\item tasks (1000/sec)
\item Scripting fast enough
\item Theano = Mathematical expression compiler
\item Python + CUDA = PyCUDA
\item Python + OpenCL = PyOpenCL
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Theano vs PyCUDA vs PyOpenCL vs CUDA}
\begin{itemize}
\item Theano
\begin{itemize}
\item Mathematical expression compiler
\item Generates custom C and CUDA code
\item Uses python code when performance is not critical
\end{itemize}
\item CUDA
\begin{itemize}
\item C extension by NVIDIA that allows one to write code for and use the GPU
\end{itemize}
\item PyCUDA
\begin{itemize}
\item Python interface to CUDA
\item Memory management of GPU objects
\item Compilation of code for the low-level driver
\end{itemize}
\item PyOpenCL
\begin{itemize}
\item PyCUDA for OpenCL
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{What is your background ?}
Do you have experience with:
\begin{itemize}
\item Python
\item Numpy / Scipy / Matlab
\item Maple / Mathematica / SymPy
\item GPU programming / CUDA / OpenCL
\item Cython / Weave / Numexpr
\item C / Java / Fortran
\end{itemize}
}
\frame{
\frametitle{Python in 1 Slide}
\begin{itemize}
\item Interpreted language
\item General-purpose high-level programming language
\item OO and scripting language
\item Emphasizes code readability
\item Large and comprehensive standard library
\item Indentation for block delimiters
\item Dynamic type and memory management
\item Dictionary \texttt{d=\{'var1':'value1', 'var2':42, ...\}}
%\item List comprehension: [i+3 for i in range(10)] not used in the tutorial
\end{itemize}
}
\frame{
\frametitle{Numpy in 1 Slide}
\begin{itemize}
\item Base scientific computing package on the CPU
\item A powerful N-dimensional array object
\begin{itemize}
\item ndarray.\{ndim, shape, size, dtype, itemsize, stride\}
\end{itemize}
\item Sophisticated (broadcasting) functions
\begin{itemize}
\item numpy.random.rand(4,5) * numpy.random.rand(1,5) = mat(4,5)
\item numpy.random.rand(4,5) * numpy.random.rand(4,1) = mat(4,5)
\item numpy.random.rand(4,5) * numpy.random.rand(5) = mat(4,5)
\end{itemize}
\item Tools for integrating C/C++ and Fortran code
\item Linear algebra, Fourier transform and random number capable
\end{itemize}
}
%\frame{
% \frametitle{Competitors TODO: Remove? Missing many I think!}
% There are some competitors for easy computing on gpu.
% \begin{itemize}
% \item Jacket(GPU for matlab): http://www.accelereyes.com/
% \item GPUmat(GPU for matlab, free): http://gp-you.org/
% \item numexpr, algopy
% \end{itemize}
%}
\section{Theano}
\subsection{Introduction}
\frame{
%% \frametitle{Theano}
\begin{center}
\includegraphics[width=3.in]{../images/theano_logo_allblue_350x95.png}
% \includegraphics[width=3.in]{../images/theano_logo_allblue_200x54.png}
\end{center}
}
\frame{
\frametitle{Pointers}
\begin{itemize}
\item Website: http://deeplearning.net/software/theano/
\item Announcements mailing list: http://groups.google.com/group/theano-announce
\item User mailing list: http://groups.google.com/group/theano-users
\item Deep Learning Tutorials: http://www.deeplearning.net/tutorial/
\end{itemize}
}
\frame{
\frametitle{Description}
\begin{itemize}
\item Mathematical expression compiler
\item Statically typed and purely functional
\item Dynamic C/CUDA code generation
\item Efficient symbolic differentiation
\begin{itemize}
\item Theano computes derivatives of functions with one or many inputs.
\end{itemize}
\item Speed and stability optimizations
\begin{itemize}
\item Gives the right answer for log(1+x) even if x is really tiny.
\end{itemize}
\item Extensive unit-testing and self-verification
\begin{itemize}
\item Detects and diagnoses many types of errors
\end{itemize}
\item Expressions mimic NumPy's syntax \& semantics
\item Works on linux, Mac and Windows
\end{itemize}
}
\frame{
\frametitle{Description 2}
\begin{itemize}
\item Transparent use of a GPU
\begin{itemize}
\item float32 only for now (working on other data types)
\item Doesn't work on Windows for now
\item On GPU, data-intensive calculations are typically between 6.5x and 44x faster. We've seen speed-ups of up to 140x
\end{itemize}
\item On CPU, common machine learning-algorithms are 1.6x to 7.5x faster than competitive alternatives
\begin{itemize}
\item including those in C/C++, NumPy, SciPy, and Matlab
\end{itemize}
\item The project was started by James Bergstra and Olivier Breuleux
\item For the past 1-2 years, I have been taking over as lead contributor
\item Some Sparse operation (cpu only)
\end{itemize}
}
\frame{
\frametitle{Why Theano is better}
Executing the code is faster because:
\begin{itemize}
\item Rearranges high-level expressions
\item Produces customized low-level code
\item Can use a variety of backend technologies(GPU,...)
\end{itemize}
\vfill
Writing the code is faster because:
\begin{itemize}
\item High-level language allows to concentrate on the algorithm
\item Automatic optimization
\begin{itemize}
\item No need to manually optimize for each algo you want to test
\end{itemize}
\item Automatic efficient symbolic differentiation
\begin{itemize}
\item No need to manually differentiate your functions
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Project Status}
Why you can rely on Theano:
\begin{itemize}
\item Theano has been developed and used since January 2008 (3.5 yrs old)
\item Core technology for a funded Silicon-Valley startup
\item Used to teach IFT6266 for two years
\item Used by everyone in our lab (\textasciitilde 30 people)
\item Driven over 40 research papers over the last few years
\item Active mailing list with participants from outside our lab
\item Good user documentation
\item Many contributors (some from outside our lab)
\item Some(lots?) of users beyond our lab.
\item Deep Learning Tutorials
\item Unofficial RPMs for Mandriva
\item Downloads (June 8 2011, since last January):
\begin{itemize}
\item Pypi 780
\item MLOSS: 483
\item Assembla(main repo): unknown
\end{itemize}
\end{itemize}
}
\newcommand\codeHighlight[1]{\textcolor[rgb]{1,0,0}{\textbf{#1}}}
\subsection{Simple Example}
\begin{frame}[fragile]
\frametitle{Simple Example}
\begin{Verbatim}[commandchars=\\\{\}]
import theano
a = theano.tensor.vector("a") {\color{gray} # declare symbolic variable}
b = a + a**10 {\color{gray} # build symbolic expression}
f = theano.function([a], b) {\color{gray} # compile function}
print f([0,1,2]) {\color{gray} # prints `array([0,2,1026])`}
\end{Verbatim}
\includegraphics[width=1.2in]{pics/f_unoptimized.png}
\end{frame}
\frame{
\frametitle{Simple Example: Optimized graph}
{\bf no pow, fused elemwise op!}
\includegraphics[width=2.3in]{pics/f_optimized.png}
Symbolic programming
\begin{itemize}
\item Paradigm change: people need to use it to understand it
\end{itemize}
}
\frame{
\frametitle{A Real Example: Logistic Regression}
\begin{itemize}
\item GPU-ready
\item Symbolic differentiation
\item Speed optimizations
\item Stability optimizations
\end{itemize}
}
\begin{frame}[fragile]
\frametitle{A Real Example: Logistic Regression}
\begin{Verbatim}[commandchars=\\\{\}]
import numpy
import theano
import theano.tensor as T
rng = numpy.random
# Declare Theano symbolic variables
x = T.matrix("x")
y = T.vector("y")
\codeHighlight{w = theano.shared(rng.randn(100), name="w")}
\codeHighlight{b = theano.shared(0., name="b")}
print "Initial model:"
print w.get_value(), b.get_value()
\end{Verbatim}
\end{frame}
\subsection{Real Example}
\begin{frame}[fragile]
\frametitle{A Real Example: Logistic Regression}
\begin{Verbatim}[commandchars=\\\{\}]
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b)) {\color{gray}# Probability of having a one}
prediction = p_1 > 0.5 {\color{gray}# The prediction: 0 or 1}
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1) {\color{gray}# Cross-entropy}
cost = xent.mean() + 0.01*(w**2).sum() {\color{gray}# The cost to optimize}
\codeHighlight{gw,gb = T.grad(cost, [w,b])}
\end{Verbatim}
\begin{itemize}
\item T.grad works symbolically: takes and returns a Theano variable
\item T.grad can be compared to a macro. So it can be applied multiple times
\item T.grad takes scalar costs only
\item Simple recipe allows to compute efficiently vector*Jacobian and vector*Hessian
\item We are working on the missing optimizations to be able to compute efficiently the full Jacobian and Hessian.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{A Real Example: Logistic Regression}
\begin{Verbatim}[commandchars=\\\{\}]
# Compile expressions to functions
train = theano.function(
inputs=[x,y],
\codeHighlight{outputs=[prediction, xent]},
\codeHighlight{updates=\{w:w-0.1*gw, b:b-0.1*gb\}})
predict = theano.function(inputs=[x], outputs=prediction)
N = 4
feats = 100
D = (rng.randn(N, feats), rng.randint(size=4,low=0, high=2))
training_steps = 10
for i in range(training_steps):
pred, err = train(D[0], D[1])
print "Final model:"
print w.get_value(), b.get_value()
print "target values for D:", D[1]
print "prediction on D:", predict(D[0])
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{A Real Example: optimization}
\begin{Verbatim}[commandchars=\\\{\}]
{\color{gray}# Construct Theano expression graph}
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b))
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1)
prediction = p_1 > 0.5
cost = xent.mean() + 0.01*(w**2).sum()
gw,gb = T.grad(cost, [w,b])
{\color{gray}# Compile expressions to functions}
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
updates=\{w:w-0.1*gw, b:b-0.1*gb\}) {\color{gray}# This is a dictionary}
\end{Verbatim}
Where are those optimization applied?
\begin{itemize}
\item Log(1+exp(x))
\item 1 / (1 + T.exp(var)) (sigmoid)
\item Log(1-sigmoid(var)) (softplus, stabilisation)
\item GEMV
\item Loop fusion
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{A Real Example: optimization!}
\begin{Verbatim}[commandchars=\\\{\}]
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b))
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1)
\codeHighlight{# 1 / (1 + T.exp(var)) -> sigmoid(var)}
\codeHighlight{# Log(1-sigmoid(var)) -> -sigmoid(var)}
prediction = p_1 > 0.5
cost = xent.mean() + 0.01*(w**2).sum()
gw,gb = T.grad(cost, [w,b])
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
\codeHighlight{# w-0.1*gw: GEMV with the dot in th grad}
updates=\{w:w-0.1*gw, b:b-0.1*gb\})
\end{Verbatim}
\begin{itemize}
\item Loop fusion in many places
\end{itemize}
\end{frame}
\frame{
\frametitle{Nb Dimensions, dtype and Broadcast}
\begin{itemize}
\item T.scalar, T.vector, T.matrix, T.row, T.col
\item T.row(floatX), T.[fdczbwil]row (float32, float64, complex64, complex128, int8, int16, int32, int64)
\end{itemize}
\begin{itemize}
\item All are shortcuts to: T.tensor(dtype, broadcastable=([False,True])*nd)
\item Other dtype: uint[8,16,32,64]
\item floatX: configurable dtype that can be float32 or float64.
\end{itemize}
}
\subsection{Benchmarks}
\frame{
\frametitle{Benchmarks}
Example:
\begin{itemize}
\item Multi-layer perceptron
\item Convolutional Neural Networks
\item Misc Elemwise operations
\end{itemize}
Competitors: Numpy+Scipy, MATLAB, EBLearn, Torch5, numexpr
}
\frame{
\frametitle{Benchmark MLP}
Multi-Layer Perceptron: 60x784 matrix times 784x500 matrix, tanh, times 500x10 matrix, elemwise, then all in reverse for backpropagation
\begin{center}
\includegraphics[width=3.in]{pics/mlp.pdf}
\end{center}
}
\frame{
\frametitle{Benchmark Convolutional Network}
Convolutional Network: 256x256 images convolved with 6 7x7 filters, downsampled to 6x50x50, tanh, convolution with 16 6x7x7 filter, tanh, matrix multiply, elemwise, then in reverse
\begin{center}
\includegraphics[width=3.in]{pics/conv.pdf}
\end{center}
}
\frame{
\frametitle{Benchmark elemwise}
\begin{itemize}
\item All on CPU
\item Solid blue: Theano
\item Dashed Red: numexpr(without MKL)
\end{itemize}
\begin{center}
\includegraphics[width=3.in]{pics/multiple_graph.pdf}
\end{center}
}
\section{Advanced Theano}
\subsection{Misc}
\frame{
\frametitle{Theano Flags}
Theano can be configured with flags. They can be defined in two ways
\begin{itemize}
\item With the environment variable: THEANO\_FLAGS="mode=ProfileMode,ProfileMode.profile\_memory=True"
\item With a configuration file that defaults to \textasciitilde/.theanorc
\end{itemize}
}
\frame{
\frametitle{Theano Graph}
\begin{itemize}
\item Theano works with symbolic graphs
\item Those graphs are bi-partite graph (graph with 2 types of nodes)
\item Those 2 nodes types are Apply and Variable nodes
\end{itemize}
\begin{itemize}
\item Inputs and Outputs are list of Theano variables
\item Can navigate through the graph from any point to any point
\end{itemize}
\begin{center}
\includegraphics[width=3.5in]{pics/apply_node.pdf}
\end{center}
}
\subsection{Pipeline}
\frame{
\frametitle{Compilation Pipeline}
\begin{center}
\includegraphics[width=2.7in]{pics/pipeline.pdf}
\end{center}
}
\subsection{Profiling}
\begin{frame}[fragile]
\frametitle{Profile Mode}
To replace the default mode with this mode, use the theano flags ``mode=ProfileMode''.
To enable the memory profiling use the flags ProfileMode.profile\_memory=True
\begin{Verbatim}
Time since import 1.486s
Theano compile time: 1.017s (67.9% since import)
Optimization time: 0.805s
Linker time: 0.199s
Theano fct call 0.002s (0.1% since import)
Theano Op time 0.001s 0.0%(since import) 36.8%(of fct call)
Theano function overhead in ProfileMode 0.001s 0.1%(since import)
63.2%(of fct call)
11 Theano fct call, 0.000s per call
Rest of the time since import 0.479s 32.0%
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profile Mode: Function Summary}
Theano outputs:
\vfill
\begin{Verbatim}
Theano fct summary:
<% total fct time> <total time> <time per call> <nb call> <fct name>
97.1% 0.002s 1.64e-04s 10 train
2.9% 0.000s 4.91e-05s 1 predict
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profile Mode: Single Op-Wise Summary}
Theano outputs:
\vfill
\begin{Verbatim}
Single Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call> <nb_call>
<nb_op> <nb_apply> <Op name>
30.8% 30.8% 0.000s 0.000s 1.86e-05s 10 1 1 <'Gemv'>
23.8% 54.6% 0.000s 0.000s 1.58e-06s * 91 10 10 <'Elemwise'>
18.3% 72.9% 0.000s 0.000s 1.10e-05s 10 1 1 <'Alloc'>
15.9% 88.7% 0.000s 0.001s 8.71e-06s 11 1 2 <'Dot'>
7.7% 96.4% 0.000s 0.001s 1.49e-06s * 31 2 4 <'DimShuffle'>
2.0% 98.4% 0.000s 0.001s 1.22e-06s * 10 1 1 <'Sum'>
1.6% 100.0% 0.000s 0.001s 9.78e-07s * 10 1 1 <'Shape_i'>
(*) Op is running a c implementation
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profile Mode: Op-Wise Summary}
Theano outputs:
\vfill
\begin{Verbatim}
Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call>
<nb_call> <nb apply> <Op name>
31.4% 31.4% 0.000s 0.000s 1.93e-05s 10 1 Gemv{inplace}
16.9% 48.3% 0.000s 0.000s 1.04e-05s 10 1 Alloc
15.5% 63.8% 0.000s 0.000s 8.65e-06s 11 2 dot
5.0% 68.8% 0.000s 0.000s 3.05e-06s * 10 1 Elemwise{
Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}
4.3% 73.1% 0.000s 0.000s 1.27e-06s * 21 3 InplaceDimShuffle{x}
3.3% 76.4% 0.000s 0.000s 2.00e-06s * 10 1 Elemwise{sub,no_inplace}
2.9% 79.3% 0.000s 0.000s 1.79e-06s * 10 1 Elemwise{gt,no_inplace}
2.5% 84.5% 0.000s 0.001s 1.53e-06s * 10 1 InplaceDimShuffle{1,0}
... (remaining 9 Apply account for 18.3%(0.00s) of the runtime)
(*) Op is running a c implementation
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profile Mode: Apply-Wise Summary}
Theano outputs:
\vfill
\begin{Verbatim}
Apply-wise summary:
<% of local_time spent at this position> <cumulative %%>
<apply time> <cumulative seconds> <time per call>
<nb_call> <Apply position> <Apply Op name>
29.8% 29.8% 0.000s 0.000s 1.96e-05s 10 15 Gemv{inplace}
(<TensorType(float64, vector)>, {-0.1}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, {0.998})
15.8% 45.6% 0.000s 0.000s 1.04e-05s 10 10 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
14.0% 59.6% 0.000s 0.000s 9.20e-06s 10 1 dot(<TensorType(float64, matrix)>, <TensorType(float64, vector)>)
5.6% 65.2% 0.000s 0.000s 3.67e-06s 10 9 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}(<TensorType(float64, vector)>, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0)
3.2% 68.4% 0.000s 0.000s 2.12e-06s 10 4 Elemwise{sub,no_inplace}(TensorConstant{[ 1.]}, <TensorType(float64, vector)>)
2.9% 71.3% 0.000s 0.000s 1.93e-06s 10 12 Elemwise{gt,no_inplace}(Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 0)].0, TensorConstant{[ 0.5]})
... (remaining 14 Apply instances account for 28.6%(0.00s) of the runtime)
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profile Mode: Memory Profile}
Theano outputs:
\vfill
\begin{Verbatim}
Profile of Theano functions memory:
(This check only the output of each apply node. It don't check the
temporary memory used by the op in the apply node.)
Theano fct: train
Max without gc, inplace and view (KB) 4
Max FAST_RUN_NO_GC (KB) 0
Max FAST_RUN (KB) 0
Memory saved by view (KB) 3
Memory saved by inplace (KB) 0
Memory saved by GC (KB) 0
<Sum apply outputs (bytes)> <Apply outputs memory size(bytes)>
<created/inplace/view> <Apply node>
3200B [3200] v InplaceDimShuffle{1,0}(<TensorType(float64, matrix)>)
800B [800] i Gemv{inplace}(<TensorType(float64, vector)>, TensorConstant{-0.1}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, TensorConstant{0.998})
32B [32] c Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profile Mode: Tips}
Theano outputs:
\vfill
\begin{Verbatim}
Here are tips to potentially make your code run faster
(if you think of new ones, suggest them on the mailing list).
Test them first, as they are not guaranteed to always provide a speedup.
- Try the Theano flag floatX=float32
\end{Verbatim}
\end{frame}
\subsection{Printing}
\begin{frame}[fragile]
\frametitle{Text Printing of Graph: Pretty Printing}
theano.printing.pprint(variable)
\vfill
\begin{Verbatim}
>>> theano.printing.pprint(prediction)
gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))),
TensorConstant{0.5})
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Text Printing of Graph: Debug Print}
theano.printing.debugprint({fct, variable, list of variables})
\vfill
\small
\begin{Verbatim}
>>> theano.printing.debugprint(prediction)
Elemwise{gt,no_inplace} [@181772236] ''
|Elemwise{true_div,no_inplace} [@181746668] ''
| |InplaceDimShuffle{x} [@181746412] ''
| | |TensorConstant{1} [@181745836]
| |Elemwise{add,no_inplace} [@181745644] ''
| | |InplaceDimShuffle{x} [@181745420] ''
| | | |TensorConstant{1} [@181744844]
| | |Elemwise{exp,no_inplace} [@181744652] ''
| | | |Elemwise{sub,no_inplace} [@181744012] ''
| | | | |Elemwise{neg,no_inplace} [@181730764] ''
| | | | | |dot [@181729676] ''
| | | | | | |x [@181563948]
| | | | | | |w [@181729964]
| | | | |InplaceDimShuffle{x} [@181743788] ''
| | | | | |b [@181730156]
|InplaceDimShuffle{x} [@181771788] ''
| |TensorConstant{0.5} [@181771148]
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Text Printing of Graph: Debug Print}
theano.printing.debugprint({fct, variable, list of variables})
\vfill
\small
\begin{Verbatim}
>>> theano.printing.debugprint(predict)
Elemwise{Composite{neg,{sub,{{scalar_sigmoid,GT},neg}}}} [@183160204] '' 2
|dot [@183018796] '' 1
| |x [@183000780]
| |w [@183000812]
|InplaceDimShuffle{x} [@183133580] '' 0
| |b [@183000876]
|TensorConstant{[ 0.5]} [@183084108]
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Picture Printing of Graph}
\begin{Verbatim}
>>> theano.printing.pydotprint_variables(prediction)
\end{Verbatim}
\includegraphics[width=2.0in]{pics/logreg_pydotprint_prediction.png}
\end{frame}
\begin{frame}[fragile]
\frametitle{Picture Printing of Graph}
\begin{Verbatim}
>>> theano.printing.pydotprint(predict)
\end{Verbatim}
\includegraphics[width=4in]{pics/logreg_pydotprint_predic.png}
\end{frame}
\begin{frame}[fragile]
\frametitle{Picture Printing of Graph}
\begin{Verbatim}[commandchars=\\\{\}]
>>> theano.printing.pydotprint(train) {\color{gray}# This is a small train example!}
\end{Verbatim}
\hspace{-.8cm}
\includegraphics[width=5.0in]{pics/logreg_pydotprint_train.png}
\end{frame}
\subsection{Debugging}
\frame{
\frametitle{How to Debug}
\begin{itemize}
\item Run with the flag mode=DebugMode
\begin{itemize}
\item 100-1000x slower
\item Test all optimization steps from the original graph to the final graph
\item Checks many properties that Ops should/shouldn't do
\item Executes the Python and C code versions
\end{itemize}
\item Run with the flag mode=FAST\_COMPILE
\begin{itemize}
\item Few optimizations
\item Run Python code (better error messages and can go in the python debugger)
\end{itemize}
\item Run with the Theano flag compute\_test\_value = {``off'', ``ignore'', ``warn'', ``raise''}
\begin{itemize}
\item Run the code as we create the graph
\item Allow to find the bug earlier (ex: shape mismatch)
\item Make identification of the wrong line in the code easier
\item Use the value of constant and shared variable directly
\item For pure symbolic varible use: x.tag.test\_value = numpy.random.rand(5,10)
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Known Limitation}
\begin{itemize}
\item Compilation phase distinct from execution phase
\item Compilation time significant
\begin{itemize}
\item Amortize it with functions over big input or reuse functions
\end{itemize}
\item Execution overhead (We have something in a branch that lowers it)
\begin{itemize}
\item Needs a certain number of operations to be useful
\end{itemize}
\item Compilation time super linear to the size of the graph.
\begin{itemize}
\item A few hundreds node OK
\item You can disable some optimizations to make it faster with bigger graphs
\item When this happened to us, it always indicated a problem in the graph
\end{itemize}
\end{itemize}
}
\subsection{Loop}
\frame{
\frametitle{Scan}
\begin{itemize}
\item General form of {\bf recurrence}, which can be used for looping.
\item {\bf Reduction} and {\bf map}(loop over the leading dimensions) are special case of Scan
\item You *scan* a function along some input sequence, producing an
output at each time-step
\item The function can see the {\bf previous K time-steps} of your function
\item ``sum()`` could be computed by scanning the $z + x_i$ function
over a list, given an initial state of ``z=0``.
\item Often a for-loop can be expressed as a ``scan()`` operation, and
``scan`` is the closest that Theano comes to looping.
\item The advantage of using ``scan`` over for loops is that it allows
the number of iterations to be part of the symbolic graph.
\item calls: ``scan()``, ``map()``, ``reduce()``, ``foldl()``, ``foldr()``.
\end{itemize}
}
\begin{frame}[fragile]
\frametitle{Scan Example: Computing pow(A,k)}
\begin{Verbatim}
k = T.iscalar("k"); A = T.vector("A")
def inner_fct(prior_result, A): return prior_result * A
# Symbolic description of the result
result, updates = theano.scan(fn=inner_fct,
outputs_info=T.ones_like(A),
non_sequences=A, n_steps=k)
# Scan has provided us with A**1 through A**k. Keep only the last
# value. Scan notices this and does not waste memory saving them.
final_result = result[-1]
power = theano.function(inputs=[A,k], outputs=final_result,
updates=updates)
print power(range(10),2)
#[ 0. 1. 4. 9. 16. 25. 36. 49. 64. 81.]
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Scan Example: Calculating a Polynomial}
\begin{Verbatim}
coefficients = theano.tensor.vector("coefficients")
x = T.scalar("x"); max_coefficients_supported = 10000
# Generate the components of the polynomial
full_range=theano.tensor.arange(max_coefficients_supported)
components, updates = theano.scan(fn=lambda coeff, power, free_var:
coeff * (free_var ** power),
outputs_info=None,
sequences=[coefficients, full_range],
non_sequences=x)
polynomial = components.sum()
calculate_polynomial = theano.function(inputs=[coefficients, x],
outputs=polynomial)
test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
print calculate_polynomial(test_coeff, 3)
# 19.0
\end{Verbatim}
\end{frame}
\subsection{GPU}
\frame{
\frametitle{GPU}
\begin{itemize}
\item Now only 32 bits float supported (being worked on)
\item Only 1 GPU per process
\item Use the Theano flag device=gpu to tell to use the gpu device
\begin{itemize}
\item Use device=gpu[gpu\_id] to specify which GPU
\item Shared variable with float32 data are by default in the GPU memory space
\end{itemize}
\item Use the Theano flag floatX=float32
\begin{itemize}
\item Be sure to use floatX (theano.config.floatX) in your code
\item Cast input before putting them into a shared variable
\item Cast "problem": int32 with float32 $\to$ float64
\begin{itemize}
\item A new cast mechanism is being developed
\item Insert manual cast in your code or use [u]int{8,16}
\item Insert manual cast around the mean op (divide by the length that is a int64!)
\end{itemize}
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{GPU for Exercises:}
\begin{itemize}
\item Intel Core i7 980 XE(107Gf/s float64) (1050\$)
\item NVIDIA C2050(515 Gf/s float64, 1Tf/s float32), compute capability 2.0 (2400\$) 6 cores/12 threads
\item NVIDIA GTX580(1.5Tf/s float32), compute capability 2.0 (500\$) 512 cores
\item NVIDIA Quadro FX 580(71GF/s single), compute capability 1.1 (140\$ but a 'professional card'), 32 cores
\end{itemize}
%Device 0: "Quadro FX 580"
% Total amount of global memory: 536150016 bytes
% Multiprocessors x Cores/MP = Cores: 4 (MP) x 8 (Cores/MP) = 32 (Cores)
% Clock rate: 1.12 GHz
% Run time limit on kernels: Yes
% Compute mode: Default (multiple host
%threads can use this device simultaneously)
}
\frame{
\frametitle{Theano Exercises}
\begin{itemize}
\item Run the simple example
\item Run the real example
\item Modify your version to run in float32 with floatX.
\item Run your version on the CPU and GPU
\item Do you see a speed up with the GPU? Where does it come from?(Try to profile it)
\item Scan: modify the polynomial example to have the reduction done by scan
\end{itemize}
}
\section{PyCUDA}
\subsection{PyCUDA}
\frame{
\frametitle{Intro}
Authors: Andreas Kl\"{o}ckner
PyCUDA lets you access Nvidia's CUDA parallel computation API from Python. Several wrappers of the CUDA API already exist. So what's so special about PyCUDA?
\begin{itemize}
\item Object cleanup tied to lifetime of objects (RAII, Resource Acquisition Is Initialization).
\begin{itemize}
\item Makes it much easier to write correct, leak- and crash-free code
\item PyCUDA knows about dependencies, too, so (for example) it won't detach from a context before all memory allocated in it is also freed
\end{itemize}
\item Convenience
\begin{itemize}
\item Abstractions to compile CUDA code from python pycuda.driver.SourceModule
\item A GPU memory buffer pycuda.gpuarray.GPUArray
\end{itemize}
\item Completeness
\begin{itemize}
\item Binding to all of CUDA's driver API
\end{itemize}
\item Automatic Error Checking
\begin{itemize}
\item All CUDA errors are automatically translated into Python exceptions
\end{itemize}
\item Speed
\begin{itemize}
\item PyCUDA's base layer is written in C++
\end{itemize}
\item Helpful Documentation.
\end{itemize}
}
\begin{frame}[fragile]
\frametitle{Example}
\begin{Verbatim}
import pycuda.autoinit
import pycuda.driver as drv
import numpy
from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Example}
\begin{Verbatim}
multiply_them = mod.get_function("multiply_them")
a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)
multiply_them(
drv.Out(dest), drv.In(a), drv.In(b),
block=(400,1,1), grid=(1,1))
\end{Verbatim}
\end{frame}
\frame{
\frametitle{GpuArray}
No support for strides.
}
\subsection{PyCUDA+Theano}
\begin{frame}[fragile]
\frametitle{Theano Op Contract}
\begin{Verbatim}
class MyOp(Op):
def __eq__(self, other):
def __hash__(self):
def __str__(self):
def make_node(self, *inputs):
python implementation:
def perform(self, node, inputs_storage, outputs_storage):
c implementation: [see theano web site]
others implementation (pycuda, ...):
def make_thunk(self, node, storage_map, _, _2):
optional:
def __init__(self, ...):
def grad(self, inputs, g):
def infer_shape(node, (i0_shapes, i1_shapes, ...))
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano Op Example}
\begin{Verbatim}
import theano
class DoubleOp(theano.Op):
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, x):
x = theano.tensor.as_tensor_variable(x)
return theano.Apply(self, [x], [x.type()])
def perform(self, node, inputs, output_storage):
x = inputs[0]
z = output_storage[0]
z[0] = x * 2
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano Op Example: Test it!}
\begin{Verbatim}
x = theano.tensor.matrix()
f = theano.function([x],DoubleOp()(x))
import numpy
inp = numpy.random.rand(5,5)
out = f(inp)
assert numpy.allclose(inp*2, out)
print inp
print out
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example}
\begin{Verbatim}
import numpy, theano
import theano.misc.pycuda_init
from pycuda.compiler import SourceModule
import theano.sandbox.cuda as cuda
class PyCUDADoubleOp(theano.Op):
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, inp):
inp = cuda.basic_ops.gpu_contiguous(
cuda.basic_ops.as_cuda_ndarray_variable(inp))
assert inp.dtype == "float32"
return theano.Apply(self, [inp], [inp.type()])
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example: make\_thunk}
\begin{Verbatim}
def make_thunk(self, node, storage_map, _, _2):
mod = SourceModule( THE_C_CODE )
pycuda_fct = mod.get_function("my_fct")
inputs = [ storage_map[v] for v in node.inputs]
outputs = [ storage_map[v] for v in node.outputs]
def thunk():
z = outputs[0]
if z[0] is None or z[0].shape!=inputs[0][0].shape:
z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
grid = (int(numpy.ceil(inputs[0][0].size / 512.)),1)
pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
block=(512,1,1), grid=grid)
return thunk
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example: GPU Code}
\begin{Verbatim}
THE_C_CODE = """
__global__ void my_fct(float * i0, float * o0, int size) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<size){
o0[i] = i0[i]*2;
}
}"""
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example: Test it!}
\begin{Verbatim}
x = theano.tensor.fmatrix()
f = theano.function([x], PyCUDADoubleOp()(x))
xv=numpy.ones((4,5), dtype="float32")
assert numpy.allclose(f(xv), xv*2)
print numpy.asarray(f(xv))
\end{Verbatim}
\end{frame}
\begin{frame}
\frametitle{Theano+PyCUDA Exercises}
\begin{itemize}
\item Elemwise add: $x + y$
\item Elemwise with 2 outputs: $x + y$ and $x - y$
\item Elemwise with stride
\end{itemize}
\end{frame}
\section{GpuNdArray}
\subsection{GpuNdArray}
\frame{
\frametitle{Why a common GPU ndarray?}
\begin{itemize}
\item Currently there are at least 4 different GPU arrays in python only
\begin{itemize}
\item CudaNdarray(Theano), GPUArray(PyCUDA) and CUDAMatrix(cudamat), GPUArray(PyOpenCL), ...
\item There are even more if we include other languages
\end{itemize}
\item All of them are a subset of numpy.ndarray on the GPU!
\item Duplicate work
\begin{itemize}
\item GPU code is harder/slower to do {\bf correctly} and {\bf fast} than on the CPU/Python
\end{itemize}
\item Harder to port/reuse code
\item Harder to find/distribute code
\item Divides development work
\end{itemize}
}
\frame{
\frametitle{Design Goals}
\begin{itemize}
\item Make it VERY similar to numpy.ndarray
\item Be compatible with CUDA and OpenCL
\item Have the base object in C to allow collaboration with more projects
\begin{itemize}
\item We want people from C, C++, ruby, R, ... all use the same base GPU n-dimensional array
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Final GpuNdArray Note}
\begin{itemize}
\item Under development
\item Will be the next GPU ndarray for Theano (This summer!)
\item Probably also for PyCUDA, PyOpenCL
\item Mailing list: http://lists.tiker.net/listinfo/gpundarray
\end{itemize}
}
\section{Conclusion}
\subsection{Conclusion}
\frame{
\frametitle{Conclusion}
\begin{itemize}
\item I presented a tool that tries to be the holy grail in computing: {\bf easy to code} and {\bf fast to execute}!
\item Allows running code on the CPU and, in many cases, moving it to the GPU
\item Easy wrapping of existing GPU code in Theano
\item It {\bf works} and is {\bf used in real world}
\end{itemize}
}
\end{document}
import numpy, theano
import theano.misc.pycuda_init
from pycuda.compiler import SourceModule
import theano.sandbox.cuda as cuda
class PyCUDADoubleOp(theano.Op):
    """Theano Op that doubles a float32 matrix on the GPU via a PyCUDA kernel.

    The element-wise CUDA kernel is compiled with pycuda's SourceModule and
    executed through make_thunk instead of perform/c_code.
    """
    def __eq__(self, other):
        # Two instances are interchangeable: the op carries no parameters,
        # so equality is purely by type (required for graph merging).
        return type(self) == type(other)
    def __hash__(self):
        # Must be consistent with __eq__ above.
        return hash(type(self))
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, inp):
        # Move/convert the input to a CUDA ndarray variable and force a
        # C-contiguous layout (the kernel indexes a flat dense buffer).
        inp = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(inp))
        assert inp.dtype == "float32"
        # Output has the same type (dtype/broadcastable pattern) as the input.
        return theano.Apply(self, [inp], [inp.type()])
    def make_thunk(self, node, storage_map, _, _2):
        # Compile the CUDA kernel once, at function-compilation time.
        mod = SourceModule("""
__global__ void my_fct(float * i0, float * o0, int size) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if(i<size){
    o0[i] = i0[i]*2;
  }
}""")
        pycuda_fct = mod.get_function("my_fct")
        # Storage cells are one-element lists shared with the runtime;
        # cell[0] holds the current value (or None before first use).
        inputs = [ storage_map[v] for v in node.inputs]
        outputs = [ storage_map[v] for v in node.outputs]
        def thunk():
            z = outputs[0]
            # (Re)allocate the output buffer only when missing or when the
            # input shape changed since the last call.
            if z[0] is None or z[0].shape!=inputs[0][0].shape:
                z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
            # One thread per element, 512 threads per block; round the
            # block count up so every element is covered.
            grid = (int(numpy.ceil(inputs[0][0].size / 512.)),1)
            pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                 block=(512,1,1), grid=grid)
        return thunk
x = theano.tensor.fmatrix()
f = theano.function([x], PyCUDADoubleOp()(x))
xv=numpy.ones((4,5), dtype="float32")
assert numpy.allclose(f(xv), xv*2)
print numpy.asarray(f(xv))
import theano
a = theano.tensor.vector("a") # declare variable
b = a + a**10 # build symbolic expression
f = theano.function([a], b) # compile function
print f([0,1,2])
# prints `array([0,2,1026])`
theano.printing.pydotprint_variables(b, outfile="pics/f_unoptimized.png", var_with_name_simple=True)
theano.printing.pydotprint(f, outfile="pics/f_optimized.png", var_with_name_simple=True)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论