Commit 0d06d18f authored by Frederic Bastien

First committed version of the Theano Tutorial at HPCS 2011.

Parent: c75bcf17
# Build the slide deck: run pdflatex on the presentation source.
# NOTE: the recipe line must be indented with a TAB, otherwise make
# aborts with "missing separator" — the scraped source had lost it.
all:
	pdflatex presentation.tex
import numpy
import theano
class DoubleOp(theano.Op):
    """Toy Theano Op that returns its input multiplied elementwise by two."""

    def __eq__(self, other):
        # This Op carries no parameters, so two instances are
        # interchangeable whenever they share the same type.
        return type(self) == type(other)

    def __hash__(self):
        # Must stay consistent with __eq__ above.
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x):
        # Accept anything convertible to a tensor variable; the single
        # output has the same type as the (converted) input.
        x = theano.tensor.as_tensor_variable(x)
        return theano.Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage):
        # Pure-Python implementation: store the doubled value in the
        # one output storage cell.
        (x,) = inputs
        out = output_storage[0]
        out[0] = x * 2
x = theano.tensor.matrix()
f = theano.function([x], DoubleOp()(x))
inp = numpy.random.rand(5,5)
out = f(inp)
assert numpy.allclose(inp*2, out)
print inp
print out
import numpy
import theano
import theano.tensor as T
rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats).astype(theano.config.floatX), rng.randint(size=N,low=0, high=2).astype(theano.config.floatX))
training_steps = 10
# Declare Theano symbolic variables
x = T.matrix("x")
y = T.vector("y")
w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w")
b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
print "Initial model:"
print w.get_value(), b.get_value()
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b)) # Probabily of having a one
prediction = p_1 > 0.5 # The prediction that is done: 0 or 1
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1) # Cross-entropy
cost = xent.mean() + 0.01*(w**2).sum() # The cost to optimize
gw,gb = T.grad(cost, [w,b])
# Compile expressions to functions
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
updates={w:w-0.1*gw, b:b-0.1*gb},
name = "train")
predict = theano.function(inputs=[x], outputs=prediction,
name = "predict")
for i in range(training_steps):
pred, err = train(D[0], D[1])
print "Final model:"
print w.get_value(), b.get_value()
print "target values for D"
print D[1]
print "prediction on D"
print predict(D[0])
# Print the graph used in the slides
theano.printing.pydotprint(predict,
outfile="pics/logreg_pydotprint_predic.png",
var_with_name_simple=True)
theano.printing.pydotprint_variables(prediction,
outfile="pics/logreg_pydotprint_prediction.png",
var_with_name_simple=True)
theano.printing.pydotprint(train,
outfile="pics/logreg_pydotprint_train.png",
var_with_name_simple=True)
\documentclass[a4paper,9pt]{beamer}
\usetheme{Malmoe} % Now it's a beamer presentation with the lisa theme!
\setbeamertemplate{footline}[page number]
\usecolortheme{beaver}
\usepackage{url}
\usepackage{ragged2e}
\usepackage{multirow}
\usepackage{fancyvrb}
%\usepackage{color}
\def\imagetop#1{\vtop{\null\hbox{#1}}}
\logo{\includegraphics[width=.8in]{pics/UdeM_NoirBleu_logo_Marie_crop.pdf}}
% Standard LaTeX stuff - note the optional abbreviated title being provided
\title[GPU Programming made Easy]{GPU Programming made Easy}
\author[LISA lab]{Fr\'ed\'eric Bastien, Machine Learning Laboratory}
\date{
James Bergstra, Olivier Breuleux, Frederic Bastien, Pascal Lamblin, Razvan Pascanu, Guillaume Desjardins, Joseph Turian, David Warde-Farley, Olivier Delalleau, Arnaud Bergeron, Josh Bleecher Snyder, Ian Goodfellow, Fran\c{c}ois Savard, Xavier Glorot, Douglas Eck, Dumitru Erhan, Michael Mandel, Philippe Hamel, Simon Lemieux, Thierry Bertin-Mahieux, Yoshua Bengio
Presented on June 13\textsuperscript{th} 2011\\
HPCS 2011, Montr\'eal
}
\begin{document}
\frame{\titlepage}
\section{Overview}
\subsection{Overview}
\frame{
\frametitle{Overview 1}
\begin{itemize}
\item Introduction
\begin{itemize}
\item Why Scripting for GPUs?
\item Theano vs PyCUDA
\item Python in 1 slide
\item Numpy in 1 slide
\end{itemize}
\item Theano
\begin{itemize}
\item Intro
\item Simple example
\item Real example
\item Benchmarks
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Overview 2}
\begin{itemize}
\item Advanced Theano
\begin{itemize}
\item Compilation Pipeline
\item Profiling
\item Printing
\item Debugging
\item Scan(for-Loop generalization)
\item GPU
\item Exercises/break
\end{itemize}
\item PyCUDA
\begin{itemize}
\item Intro
\item Example
\item PyCUDA + Theano
\item Exercises
\end{itemize}
\item GpuNdArray
\item Conclusion
\end{itemize}
\begin{tabular}{lcr}
\imagetop{\includegraphics[width=.4in]{pics/lisabook_logo_text2.png}} &
%% \imagetop{\includegraphics[width=.2in]{pics/white.png}}&
\imagetop{\includegraphics[width=1.in]{pics/theano_logo.png}}&
\imagetop{\includegraphics[width=.6in]{pics/pycuda-logo-crop.pdf}}
\end{tabular}
}
\frame{
\frametitle{Won't cover}
\begin{itemize}
\item How to write GPU code
\item How to optimize GPU code
\end{itemize}
}
\section{Introduction}
\subsection{Introduction}
\frame{
\frametitle{Why GPU}
\begin{itemize}
\item Faster, cheaper, more efficient power usage
\item How much faster? I saw numbers from 100x slower to 1000x faster.
\begin{itemize}
\item It depends on the algorithms
\item How the benchmark is done
\begin{itemize}
\item Quality of implementation
\item How much time spent optimizing CPU vs GPU code
\end{itemize}
\item Theory:
\begin{itemize}
\item Intel Core i7 980 XE(107Gf/s float64) 6 cores
\item NVIDIA C2050(515 Gf/s float64, 1Tf/s float32) 480 cores
\item NVIDIA GTX580(1.5Tf/s float32) 512 cores
\end{itemize}
\end{itemize}
\item With Theano, up to 100x can be seen as we don't do multiple core on cpu (except for call to gemm)
\item If you see 1000x, it means the benchmark is not fair
\end{itemize}
}
\frame{
\frametitle{Why Scripting for GPUs}
They {\bf Complement each other}
\begin{itemize}
\item GPUs are everything that scripting/high level languages are not
\begin{itemize}
\item Highly parallel
\item Very architecture-sensitive
\item Built for maximum FP/memory throughput
\end{itemize}
\item CPU: largely restricted to control
\begin{itemize}
\item Optimized for sequential code
\item tasks (1000/sec)
\item Scripting fast enough
\item Theano = Mathematical expression compiler
\item Python + CUDA = PyCUDA
\item Python + OpenCL = PyOpenCL
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Theano vs PyCUDA vs PyOpenCL vs CUDA}
\begin{itemize}
\item Theano
\begin{itemize}
\item Mathematical expression compiler
\item Generates custom C and CUDA code
\item Uses python code when performance is not critical
\end{itemize}
\item CUDA
\begin{itemize}
\item C extension by NVIDIA that allows one to write code for and use the GPU
\end{itemize}
\item PyCUDA
\begin{itemize}
\item Python interface to CUDA
\item Memory management of GPU objects
\item Compilation of code for the low-level driver
\end{itemize}
\item PyOpenCL
\begin{itemize}
\item PyCUDA for OpenCL
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{What is your background ?}
Do you have experience with:
\begin{itemize}
\item Python
\item Numpy / Scipy / Matlab
\item Maple / Mathematica / SymPy
\item GPU programming / CUDA / OpenCL
\item Cython / Weave / Numexpr
\item C / Java / Fortran
\end{itemize}
}
\frame{
\frametitle{Python in 1 Slide}
\begin{itemize}
\item Interpreted language
\item General-purpose high-level programming language
\item OO and scripting language
\item Emphasizes code readability
\item Large and comprehensive standard library
\item Indentation for block delimiters
\item Dynamic type and memory management
\item Dictionary \texttt{d=\{'var1':'value1', 'var2':42, ...\}}
%\item List comprehension: [i+3 for i in range(10)] not used in the tutorial
\end{itemize}
}
\frame{
\frametitle{Numpy in 1 Slide}
\begin{itemize}
\item Base scientific computing package on the CPU
\item A powerful N-dimensional array object
\begin{itemize}
\item ndarray.\{ndim, shape, size, dtype, itemsize, stride\}
\end{itemize}
\item Sophisticated (broadcasting) functions
\begin{itemize}
\item numpy.random.rand(4,5) * numpy.random.rand(1,5) = mat(4,5)
\item numpy.random.rand(4,5) * numpy.random.rand(4,1) = mat(4,5)
\item numpy.random.rand(4,5) * numpy.random.rand(5) = mat(4,5)
\end{itemize}
\item Tools for integrating C/C++ and Fortran code
\item Linear algebra, Fourier transform and random number capable
\end{itemize}
}
%\frame{
% \frametitle{Competitors TODO: Remove? Missing many I think!}
% There are some competitors for easy computing on gpu.
% \begin{itemize}
% \item Jacket(GPU for matlab): http://www.accelereyes.com/
% \item GPUmat(GPU for matlab, free): http://gp-you.org/
% \item numexpr, algopy
% \end{itemize}
%}
\section{Theano}
\subsection{Introduction}
\frame{
%% \frametitle{Theano}
\begin{center}
\includegraphics[width=3.in]{../images/theano_logo_allblue_350x95.png}
% \includegraphics[width=3.in]{../images/theano_logo_allblue_200x54.png}
\end{center}
}
\frame{
\frametitle{Pointers}
\begin{itemize}
\item Website: http://deeplearning.net/software/theano/
\item Announcements mailing list: http://groups.google.com/group/theano-announce
\item User mailing list: http://groups.google.com/group/theano-users
\item Deep Learning Tutorials: http://www.deeplearning.net/tutorial/
\end{itemize}
}
\frame{
\frametitle{Description}
\begin{itemize}
\item Mathematical expression compiler
\item Statically typed and purely functional
\item Dynamic C/CUDA code generation
\item Efficient symbolic differentiation
\begin{itemize}
\item Theano computes derivatives of functions with one or many inputs.
\end{itemize}
\item Speed and stability optimizations
\begin{itemize}
\item Gives the right answer for log(1+x) even if x is really tiny.
\end{itemize}
\item Extensive unit-testing and self-verification
\begin{itemize}
\item Detects and diagnoses many types of errors
\end{itemize}
\item Expressions mimic NumPy's syntax \& semantics
\item Works on linux, Mac and Windows
\end{itemize}
}
\frame{
\frametitle{Description 2}
\begin{itemize}
\item Transparent use of a GPU
\begin{itemize}
\item float32 only for now (working on other data types)
\item Doesn't work on Windows for now
\item On GPU, data-intensive calculations are typically between 6.5x and 44x faster. We've seen speed-ups of up to 140x
\end{itemize}
\item On CPU, common machine learning-algorithms are 1.6x to 7.5x faster than competitive alternatives
\begin{itemize}
\item including those in C/C++, NumPy, SciPy, and Matlab
\end{itemize}
\item The project was started by James Bergstra and Olivier Breuleux
\item For the past 1-2 years, I have been taking over as lead contributor
\item Some Sparse operation (cpu only)
\end{itemize}
}
\frame{
\frametitle{Why Theano is better}
Executing the code is faster because:
\begin{itemize}
\item Rearranges high-level expressions
\item Produces customized low-level code
\item Can use a variety of backend technologies(GPU,...)
\end{itemize}
\vfill
Writing the code is faster because:
\begin{itemize}
\item High-level language allows to concentrate on the algorithm
\item Automatic optimization
\begin{itemize}
\item No need to manually optimize for each algo you want to test
\end{itemize}
\item Automatic efficient symbolic differentiation
\begin{itemize}
\item No need to manually differentiate your functions
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Project Status}
Why you can rely on Theano:
\begin{itemize}
\item Theano has been developed and used since January 2008 (3.5 yrs old)
\item Core technology for a funded Silicon-Valley startup
\item Used to teach IFT6266 for two years
\item Used by everyone in our lab (\textasciitilde 30 people)
\item Driven over 40 research papers over the last few years
\item Active mailing list with participants from outside our lab
\item Good user documentation
\item Many contributors (some from outside our lab)
\item Some(lots?) of users beyond our lab.
\item Deep Learning Tutorials
\item Unofficial RPMs for Mandriva
\item Downloads (June 8 2011, since last January):
\begin{itemize}
\item Pypi 780
\item MLOSS: 483
\item Assembla(main repo): unknown
\end{itemize}
\end{itemize}
}
\newcommand\codeHighlight[1]{\textcolor[rgb]{1,0,0}{\textbf{#1}}}
\subsection{Simple Example}
\begin{frame}[fragile]
\frametitle{Simple Example}
\begin{Verbatim}[commandchars=\\\{\}]
import theano
a = theano.tensor.vector("a") {\color{gray} # declare symbolic variable}
b = a + a**10 {\color{gray} # build symbolic expression}
f = theano.function([a], b) {\color{gray} # compile function}
print f([0,1,2]) {\color{gray} # prints `array([0,2,1026])`}
\end{Verbatim}
\includegraphics[width=1.2in]{pics/f_unoptimized.png}
\end{frame}
\frame{
\frametitle{Simple Example: Optimized graph}
{\bf no pow, fused elemwise op!}
\includegraphics[width=2.3in]{pics/f_optimized.png}
Symbolic programming
\begin{itemize}
\item Paradigm change: people need to use it to understand it
\end{itemize}
}
\frame{
\frametitle{A Real Example: Logistic Regression}
\begin{itemize}
\item GPU-ready
\item Symbolic differentiation
\item Speed optimizations
\item Stability optimizations
\end{itemize}
}
\begin{frame}[fragile]
\frametitle{A Real Example: Logistic Regression}
\begin{Verbatim}[commandchars=\\\{\}]
import numpy
import theano
import theano.tensor as T
rng = numpy.random
# Declare Theano symbolic variables
x = T.matrix("x")
y = T.vector("y")
\codeHighlight{w = theano.shared(rng.randn(100), name="w")}
\codeHighlight{b = theano.shared(0., name="b")}
print "Initial model:"
print w.get_value(), b.get_value()
\end{Verbatim}
\end{frame}
\subsection{Real Example}
\begin{frame}[fragile]
\frametitle{A Real Example: Logistic Regression}
\begin{Verbatim}[commandchars=\\\{\}]
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b)) {\color{gray}# Probability of having a one}
prediction = p_1 > 0.5 {\color{gray}# The prediction: 0 or 1}
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1) {\color{gray}# Cross-entropy}
cost = xent.mean() + 0.01*(w**2).sum() {\color{gray}# The cost to optimize}
\codeHighlight{gw,gb = T.grad(cost, [w,b])}
\end{Verbatim}
\begin{itemize}
\item T.grad works symbolically: takes and returns a Theano variable
\item T.grad can be compared to a macro. So it can be applied multiple times
\item T.grad takes scalar costs only
\item Simple recipe allows to compute efficiently vector*Jacobian and vector*Hessian
\item We are working on the missing optimizations to be able to compute efficiently the full Jacobian and Hessian.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{A Real Example: Logistic Regression}
\begin{Verbatim}[commandchars=\\\{\}]
# Compile expressions to functions
train = theano.function(
inputs=[x,y],
\codeHighlight{outputs=[prediction, xent]},
\codeHighlight{updates=\{w:w-0.1*gw, b:b-0.1*gb\}})
predict = theano.function(inputs=[x], outputs=prediction)
N = 4
feats = 100
D = (rng.randn(N, feats), rng.randint(size=4,low=0, high=2))
training_steps = 10
for i in range(training_steps):
pred, err = train(D[0], D[1])
print "Final model:"
print w.get_value(), b.get_value()
print "target values for D:", D[1]
print "prediction on D:", predict(D[0])
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{A Real Example: optimization}
\begin{Verbatim}[commandchars=\\\{\}]
{\color{gray}# Construct Theano expression graph}
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b))
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1)
prediction = p_1 > 0.5
cost = xent.mean() + 0.01*(w**2).sum()
gw,gb = T.grad(cost, [w,b])
{\color{gray}# Compile expressions to functions}
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
updates=\{w:w-0.1*gw, b:b-0.1*gb\}) {\color{gray}# This is a dictionary}
\end{Verbatim}
Where are those optimization applied?
\begin{itemize}
\item Log(1+exp(x))
\item 1 / (1 + T.exp(var)) (sigmoid)
\item Log(1-sigmoid(var)) (softplus, stabilisation)
\item GEMV
\item Loop fusion
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{A Real Example: optimization!}
\begin{Verbatim}[commandchars=\\\{\}]
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b))
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1)
\codeHighlight{# 1 / (1 + T.exp(var)) -> sigmoid(var)}
\codeHighlight{# Log(1-sigmoid(var)) -> -sigmoid(var)}
prediction = p_1 > 0.5
cost = xent.mean() + 0.01*(w**2).sum()
gw,gb = T.grad(cost, [w,b])
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
\codeHighlight{# w-0.1*gw: GEMV with the dot in th grad}
updates=\{w:w-0.1*gw, b:b-0.1*gb\})
\end{Verbatim}
\begin{itemize}
\item Loop fusion in many places
\end{itemize}
\end{frame}
\frame{
\frametitle{Nb Dimensions, dtype and Broadcast}
\begin{itemize}
\item T.scalar, T.vector, T.matrix, T.row, T.col
\item T.row(floatX), T.[fdczbwil]row (float32, float64, complex64, complex128, int8, int16, int32, int64)
\end{itemize}
\begin{itemize}
\item All are shortcuts to: T.tensor(dtype, broadcastable=([False,True])*nd)
\item Other dtype: uint[8,16,32,64]
\item floatX: configurable dtype that can be float32 or float64.
\end{itemize}
}
\subsection{Benchmarks}
\frame{
\frametitle{Benchmarks}
Example:
\begin{itemize}
\item Multi-layer perceptron
\item Convolutional Neural Networks
\item Misc Elemwise operations
\end{itemize}
Competitors: Numpy+Scipy, MATLAB, EBLearn, Torch5, numexpr
}
\frame{
\frametitle{Benchmark MLP}
Multi-Layer Perceptron: 60x784 matrix times 784x500 matrix, tanh, times 500x10 matrix, elemwise, then all in reverse for backpropagation
\begin{center}
\includegraphics[width=3.in]{pics/mlp.pdf}
\end{center}
}
\frame{
\frametitle{Benchmark Convolutional Network}
Convolutional Network: 256x256 images convolved with 6 7x7 filters, downsampled to 6x50x50, tanh, convolution with 16 6x7x7 filter, tanh, matrix multiply, elemwise, then in reverse
\begin{center}
\includegraphics[width=3.in]{pics/conv.pdf}
\end{center}
}
\frame{
\frametitle{Benchmark elemwise}
\begin{itemize}
\item All on CPU
\item Solid blue: Theano
\item Dashed Red: numexpr(without MKL)
\end{itemize}
\begin{center}
\includegraphics[width=3.in]{pics/multiple_graph.pdf}
\end{center}
}
\section{Advanced Theano}
\subsection{Misc}
\frame{
\frametitle{Theano Flags}
Theano can be configured with flags. They can be defined in two ways
\begin{itemize}
\item With the environment variable: THEANO\_FLAGS="mode=ProfileMode,ProfileMode.profile\_memory=True"
\item With a configuration file that defaults to \textasciitilde/.theanorc
\end{itemize}
}
\frame{
\frametitle{Theano Graph}
\begin{itemize}
\item Theano works with symbolic graphs
\item Those graphs are bi-partite graph (graph with 2 types of nodes)
\item Those 2 nodes types are Apply and Variable nodes
\end{itemize}
\begin{itemize}
\item Inputs and Outputs are list of Theano variables
\item Can navigate through the graph from any point to any point
\end{itemize}
\begin{center}
\includegraphics[width=3.5in]{pics/apply_node.pdf}
\end{center}
}
\subsection{Pipeline}
\frame{
\frametitle{Compilation Pipeline}
\begin{center}
\includegraphics[width=2.7in]{pics/pipeline.pdf}
\end{center}
}
\subsection{Profiling}
\begin{frame}[fragile]
\frametitle{Profile Mode}
To replace the default mode with this mode, use the theano flags ``mode=ProfileMode''.
To enable the memory profiling use the flags ProfileMode.profile\_memory=True
\begin{Verbatim}
Time since import 1.486s
Theano compile time: 1.017s (67.9% since import)
Optimization time: 0.805s
Linker time: 0.199s
Theano fct call 0.002s (0.1% since import)
Theano Op time 0.001s 0.0%(since import) 36.8%(of fct call)
Theano function overhead in ProfileMode 0.001s 0.1%(since import)
63.2%(of fct call)
11 Theano fct call, 0.000s per call
Rest of the time since import 0.479s 32.0%
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profile Mode: Function Summary}
Theano outputs:
\vfill
\begin{Verbatim}
Theano fct summary:
<% total fct time> <total time> <time per call> <nb call> <fct name>
97.1% 0.002s 1.64e-04s 10 train
2.9% 0.000s 4.91e-05s 1 predict
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profile Mode: Single Op-Wise Summary}
Theano outputs:
\vfill
\begin{Verbatim}
Single Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call> <nb_call>
<nb_op> <nb_apply> <Op name>
30.8% 30.8% 0.000s 0.000s 1.86e-05s 10 1 1 <'Gemv'>
23.8% 54.6% 0.000s 0.000s 1.58e-06s * 91 10 10 <'Elemwise'>
18.3% 72.9% 0.000s 0.000s 1.10e-05s 10 1 1 <'Alloc'>
15.9% 88.7% 0.000s 0.001s 8.71e-06s 11 1 2 <'Dot'>
7.7% 96.4% 0.000s 0.001s 1.49e-06s * 31 2 4 <'DimShuffle'>
2.0% 98.4% 0.000s 0.001s 1.22e-06s * 10 1 1 <'Sum'>
1.6% 100.0% 0.000s 0.001s 9.78e-07s * 10 1 1 <'Shape_i'>
(*) Op is running a c implementation
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profile Mode: Op-Wise Summary}
Theano outputs:
\vfill
\begin{Verbatim}
Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call>
<nb_call> <nb apply> <Op name>
31.4% 31.4% 0.000s 0.000s 1.93e-05s 10 1 Gemv{inplace}
16.9% 48.3% 0.000s 0.000s 1.04e-05s 10 1 Alloc
15.5% 63.8% 0.000s 0.000s 8.65e-06s 11 2 dot
5.0% 68.8% 0.000s 0.000s 3.05e-06s * 10 1 Elemwise{
Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}
4.3% 73.1% 0.000s 0.000s 1.27e-06s * 21 3 InplaceDimShuffle{x}
3.3% 76.4% 0.000s 0.000s 2.00e-06s * 10 1 Elemwise{sub,no_inplace}
2.9% 79.3% 0.000s 0.000s 1.79e-06s * 10 1 Elemwise{gt,no_inplace}
2.5% 84.5% 0.000s 0.001s 1.53e-06s * 10 1 InplaceDimShuffle{1,0}
... (remaining 9 Apply account for 18.3%(0.00s) of the runtime)
(*) Op is running a c implementation
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profile Mode: Apply-Wise Summary}
Theano outputs:
\vfill
\begin{Verbatim}
Apply-wise summary:
<% of local_time spent at this position> <cumulative %%>
<apply time> <cumulative seconds> <time per call>
<nb_call> <Apply position> <Apply Op name>
29.8% 29.8% 0.000s 0.000s 1.96e-05s 10 15 Gemv{inplace}
(<TensorType(float64, vector)>, {-0.1}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, {0.998})
15.8% 45.6% 0.000s 0.000s 1.04e-05s 10 10 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
14.0% 59.6% 0.000s 0.000s 9.20e-06s 10 1 dot(<TensorType(float64, matrix)>, <TensorType(float64, vector)>)
5.6% 65.2% 0.000s 0.000s 3.67e-06s 10 9 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}(<TensorType(float64, vector)>, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0)
3.2% 68.4% 0.000s 0.000s 2.12e-06s 10 4 Elemwise{sub,no_inplace}(TensorConstant{[ 1.]}, <TensorType(float64, vector)>)
2.9% 71.3% 0.000s 0.000s 1.93e-06s 10 12 Elemwise{gt,no_inplace}(Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}}}[(0, 0)].0, TensorConstant{[ 0.5]})
... (remaining 14 Apply instances account for 28.6%(0.00s) of the runtime)
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profile Mode: Memory Profile}
Theano outputs:
\vfill
\begin{Verbatim}
Profile of Theano functions memory:
(This check only the output of each apply node. It don't check the
temporary memory used by the op in the apply node.)
Theano fct: train
Max without gc, inplace and view (KB) 4
Max FAST_RUN_NO_GC (KB) 0
Max FAST_RUN (KB) 0
Memory saved by view (KB) 3
Memory saved by inplace (KB) 0
Memory saved by GC (KB) 0
<Sum apply outputs (bytes)> <Apply outputs memory size(bytes)>
<created/inplace/view> <Apply node>
3200B [3200] v InplaceDimShuffle{1,0}(<TensorType(float64, matrix)>)
800B [800] i Gemv{inplace}(<TensorType(float64, vector)>, TensorConstant{-0.1}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, TensorConstant{0.998})
32B [32] c Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profile Mode: Tips}
Theano outputs:
\vfill
\begin{Verbatim}
Here are tips to potentially make your code run faster
(if you think of new ones, suggest them on the mailing list).
Test them first, as they are not guaranteed to always provide a speedup.
- Try the Theano flag floatX=float32
\end{Verbatim}
\end{frame}
\subsection{Printing}
\begin{frame}[fragile]
\frametitle{Text Printing of Graph: Pretty Printing}
theano.printing.pprint(variable)
\vfill
\begin{Verbatim}
>>> theano.printing.pprint(prediction)
gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))),
TensorConstant{0.5})
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Text Printing of Graph: Debug Print}
theano.printing.debugprint({fct, variable, list of variables})
\vfill
\small
\begin{Verbatim}
>>> theano.printing.debugprint(prediction)
Elemwise{gt,no_inplace} [@181772236] ''
|Elemwise{true_div,no_inplace} [@181746668] ''
| |InplaceDimShuffle{x} [@181746412] ''
| | |TensorConstant{1} [@181745836]
| |Elemwise{add,no_inplace} [@181745644] ''
| | |InplaceDimShuffle{x} [@181745420] ''
| | | |TensorConstant{1} [@181744844]
| | |Elemwise{exp,no_inplace} [@181744652] ''
| | | |Elemwise{sub,no_inplace} [@181744012] ''
| | | | |Elemwise{neg,no_inplace} [@181730764] ''
| | | | | |dot [@181729676] ''
| | | | | | |x [@181563948]
| | | | | | |w [@181729964]
| | | | |InplaceDimShuffle{x} [@181743788] ''
| | | | | |b [@181730156]
|InplaceDimShuffle{x} [@181771788] ''
| |TensorConstant{0.5} [@181771148]
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Text Printing of Graph: Debug Print}
theano.printing.debugprint({fct, variable, list of variables})
\vfill
\small
\begin{Verbatim}
>>> theano.printing.debugprint(predict)
Elemwise{Composite{neg,{sub,{{scalar_sigmoid,GT},neg}}}} [@183160204] '' 2
|dot [@183018796] '' 1
| |x [@183000780]
| |w [@183000812]
|InplaceDimShuffle{x} [@183133580] '' 0
| |b [@183000876]
|TensorConstant{[ 0.5]} [@183084108]
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Picture Printing of Graph}
\begin{Verbatim}
>>> theano.printing.pydotprint_variables(prediction)
\end{Verbatim}
\includegraphics[width=2.0in]{pics/logreg_pydotprint_prediction.png}
\end{frame}
\begin{frame}[fragile]
\frametitle{Picture Printing of Graph}
\begin{Verbatim}
>>> theano.printing.pydotprint(predict)
\end{Verbatim}
\includegraphics[width=4in]{pics/logreg_pydotprint_predic.png}
\end{frame}
\begin{frame}[fragile]
\frametitle{Picture Printing of Graph}
\begin{Verbatim}[commandchars=\\\{\}]
>>> theano.printing.pydotprint(train) {\color{gray}# This is a small train example!}
\end{Verbatim}
\hspace{-.8cm}
\includegraphics[width=5.0in]{pics/logreg_pydotprint_train.png}
\end{frame}
\subsection{Debugging}
\frame{
\frametitle{How to Debug}
\begin{itemize}
\item Run with the flag mode=DebugMode
\begin{itemize}
\item 100-1000x slower
\item Test all optimization steps from the original graph to the final graph
\item Checks many properties that Ops should/shouldn't do
\item Executes the Python and C code versions
\end{itemize}
\item Run with the flag mode=FAST\_COMPILE
\begin{itemize}
\item Few optimizations
\item Run Python code (better error messages and can go in the python debugger)
\end{itemize}
\item Run with the Theano flag compute\_test\_value = {``off'', ``ignore'', ``warn'', ``raise''}
\begin{itemize}
\item Run the code as we create the graph
\item Allow to find the bug earlier (ex: shape mismatch)
\item Make identification of the wrong line in the code easier
\item Use the value of constant and shared variable directly
\item For pure symbolic varible use: x.tag.test\_value = numpy.random.rand(5,10)
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Known Limitation}
\begin{itemize}
\item Compilation phase distinct from execution phase
\item Compilation time significant
\begin{itemize}
\item Amortize it with functions over big input or reuse functions
\end{itemize}
\item Execution overhead (We have something in a branch that lowers it)
\begin{itemize}
\item Needs a certain number of operations to be useful
\end{itemize}
\item Compilation time super linear to the size of the graph.
\begin{itemize}
\item A few hundreds node OK
\item You can disable some optimizations to make it faster with bigger graphs
\item When this happened to us, it always indicated a problem in the graph
\end{itemize}
\end{itemize}
}
\subsection{Loop}
\frame{
\frametitle{Scan}
\begin{itemize}
\item General form of {\bf recurrence}, which can be used for looping.
\item {\bf Reduction} and {\bf map}(loop over the leading dimensions) are special case of Scan
\item You *scan* a function along some input sequence, producing an
output at each time-step
\item The function can see the {\bf previous K time-steps} of your function
\item ``sum()`` could be computed by scanning the $z + x_i$ function
over a list, given an initial state of ``z=0``.
\item Often a for-loop can be expressed as a ``scan()`` operation, and
``scan`` is the closest that Theano comes to looping.
\item The advantage of using ``scan`` over for loops is that it allows
the number of iterations to be part of the symbolic graph.
\item calls: ``scan()``, ``map()``, ``reduce()``, ``foldl()``, ``foldr()``.
\end{itemize}
}
\begin{frame}[fragile]
\frametitle{Scan Example: Computing pow(A,k)}
\begin{Verbatim}
k = T.iscalar("k"); A = T.vector("A")
def inner_fct(prior_result, A): return prior_result * A
# Symbolic description of the result
result, updates = theano.scan(fn=inner_fct,
outputs_info=T.ones_like(A),
non_sequences=A, n_steps=k)
# Scan has provided us with A**1 through A**k. Keep only the last
# value. Scan notices this and does not waste memory saving them.
final_result = result[-1]
power = theano.function(inputs=[A,k], outputs=final_result,
updates=updates)
print power(range(10),2)
#[ 0. 1. 4. 9. 16. 25. 36. 49. 64. 81.]
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Scan Example: Calculating a Polynomial}
\begin{Verbatim}
coefficients = theano.tensor.vector("coefficients")
x = T.scalar("x"); max_coefficients_supported = 10000
# Generate the components of the polynomial
full_range=theano.tensor.arange(max_coefficients_supported)
components, updates = theano.scan(fn=lambda coeff, power, free_var:
coeff * (free_var ** power),
outputs_info=None,
sequences=[coefficients, full_range],
non_sequences=x)
polynomial = components.sum()
calculate_polynomial = theano.function(inputs=[coefficients, x],
outputs=polynomial)
test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
print calculate_polynomial(test_coeff, 3)
# 19.0
\end{Verbatim}
\end{frame}
\subsection{GPU}
\frame{
\frametitle{GPU}
\begin{itemize}
\item Now only 32 bits float supported (being worked on)
\item Only 1 GPU per process
\item Use the Theano flag device=gpu to tell to use the gpu device
\begin{itemize}
\item Use device=gpu[gpu\_id] to specify which GPU
\item Shared variable with float32 data are by default in the GPU memory space
\end{itemize}
\item Use the Theano flag floatX=float32
\begin{itemize}
\item Be sure to use floatX (theano.config.floatX) in your code
\item Cast input before putting them into a shared variable
\item Cast "problem": int32 with float32 $\to$ float64
\begin{itemize}
\item A new cast mechanism is being developed
\item Insert manual cast in your code or use [u]int{8,16}
\item Insert manual cast around the mean op (divide by the length that is a int64!)
\end{itemize}
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{GPU for Exercises:}
\begin{itemize}
\item Intel Core i7 980 XE(107Gf/s float64) (1050\$)
\item NVIDIA C2050(515 Gf/s float64, 1Tf/s float32), compute capability 2.0 (2400\$) 6 cores/12 threads
\item NVIDIA GTX580(1.5Tf/s float32), compute capability 2.0 (500\$) 512 cores
\item NVIDIA Quadro FX 580(71GF/s single), compute capability 1.1 (140\$ but a 'professional card'), 32 cores
\end{itemize}
%Device 0: "Quadro FX 580"
% Total amount of global memory: 536150016 bytes
% Multiprocessors x Cores/MP = Cores: 4 (MP) x 8 (Cores/MP) = 32 (Cores)
% Clock rate: 1.12 GHz
% Run time limit on kernels: Yes
% Compute mode: Default (multiple host
%threads can use this device simultaneously)
}
\frame{
\frametitle{Theano Exercises}
\begin{itemize}
\item Run the simple example
\item Run the real example
\item Modify your version to run in float32 with floatX.
\item Run your version on the CPU and GPU
\item Do you see a speed up with the GPU? Where does it come from?(Try to profile it)
\item Scan: modify the polynomial example to have the reduction done by scan
\end{itemize}
}
\section{PyCUDA}
\subsection{PyCUDA}
\frame{
\frametitle{Intro}
Authors: Andreas Kl\"{o}ckner
PyCUDA lets you access Nvidia's CUDA parallel computation API from Python. Several wrappers of the CUDA API already exist. So what's so special about PyCUDA?
\begin{itemize}
\item Object cleanup tied to lifetime of objects (RAII, Resource Acquisition Is Initialization).
\begin{itemize}
\item Makes it much easier to write correct, leak- and crash-free code
\item PyCUDA knows about dependencies, too, so (for example) it won't detach from a context before all memory allocated in it is also freed
\end{itemize}
\item Convenience
\begin{itemize}
\item Abstractions to compile CUDA code from python pycuda.driver.SourceModule
\item A GPU memory buffer pycuda.gpuarray.GPUArray
\end{itemize}
\item Completeness
\begin{itemize}
\item Binding to all of CUDA's driver API
\end{itemize}
\item Automatic Error Checking
\begin{itemize}
\item All CUDA errors are automatically translated into Python exceptions
\end{itemize}
\item Speed
\begin{itemize}
\item PyCUDA's base layer is written in C++
\end{itemize}
\item Helpful Documentation.
\end{itemize}
}
\begin{frame}[fragile]
\frametitle{Example}
\begin{Verbatim}
import pycuda.autoinit
import pycuda.driver as drv
import numpy
from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Example}
\begin{Verbatim}
multiply_them = mod.get_function("multiply_them")
a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)
multiply_them(
drv.Out(dest), drv.In(a), drv.In(b),
block=(400,1,1), grid=(1,1))
\end{Verbatim}
\end{frame}
\frame{
\frametitle{GpuArray}
No support for strides.
}
\subsection{PyCUDA+Theano}
\begin{frame}[fragile]
\frametitle{Theano Op Contract}
\begin{Verbatim}
class MyOp(Op):
def __eq__(self, other):
def __hash__(self):
def __str__(self):
def make_node(self, *inputs):
python implementation:
def perform(self, node, inputs_storage, outputs_storage):
c implementation: [see theano web site]
others implementation (pycuda, ...):
def make_thunk(self, node, storage_map, _, _2):
optional:
def __init__(self, ...):
def grad(self, inputs, g):
def infer_shape(node, (i0_shapes, i1_shapes, ...))
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano Op Example}
\begin{Verbatim}
import theano
class DoubleOp(theano.Op):
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, x):
x = theano.tensor.as_tensor_variable(x)
return theano.Apply(self, [x], [x.type()])
def perform(self, node, inputs, output_storage):
x = inputs[0]
z = output_storage[0]
z[0] = x * 2
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano Op Example: Test it!}
\begin{Verbatim}
x = theano.tensor.matrix()
f = theano.function([x],DoubleOp()(x))
import numpy
inp = numpy.random.rand(5,5)
out = f(inp)
assert numpy.allclose(inp*2, out)
print inp
print out
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example}
\begin{Verbatim}
import numpy, theano
import theano.misc.pycuda_init
from pycuda.compiler import SourceModule
import theano.sandbox.cuda as cuda
class PyCUDADoubleOp(theano.Op):
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, inp):
inp = cuda.basic_ops.gpu_contiguous(
cuda.basic_ops.as_cuda_ndarray_variable(inp))
assert inp.dtype == "float32"
return theano.Apply(self, [inp], [inp.type()])
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example: make\_thunk}
\begin{Verbatim}
def make_thunk(self, node, storage_map, _, _2):
mod = SourceModule( THE_C_CODE )
pycuda_fct = mod.get_function("my_fct")
inputs = [ storage_map[v] for v in node.inputs]
outputs = [ storage_map[v] for v in node.outputs]
def thunk():
z = outputs[0]
if z[0] is None or z[0].shape!=inputs[0][0].shape:
z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
grid = (int(numpy.ceil(inputs[0][0].size / 512.)),1)
pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
block=(512,1,1), grid=grid)
return thunk
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example: GPU Code}
\begin{Verbatim}
THE_C_CODE = """
__global__ void my_fct(float * i0, float * o0, int size) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<size){
o0[i] = i0[i]*2;
}
}"""
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example: Test it!}
\begin{Verbatim}
x = theano.tensor.fmatrix()
f = theano.function([x], PyCUDADoubleOp()(x))
xv=numpy.ones((4,5), dtype="float32")
assert numpy.allclose(f(xv), xv*2)
print numpy.asarray(f(xv))
\end{Verbatim}
\end{frame}
\begin{frame}
\frametitle{Theano+PyCUDA Exercises}
\begin{itemize}
\item Elemwise add: $x + y$
\item Elemwise with 2 outputs: $x + y$ and $x - y$
\item Elemwise with stride
\end{itemize}
\end{frame}
\section{GpuNdArray}
\subsection{GpuNdArray}
\frame{
\frametitle{Why a common GPU ndarray?}
\begin{itemize}
\item Currently there are at least 4 different GPU arrays in python only
\begin{itemize}
\item CudaNdarray(Theano), GPUArray(PyCUDA) and CUDAMatrix(cudamat), GPUArray(PyOpenCL), ...
\item There are even more if we include other languages
\end{itemize}
\item All of them are a subset of numpy.ndarray on the GPU!
\item Duplicate work
\begin{itemize}
\item GPU code is harder/slower to do {\bf correctly} and {\bf fast} than on the CPU/Python
\end{itemize}
\item Harder to port/reuse code
\item Harder to find/distribute code
\item Divides development work
\end{itemize}
}
\frame{
\frametitle{Design Goals}
\begin{itemize}
\item Make it VERY similar to numpy.ndarray
\item Be compatible with CUDA and OpenCL
\item Have the base object in C to allow collaboration with more projects
\begin{itemize}
\item We want people from C, C++, ruby, R, ... all use the same base GPU n-dimensional array
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Final GpuNdArray Note}
\begin{itemize}
\item Under development
\item Will be the next GPU ndarray for Theano (This summer!)
\item Probably also for PyCUDA, PyOpenCL
\item Mailing list: http://lists.tiker.net/listinfo/gpundarray
\end{itemize}
}
\section{Conclusion}
\subsection{Conclusion}
\frame{
\frametitle{Conclusion}
\begin{itemize}
\item I presented a tool that tries to be the holy grail in computing: {\bf easy to code} and {\bf fast to execute}!
\item Allows running code on the CPU and, in many cases, moving it to the GPU
\item Easy wrapping of existing GPU code in Theano
\item It {\bf works} and is {\bf used in real world}
\end{itemize}
}
\end{document}
import numpy, theano
import theano.misc.pycuda_init
from pycuda.compiler import SourceModule
import theano.sandbox.cuda as cuda
class PyCUDADoubleOp(theano.Op):
    """Theano Op that doubles a float32 matrix on the GPU via a PyCUDA kernel.

    The element-wise CUDA kernel is compiled with pycuda's SourceModule and
    executed through make_thunk instead of perform/c_code.
    """
    def __eq__(self, other):
        # Two instances are interchangeable: the op carries no parameters,
        # so equality is purely by type (required for graph merging).
        return type(self) == type(other)
    def __hash__(self):
        # Must be consistent with __eq__ above.
        return hash(type(self))
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, inp):
        # Move/convert the input to a CUDA ndarray variable and force a
        # C-contiguous layout (the kernel indexes a flat dense buffer).
        inp = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(inp))
        assert inp.dtype == "float32"
        # Output has the same type (dtype/broadcastable pattern) as the input.
        return theano.Apply(self, [inp], [inp.type()])
    def make_thunk(self, node, storage_map, _, _2):
        # Compile the CUDA kernel once, at function-compilation time.
        mod = SourceModule("""
__global__ void my_fct(float * i0, float * o0, int size) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if(i<size){
    o0[i] = i0[i]*2;
  }
}""")
        pycuda_fct = mod.get_function("my_fct")
        # Storage cells are one-element lists shared with the runtime;
        # cell[0] holds the current value (or None before first use).
        inputs = [ storage_map[v] for v in node.inputs]
        outputs = [ storage_map[v] for v in node.outputs]
        def thunk():
            z = outputs[0]
            # (Re)allocate the output buffer only when missing or when the
            # input shape changed since the last call.
            if z[0] is None or z[0].shape!=inputs[0][0].shape:
                z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
            # One thread per element, 512 threads per block; round the
            # block count up so every element is covered.
            grid = (int(numpy.ceil(inputs[0][0].size / 512.)),1)
            pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                 block=(512,1,1), grid=grid)
        return thunk
x = theano.tensor.fmatrix()
f = theano.function([x], PyCUDADoubleOp()(x))
xv=numpy.ones((4,5), dtype="float32")
assert numpy.allclose(f(xv), xv*2)
print numpy.asarray(f(xv))
import theano
a = theano.tensor.vector("a") # declare variable
b = a + a**10 # build symbolic expression
f = theano.function([a], b) # compile function
print f([0,1,2])
# prints `array([0,2,1026])`
theano.printing.pydotprint_variables(b, outfile="pics/f_unoptimized.png", var_with_name_simple=True)
theano.printing.pydotprint(f, outfile="pics/f_optimized.png", var_with_name_simple=True)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论