Commit 4613cab0, authored by David Warde-Farley

Merge.

HPCS 2011, Montr\'eal
\item Introduction
\begin{itemize}
\item Why Scripting for GPUs?
\item Theano vs. PyCUDA
\item Python in 1 slide
\item NumPy in 1 slide
\end{itemize}
\item Theano
\begin{itemize}
\item Profiling
\item Printing
\item Debugging
\item Scan (for-loop generalization)
\item GPU
\item Exercises/break
\end{itemize}
\item PyCUDA
\begin{itemize}
\item Intro
\item Example
\item PyCUDA + Theano
\item Exercises
\end{itemize}
\item GpuNdArray
\item Conclusion
\frame{
\frametitle{Won't cover}
\begin{itemize}
\item How to write (low-level) GPU code
\item How to optimize GPU code
\end{itemize}
}
\frametitle{Why GPU}
\begin{itemize}
\item Faster, cheaper, more efficient power usage
\item How much faster? I have seen numbers from 100x slower to 1000x faster.
\begin{itemize}
\item It depends on the algorithms
\item How the benchmark is done
\end{itemize}
\item Theory:
\begin{itemize}
\item Intel Core i7 980 XE (107 Gf/s float64) 6 cores
\item NVIDIA C2050 (515 Gf/s float64, 1 Tf/s float32) 480 cores
\item NVIDIA GTX580 (1.5 Tf/s float32) 512 cores
\end{itemize}
\end{itemize}
\item With Theano, up to 100x can be seen as we don't generate multi-core code on CPU (except for calls to GEMM)
% COMMENT: whether GEMM is parallel depends on your BLAS, right?
\item If you see 1000x, it means the benchmark is not fair
\end{itemize}
\end{itemize}
\item CPU: largely restricted to control
\begin{itemize}
\item Optimized for sequential code and \textit{low latency} (rather than high throughput)
\item tasks (1000/sec)
\item Scripting fast enough
\item Theano = Mathematical expression compiler
\begin{itemize}
\item Mathematical expression compiler
\item Generates custom C and CUDA code
\item Uses Python code when performance is not critical
\end{itemize}
\item CUDA
\begin{itemize}
Do you have experience with:
\begin{itemize}
\item Python
\item NumPy / SciPy / Matlab
\item Maple / Mathematica / SymPy
\item GPU programming / CUDA / OpenCL
\item Cython / Weave / Numexpr
}
\frame{
\frametitle{NumPy in 1 Slide}
\begin{itemize}
\item Base scientific computing package on the CPU
\item A powerful N-dimensional array object
\begin{itemize}
\item ndarray.\{ndim, shape, size, dtype, itemsize, stride\}
\end{itemize}
\item Sophisticated ``broadcasting'' functions
\begin{itemize}
\item \texttt{numpy.random.rand(4,5) * numpy.random.rand(1,5)} $\Rightarrow$ mat(4,5)
\item \texttt{numpy.random.rand(4,5) * numpy.random.rand(4,1)} $\Rightarrow$ mat(4,5)
\item \texttt{numpy.random.rand(4,5) * numpy.random.rand(5)} $\Rightarrow$ mat(4,5)
\end{itemize}
\item Tools for integrating C/C++ and Fortran code
\item Linear algebra, Fourier transform and random number capable
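The broadcasting rules above can be checked directly; a minimal NumPy sketch:

```python
import numpy

a = numpy.random.rand(4, 5)
# a (1,5) row, a (4,1) column, or a bare (5,) vector all broadcast against (4,5)
assert (a * numpy.random.rand(1, 5)).shape == (4, 5)
assert (a * numpy.random.rand(4, 1)).shape == (4, 5)
assert (a * numpy.random.rand(5)).shape == (4, 5)
```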
\end{itemize}
\item Speed and stability optimizations
\begin{itemize}
\item Gives the right answer for $\log(1+x)$ even if $x$ is really tiny.
\end{itemize}
\item Extensive unit-testing and self-verification
\begin{itemize}
\item Detects and diagnoses many types of errors
\end{itemize}
\item Expressions mimic NumPy's syntax \& semantics
\item Works on Linux, Mac and Windows
\end{itemize}
}
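The $\log(1+x)$ stabilization mentioned above is easy to see in plain NumPy. This is a sketch of the numerical issue only, not Theano's actual rewrite machinery:

```python
import numpy

x = 1e-20
# naively, 1 + x rounds to exactly 1.0 in float64, so the log is exactly 0
assert numpy.log(1 + x) == 0.0
# the numerically stable formulation keeps the answer right even for tiny x
assert numpy.log1p(x) == 1e-20
```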
\begin{itemize}
\item float32 only for now (working on other data types)
\item Doesn't work on Windows for now
\item On the GPU, data-intensive calculations are typically between 6.5x and 44x faster. We've seen speedups up to 140x
\end{itemize}
\item On CPU, common machine learning algorithms are 1.6x to 7.5x faster than competitive alternatives
\begin{itemize}
\item including specialized implementations in C/C++, NumPy, SciPy, and Matlab
\end{itemize}
\item Some sparse operations (CPU only)
\item The project was started by James Bergstra and Olivier Breuleux
\item For the past 1-2 years, I have replaced Olivier as lead contributor
\end{itemize}
}
\begin{itemize}
\item Rearranges high-level expressions
\item Produces customized low-level code
\item Can use a variety of backend technologies (GPU, ...)
\end{itemize}
\vfill
\item A high-level language lets you concentrate on the algorithm
\item Automatic optimization
\begin{itemize}
\item No need to manually optimize for each algorithm you want to test
\end{itemize}
\item Automatic efficient symbolic differentiation
\begin{itemize}
\item No need to manually differentiate your functions (tedious \& error-prone for complicated expressions!)
\end{itemize}
\end{itemize}
}
\item Active mailing list with participants from outside our lab
\item Good user documentation
\item Many contributors (some from outside our lab)
\item Some (lots?) of users beyond our lab
\item Deep Learning Tutorials
\item Unofficial RPMs for Mandriva
\item Downloads (June 8 2011, since last January):
\begin{itemize}
\item PyPI: 780
\item MLOSS: 483
\item Assembla (``bleeding edge'' repository): unknown
\end{itemize}
\end{itemize}
}
\begin{Verbatim}[commandchars=\\\{\}]
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b)) {\color{gray}# Probability under model that target = 1}
prediction = p_1 > 0.5 {\color{gray}# The thresholded prediction: 0 or 1}
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1) {\color{gray}# Cross-entropy loss function}
cost = xent.mean() + 0.01*(w**2).sum() {\color{gray}# The (penalized) cost to optimize}
\codeHighlight{gw,gb = T.grad(cost, [w,b])}
\end{Verbatim}
\begin{itemize}
\item T.grad works symbolically: takes and returns a Theano variable
\item T.grad can be compared to a macro: it can be applied multiple times
\item T.grad takes scalar costs only
\item A simple recipe allows efficient computation of vector * Jacobian and vector * Hessian products
\item We are working on the missing optimizations to efficiently compute the full Jacobian and Hessian
\end{itemize}
\end{frame}
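As a sanity check on what \texttt{T.grad} produces for this cost, here is a hedged NumPy sketch (not Theano itself): the hand-derived gradient of the same penalized cross-entropy is compared against finite differences. Variable names mirror the slide; the data is random and illustrative.

```python
import numpy

def cost(w, x, y):
    # penalized cross-entropy of a logistic model, mirroring the slide's expression
    p_1 = 1.0 / (1.0 + numpy.exp(-x.dot(w)))
    xent = -y * numpy.log(p_1) - (1 - y) * numpy.log(1 - p_1)
    return xent.mean() + 0.01 * (w ** 2).sum()

def grad_w(w, x, y):
    # the analytic gradient w.r.t. w -- what T.grad(cost, w) derives symbolically
    p_1 = 1.0 / (1.0 + numpy.exp(-x.dot(w)))
    return x.T.dot(p_1 - y) / len(y) + 0.02 * w

rng = numpy.random.RandomState(0)
x = rng.rand(20, 3)
y = rng.randint(0, 2, 20)
w = rng.randn(3)

# central finite differences along each coordinate
eps = 1e-6
num = numpy.array([(cost(w + eps * e, x, y) - cost(w - eps * e, x, y)) / (2 * eps)
                   for e in numpy.eye(3)])
assert numpy.allclose(num, grad_w(w, x, y), atol=1e-5)
```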
\item \texttt{log(1 + exp(x))}
\item \texttt{1 / (1 + T.exp(var))} (sigmoid)
\item \texttt{log(1 - sigmoid(var))} (softplus, stabilization)
\item GEMV (matrix-vector multiply from BLAS)
\item Loop fusion
\end{itemize}
\end{frame}
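A NumPy sketch of why the \texttt{log(1 - sigmoid(var))} rewrite matters numerically (illustrative only; Theano performs the equivalent graph rewrite automatically):

```python
import numpy

def sigmoid(v):
    return 1.0 / (1.0 + numpy.exp(-v))

def softplus(v):
    # overflow-safe log(1 + exp(v))
    return numpy.log1p(numpy.exp(-abs(v))) + max(v, 0.0)

x = 40.0
# naive form: sigmoid(40) rounds to exactly 1.0 in float64, so log(1 - ...) is -inf
with numpy.errstate(divide='ignore'):
    assert numpy.log(1.0 - sigmoid(x)) == -numpy.inf
# the identity log(1 - sigmoid(x)) == -softplus(x) keeps the result finite
assert numpy.isclose(-softplus(x), -40.0)
```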
\end{frame}
\frame{
\frametitle{\# Dimensions, dtype and broadcastability}
\begin{itemize}
\item T.scalar, T.vector, T.matrix, T.row, T.col
\item T.row(floatX), T.[fdczbwil]row (float32, float64, complex64, complex128, int8, int16, int32, int64)
\item Misc Elemwise operations
\end{itemize}
Competitors: NumPy + SciPy, MATLAB, EBLearn, Torch5, numexpr
% COMMENT: Might want to say that EBLearn and Torch5 are specialized libraries written by
% practitioners specifically for these tasks, rest are our own implementations
% Also brief explanation of numexpr: "similar to Theano, 'virtual machine' for array-based expressions'
% but less features implemented
}
\frame{
\frame{
\frametitle{Benchmark Convolutional Network}
Convolutional Network: 256x256 images convolved with 6 7x7 filters, downsampled to 6x50x50, tanh, convolution with 16 6x7x7 filters, elementwise tanh, matrix multiply, elementwise, then in reverse % COMMENT: what does last elementwise mean?
\begin{center}
\includegraphics[width=3.in]{pics/conv.pdf}
\end{center}
}
\frame{
\frametitle{Elemwise Benchmark}
\begin{itemize}
\item All on CPU
\item Solid blue: Theano
\item Dashed red: numexpr (without MKL)
\end{itemize}
\begin{center}
\includegraphics[width=3.in]{pics/multiple_graph.pdf}
}
\section{Advanced Theano}
\subsection{Miscellaneous}
\frame{
\frametitle{Theano Flags}
Theano can be configured with flags. They can be defined in two ways:
\begin{itemize}
\item With an environment variable: \texttt{THEANO\_FLAGS="mode=ProfileMode,ProfileMode.profile\_memory=True"}
\item With a configuration file that defaults to \textasciitilde/.theanorc
\end{itemize}
}
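Both mechanisms can express the same profiling configuration. A sketch, assuming \texttt{my\_script.py} is a placeholder for your own program and that dotted flag names map to INI sections in \textasciitilde/.theanorc:

```shell
# one-off, via the environment variable:
THEANO_FLAGS="mode=ProfileMode,ProfileMode.profile_memory=True" python my_script.py

# or persistently, in ~/.theanorc:
#   [global]
#   mode = ProfileMode
#
#   [ProfileMode]
#   profile_memory = True
```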
\frame{
% COMMENT: Might want to skip this or put it nearer to the end
\frametitle{Theano Graph}
\begin{itemize}
\item Theano works with symbolic graphs
\item Those graphs are bipartite graphs (graphs with 2 types of nodes)
\item Those 2 node types are Apply and Variable nodes
\end{itemize}
\begin{itemize}
\item Inputs and Outputs are lists of Theano variables
% COMMENT: this is kind of obvious so I commented it out
%\item Can navigate through the graph from any point to any point
\end{itemize}
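A toy Python sketch of this bipartite structure (the class names and fields here are illustrative simplifications, not Theano's real classes):

```python
class Variable:
    """A value node; `owner` is the Apply node that produced it (None for inputs)."""
    def __init__(self, name, owner=None):
        self.name, self.owner = name, owner

class Apply:
    """An operation node; connects input Variables to output Variables."""
    def __init__(self, op, inputs):
        self.op, self.inputs = op, inputs
        self.outputs = [Variable(op + '_out', owner=self)]

x, y = Variable('x'), Variable('y')
z = Apply('add', [x, y]).outputs[0]
# Variables only point to Applies and vice versa -- the graph is bipartite
assert z.owner.op == 'add'
assert [v.name for v in z.owner.inputs] == ['x', 'y']
```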
\begin{center}
\includegraphics[width=3.5in]{pics/apply_node.pdf}
\subsection{Profiling}
\begin{frame}[fragile]
\frametitle{Profile Mode}
To replace the default mode with this mode, use the Theano flag \texttt{mode=ProfileMode}.
To enable memory profiling, use the flag \texttt{ProfileMode.profile\_memory=True}.
\begin{Verbatim}
Time since import 1.486s
\subsection{Printing}
\begin{frame}[fragile]
\frametitle{Text Printing of Your Theano Graph: Pretty Printing}
\texttt{theano.printing.pprint(variable)}
\vfill
\begin{Verbatim}
\begin{frame}[fragile]
\frametitle{Text Printing of Your Theano Graph: Debug Print}
\texttt{theano.printing.debugprint(\{fct, variable, list of variables\})}
\vfill
\small
\end{frame}
\begin{frame}[fragile]
\frametitle{Text Printing of Your Theano Graph: Debug Print}
\texttt{theano.printing.debugprint(\{fct, variable, list of variables\})}
\vfill
\small
\end{frame}
\begin{frame}[fragile]
\frametitle{Picture Printing of Graphs}
\begin{Verbatim}
>>> theano.printing.pydotprint_variables(prediction)
\end{Verbatim}
\includegraphics[width=2.0in]{pics/logreg_pydotprint_prediction.png}
% COMMENT: Requires graphviz, you should mention that
\end{frame}
\begin{frame}[fragile]
\frametitle{Picture Printing of Graphs}
\begin{Verbatim}
>>> theano.printing.pydotprint(predict)
\end{Verbatim}
\end{frame}
\begin{frame}[fragile]
\frametitle{Picture Printing of Graphs}
\begin{Verbatim}[commandchars=\\\{\}]
>>> theano.printing.pydotprint(train) {\color{gray}# This is a small train example!}
\end{Verbatim}
\frame{
\frametitle{How to Debug}
\begin{itemize}
\item Run with the flag \texttt{mode=DebugMode}
\begin{itemize}
\item 100-1000x slower
\item Test all optimization steps from the original graph to the final graph
\item Checks many things that an Op should/shouldn't do
\item Executes both the Python and C code versions
\end{itemize}
\item Run with the flag \texttt{mode=FAST\_COMPILE}
\begin{itemize}
\item Few optimizations
\item Run Python code (better error messages and can be debugged interactively in the Python debugger)
\end{itemize}
\item Run with the Theano flag \texttt{compute\_test\_value = \{``off'', ``ignore'', ``warn'', ``raise''\}}
\begin{itemize}
\item Run the code as we create the graph
\item Allows you to find the bug earlier (e.g. a shape mismatch)
\item Makes it easier to identify where the problem is in \textit{your} code
\item Uses the values of constants and shared variables directly
\item For pure symbolic variables, use \texttt{x.tag.test\_value = numpy.random.rand(5,10)}
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Known Limitations}
\begin{itemize}
\item Compilation phase distinct from execution phase
\item Compilation time can be significant
\begin{itemize}
\item Amortize it by using functions on big inputs or by reusing compiled functions
\end{itemize}
\item Execution overhead
\begin{itemize}
\item Needs a certain number of operations to be useful
\item We have started working on this in a branch
\end{itemize}
\item Compilation time superlinear in the size of the graph.
\begin{itemize}
\item A few hundred nodes is fine
\item Disabling a few optimizations can speed up compilation
}
\subsection{Loops}
\frame{
\frametitle{Scan}
\begin{itemize}
\item General form of {\bf recurrence}, which can be used for looping.
\item {\bf Reduction} and {\bf map} (loop over the leading dimensions) are special cases of Scan
\item You \emph{scan} a function along some input sequence, producing an output at each time-step
\item The function can see the {\bf previous K time-steps} of its output
\item The advantages of using \texttt{scan} over for loops
\begin{itemize}
\item The number of iterations can be part of the symbolic graph
\item Minimizes GPU transfers if GPU is involved % FB: I don't understand it?
% COMMENT: I think it means that the result of each iteration does not need to be copied
% to host but this is also true for shared variables
\item Compute gradients through sequential steps
\item Slightly faster than using a for loop in Python with a compiled Theano function
\item Can lower the overall memory usage by detecting the actual amount of memory needed
\end{itemize}
\end{itemize}
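As a concrete reference point for the Scan-based polynomial example used in the exercises, here is what it computes in plain Python. The function name and the power-series form are assumptions based on the surrounding context, not Theano code:

```python
def calculate_polynomial(coefficients, x):
    # the step-by-step reduction Scan performs: sum_i coefficients[i] * x**i
    return sum(c * x ** i for i, c in enumerate(coefficients))

assert calculate_polynomial([1, 0, 2], 3) == 19  # 1 + 0*3 + 2*3**2
```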
\frame{
\frametitle{GPU}
\begin{itemize}
\item Only 32-bit floats are supported (being worked on)
\item Only 1 GPU per process
\item Use the Theano flag \texttt{device=gpu} to tell Theano to use the GPU device
\begin{itemize}
\item Use \texttt{device=gpu0}, \texttt{device=gpu1}, \ldots\ to specify which GPU if you have more than one
\item Shared variables with float32 dtype are by default moved to the GPU memory space
\end{itemize}
\item Use the Theano flag \texttt{floatX=float32}
\begin{itemize}
\item Be sure to use \texttt{floatX} (\texttt{theano.config.floatX}) in your code
\item Cast inputs before putting them into a shared variable
\item Cast ``problem'': int32 with float32 $\to$ float64
\begin{itemize}
\item A new casting mechanism is being developed
\item Insert manual casts in your code or use [u]int\{8,16\}
\item Insert a manual cast around the mean operator (it divides by the length, which is an int64!)
\end{itemize}
\end{itemize}
\end{itemize}
}
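The cast ``problem'' above is ordinary type-promotion behavior, easy to reproduce in NumPy (a sketch; Theano follows similar rules, which is what silently produces float64 and kicks computation off the GPU):

```python
import numpy

a = numpy.ones(3, dtype='float32')
# int32 mixed with float32 promotes to float64 -- no longer GPU-friendly
assert (a + numpy.ones(3, dtype='int32')).dtype == numpy.float64
# an explicit cast keeps everything in float32
assert (a + numpy.ones(3, dtype='int32').astype('float32')).dtype == numpy.float32
# small integer types ([u]int8/16) promote with float32 to float32
assert (a + numpy.ones(3, dtype='int16')).dtype == numpy.float32
```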
\frame{
\frametitle{GPU for Exercises}
\begin{itemize}
\item Intel Core i7 980 XE (107 Gf/s float64) (1050\$)
\item NVIDIA C2050 (515 Gf/s float64, 1 Tf/s float32), compute capability 2.0 (2400\$) 6 cores/12 threads
}
\frame{
\frametitle{Theano Exercises}
\begin{itemize}
\item Run the simple example
\item Run the real example
\item Modify your version to run in float32 with \texttt{floatX}.
\item Run your version on the CPU and GPU
\item Do you see a speed up with the GPU? Where does it come from? (Try to profile it)
\item Scan: modify the polynomial example to have the reduction done by scan
\end{itemize}
}
\item Object cleanup tied to lifetime of objects (RAII, Resource Acquisition Is Initialization).
\begin{itemize}
\item Makes it much easier to write correct, leak- and crash-free code
\item PyCUDA knows about dependencies (e.g., it won't detach from a context before all memory allocated in it is also freed)
\end{itemize}
\item Convenience
\begin{itemize}
\item Abstractions to compile CUDA code from Python: \texttt{pycuda.driver.SourceModule}
\item A GPU memory buffer: \texttt{pycuda.gpuarray.GPUArray}
\end{itemize}
\item Completeness
\begin{itemize}
\begin{itemize}
\item PyCUDA's base layer is written in C++
\end{itemize}
\item Helpful documentation
\end{itemize}
}
\frame{
\frametitle{GpuArray}
No support for strided memory.
}
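To see what the lack of stride support costs in practice, a NumPy sketch: NumPy views are strided, so a GPUArray-style container without strides needs a contiguous copy first.

```python
import numpy

a = numpy.arange(12).reshape(3, 4)
v = a[:, ::2]                     # strided view: every other column, no copy
assert not v.flags['C_CONTIGUOUS']
assert numpy.shares_memory(v, a)  # shares a's memory

# without stride support, such a view must be materialized as a contiguous copy
c = numpy.ascontiguousarray(v)
assert c.flags['C_CONTIGUOUS']
assert (c == v).all()
```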
\subsection{PyCUDA + Theano}
\begin{frame}[fragile]
\frametitle{Theano Op Contract}
\begin{Verbatim}
def __str__(self):
def make_node(self, *inputs):
Python implementation:
def perform(self, node, inputs_storage, outputs_storage):
C implementation: [see the Theano web site]
other implementations (PyCUDA, ...):
def make_thunk(self, node, storage_map, _, _2):
optional:
def __init__(self, ...):
def grad(self, inputs, g):
def infer_shape(node, (i0_shapes, i1_shapes, ...))
\begin{frame}[fragile]
\frametitle{Theano + PyCUDA Op Example: make\_thunk}
\begin{Verbatim}
def make_thunk(self, node, storage_map, _, _2):
mod = SourceModule( THE_C_CODE )
\end{frame}
\begin{frame}[fragile]
\frametitle{Theano + PyCUDA Op Example: GPU Code}
\begin{Verbatim}
THE_C_CODE = """
__global__ void my_fct(float * i0, float * o0, int size) {
\begin{frame}[fragile]
\frametitle{Theano + PyCUDA Op Example: Test it!}
\begin{Verbatim}
x = theano.tensor.fmatrix()
f = theano.function([x], PyCUDADoubleOp()(x))
\end{frame}
\begin{frame}
\frametitle{Theano + PyCUDA Exercises}
\begin{itemize}
\item Elemwise add: $x + y$
\item Elemwise with 2 outputs: $x + y$ and $x - y$
\frame{
\frametitle{Why a common GPU ndarray?}
\begin{itemize}
\item Currently there are at least 4 different GPU array data structures in use by Python packages
\begin{itemize}
\item CudaNdarray (Theano), GPUArray (PyCUDA), CUDAMatrix (cudamat), GPUArray (PyOpenCL), ...
\item There are even more if we include other languages
\end{itemize}
\item All of them are a subset of the functionality of \texttt{numpy.ndarray} on the GPU
\item Lots of duplicated effort
\begin{itemize}
\item GPU code is harder/slower to write {\bf correctly} and {\bf fast} than CPU/Python code
\end{itemize}
\item Lack of a common array API makes it harder to port/reuse code
\item Also harder to find/distribute code
\item Divides development work
\end{itemize}
\frame{
\frametitle{Design Goals}
\begin{itemize}
\item Make it VERY similar to \texttt{numpy.ndarray}
\item Be compatible with both CUDA and OpenCL
\item Have the base object accessible from C to allow collaboration with more projects, across high-level languages
\begin{itemize}
\item We want people from C, C++, Ruby, R, ... to all use the same base GPU N-dimensional array
\end{itemize}
\end{itemize}
}
\frametitle{Final GpuNdArray Note}
\begin{itemize}
\item Under development
\item Will be the next GPU array container for Theano (this summer!)
\item Probably also for PyCUDA, PyOpenCL
\item Mailing list: http://lists.tiker.net/listinfo/gpundarray
\end{itemize}
\frame{
\frametitle{Conclusion}
\begin{itemize}
\item I presented a tool that tries to be the holy grail in computing: {\bf easy to code} and {\bf fast to execute}!
\item Generates fast, custom CPU code \textit{and} GPU code
\item You can easily wrap existing GPU code with Theano
\item It {\bf works} and is {\bf used in the real world} by academic researchers \textit{and} industry
\end{itemize}
}
% COMMENT: it is often customary to have a slide with thank yous to the audience and to funding agencies and stuff at the end, I don't know
% which ones provided funding... NSERC? CIFAR?
\end{document}