提交 2ac55778 authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged -- no conflict

...@@ -6,7 +6,7 @@ rng = numpy.random ...@@ -6,7 +6,7 @@ rng = numpy.random
N = 400 N = 400
feats = 784 feats = 784
D = (rng.randn(N, feats).astype(theano.config.floatX), rng.randint(size=N,low=0, high=2).astype(theano.config.floatX)) D = (rng.randn(N, feats).astype(theano.config.floatX), rng.randint(size=N,low=0, high=2).astype(theano.config.floatX))
training_steps = 10 training_steps = 10000
# Declare Theano symbolic variables # Declare Theano symbolic variables
x = T.matrix("x") x = T.matrix("x")
...@@ -15,8 +15,8 @@ w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w") ...@@ -15,8 +15,8 @@ w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w")
b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b") b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")
x.tag.test_value = D[0] x.tag.test_value = D[0]
y.tag.test_value = D[1] y.tag.test_value = D[1]
print "Initial model:" #print "Initial model:"
print w.get_value(), b.get_value() #print w.get_value(), b.get_value()
# Construct Theano expression graph # Construct Theano expression graph
...@@ -30,15 +30,25 @@ gw,gb = T.grad(cost, [w,b]) ...@@ -30,15 +30,25 @@ gw,gb = T.grad(cost, [w,b])
train = theano.function( train = theano.function(
inputs=[x,y], inputs=[x,y],
outputs=[prediction, xent], outputs=[prediction, xent],
updates={w:w-0.1*gw, b:b-0.1*gb}, updates={w:w-0.01*gw, b:b-0.01*gb},
name = "train") name = "train")
predict = theano.function(inputs=[x], outputs=prediction, predict = theano.function(inputs=[x], outputs=prediction,
name = "predict") name = "predict")
if any( [x.op.__class__.__name__=='Gemv' for x in train.maker.env.toposort()]):
print 'Used the cpu'
elif any( [x.op.__class__.__name__=='GpuGemm' for x in train.maker.env.toposort()]):
print 'Used the gpu'
else:
print 'ERROR, not able to tell if theano used the cpu or the gpu'
print train.maker.env.toposort()
for i in range(training_steps): for i in range(training_steps):
pred, err = train(D[0], D[1]) pred, err = train(D[0], D[1])
print "Final model:" #print "Final model:"
print w.get_value(), b.get_value() #print w.get_value(), b.get_value()
print "target values for D" print "target values for D"
print D[1] print D[1]
......
...@@ -171,10 +171,10 @@ HPCS 2011, Montr\'eal ...@@ -171,10 +171,10 @@ HPCS 2011, Montr\'eal
\item Real example \item Real example
% More info on T.grad % More info on T.grad
% Where are the optimization in the example? % Where are the optimization in the example?
% Exercises 2 % Exercises 2: logreg\_example.py
\item Theano Flags \item Theano Flags
\item GPU \item GPU
% Exercises 3 % Exercises 3: logreg\_example.py on the gpu
\item Symbolic Variables \item Symbolic Variables
\item Differentiation Details \item Differentiation Details
\item Benchmarks % MLP, Convolucion, Elemwise \item Benchmarks % MLP, Convolucion, Elemwise
...@@ -193,10 +193,11 @@ HPCS 2011, Montr\'eal ...@@ -193,10 +193,11 @@ HPCS 2011, Montr\'eal
\item Compilation Pipeline \item Compilation Pipeline
\item Inplace Optimization \item Inplace Optimization
\item Profiling \item Profiling
%exercises 4 %exercises 4: ProfileMode on logreg\_example, CPU vs GPU
\item Drawing/Printing Theano Graph \item Drawing/Printing Theano Graph
\item Debugging \item Debugging
\item Scan (For-Loop generalization) \item Scan (For-Loop generalization)
%exercises 5: about scan
\item Known Limitations \item Known Limitations
\end{itemize} %& \includegraphics[width=1.in]{pics/theano_logo.png} \end{itemize} %& \includegraphics[width=1.in]{pics/theano_logo.png}
\begin{tabular}{lcr} \begin{tabular}{lcr}
...@@ -213,7 +214,7 @@ HPCS 2011, Montr\'eal ...@@ -213,7 +214,7 @@ HPCS 2011, Montr\'eal
\begin{itemize} \begin{itemize}
\item Introduction \item Introduction
\item Example \item Example
% PyCUDA Exercices % Exercices 6: pycuda_simple.py
\end{itemize} \end{itemize}
\item CUDA Overview \item CUDA Overview
\item Extending Theano \item Extending Theano
...@@ -221,8 +222,9 @@ HPCS 2011, Montr\'eal ...@@ -221,8 +222,9 @@ HPCS 2011, Montr\'eal
\item Theano Graph \item Theano Graph
\item Op Contract \item Op Contract
\item Op Example \item Op Example
% Exercises 7: double.py
\item Theano + PyCUDA \item Theano + PyCUDA
% Theano+PyCUDA Exercises % Exercises 8: pycuda_double_op.py
\end{itemize} \end{itemize}
\item GpuNdArray \item GpuNdArray
\item Conclusion \item Conclusion
...@@ -522,7 +524,7 @@ rng = numpy.random ...@@ -522,7 +524,7 @@ rng = numpy.random
N = 400 N = 400
feats = 784 feats = 784
D = (rng.randn(N, feats), rng.randint(size=N,low=0, high=2)) D = (rng.randn(N, feats), rng.randint(size=N,low=0, high=2))
training_steps = 10 training_steps = 10000
\end{Verbatim} \end{Verbatim}
\end{frame} \end{frame}
...@@ -657,7 +659,7 @@ Theano can be configured with flags. They can be defined in two ways ...@@ -657,7 +659,7 @@ Theano can be configured with flags. They can be defined in two ways
python logreg_example.py python logreg_example.py
\end{Verbatim} \end{Verbatim}
\vfill \vfill
Modify and execute the example to run on CPU with floatX=float32 Modify and execute the example in the file logreg\_example.py to run on CPU with floatX=float32
* You will need to use: theano.config.floatX and ndarray.astype("str") * You will need to use: theano.config.floatX and ndarray.astype("str")
\end{frame} \end{frame}
...@@ -715,7 +717,6 @@ Computers in the class ...@@ -715,7 +717,6 @@ Computers in the class
\begin{itemize} \begin{itemize}
\item Modify and execute the code to run with floatX=float32 on GPU \item Modify and execute the code to run with floatX=float32 on GPU
\item Run the code on the GPU
\item Time with: \texttt{time python file.py} \item Time with: \texttt{time python file.py}
\end{itemize} \end{itemize}
\end{frame} \end{frame}
...@@ -752,7 +753,7 @@ Computers in the class ...@@ -752,7 +753,7 @@ Computers in the class
\end{itemize} \end{itemize}
\vfill \vfill
\begin{itemize} \begin{itemize}
\item Broadcastability must be specified when creating the variable. \item Broadcastability must be specified when creating the variable
\item The only shorcut with broadcastable dimensions are: {\bf T.row} and {\bf T.col} \item The only shorcut with broadcastable dimensions are: {\bf T.row} and {\bf T.col}
\item For all others: T.tensor(dtype, broadcastable={\bf ([False or True])*nd}) \item For all others: T.tensor(dtype, broadcastable={\bf ([False or True])*nd})
\end{itemize} \end{itemize}
...@@ -849,16 +850,16 @@ To replace the default mode with this mode, use the Theano flags \texttt{mode=Pr ...@@ -849,16 +850,16 @@ To replace the default mode with this mode, use the Theano flags \texttt{mode=Pr
To enable the memory profiling use the flags \texttt{ProfileMode.profile\_memory=True} To enable the memory profiling use the flags \texttt{ProfileMode.profile\_memory=True}
\begin{Verbatim} \begin{Verbatim}
Time since import 2.697s Time since import 33.456s
Theano compile time: 1.046s (38.8% since import) Theano compile time: 1.023s (3.1% since import)
Optimization time: 0.804s Optimization time: 0.789s
Linker time: 0.230s Linker time: 0.221s
Theano fct call 0.028s (1.0% since import) Theano fct call 30.878s (92.3% since import)
Theano Op time 0.026s 1.0%(since import) 93.7%(of fct call) Theano Op time 29.411s 87.9%(since import) 95.3%(of fct call)
Theano function overhead in ProfileMode 0.002s 0.1%(since import) Theano function overhead in ProfileMode 1.466s 4.4%(since import)
6.3%(of fct call) 4.7%(of fct call)
11 Theano fct call, 0.003s per call 10001 Theano fct call, 0.003s per call
Rest of the time since import 1.623s 60.2% Rest of the time since import 1.555s 4.6%
\end{Verbatim} \end{Verbatim}
\end{frame} \end{frame}
...@@ -869,8 +870,8 @@ Theano outputs: ...@@ -869,8 +870,8 @@ Theano outputs:
\begin{Verbatim} \begin{Verbatim}
Theano fct summary: Theano fct summary:
<% total fct time> <total time> <time per call> <nb call> <fct name> <% total fct time> <total time> <time per call> <nb call> <fct name>
97.2% 0.027s 2.70e-03s 10 train 100.0% 30.877s 3.09e-03s 10000 train
2.8% 0.001s 7.84e-04s 1 predict 0.0% 0.000s 4.06e-04s 1 predict
\end{Verbatim} \end{Verbatim}
\end{frame} \end{frame}
...@@ -883,13 +884,13 @@ Single Op-wise summary: ...@@ -883,13 +884,13 @@ Single Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %> <% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call> <nb_call> <self seconds> <cumulative seconds> <time per call> <nb_call>
<nb_op> <nb_apply> <Op name> <nb_op> <nb_apply> <Op name>
82.0% 82.0% 0.021s 0.021s 2.13e-03s 10 1 1 <Gemv> 87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 1 1 <Gemv>
14.1% 96.1% 0.004s 0.025s 3.33e-04s 11 1 2 <Dot> 9.7% 97.0% 2.843s 28.515s 2.84e-04s 10001 1 2 <Dot>
2.9% 98.9% 0.001s 0.026s 8.24e-06s * 91 10 10 <Elemwise> 2.4% 99.3% 0.691s 29.206s 7.68e-06s * 90001 10 10 <Elemwise>
0.6% 99.6% 0.000s 0.026s 1.69e-05s 10 1 1 <Alloc> 0.4% 99.7% 0.127s 29.334s 1.27e-05s 10000 1 1 <Alloc>
0.3% 99.9% 0.000s 0.026s 2.43e-06s * 31 2 4 <DimShuffle> 0.2% 99.9% 0.053s 29.386s 1.75e-06s * 30001 2 4 <DimShuffle>
0.1% 100.0% 0.000s 0.026s 1.91e-06s * 10 1 1 <Sum> 0.0% 100.0% 0.014s 29.400s 1.40e-06s * 10000 1 1 <Sum>
0.0% 100.0% 0.000s 0.026s 1.19e-06s * 10 1 1 <Shape_i> 0.0% 100.0% 0.011s 29.411s 1.10e-06s * 10000 1 1 <Shape_i>
(*) Op is running a c implementation (*) Op is running a c implementation
\end{Verbatim} \end{Verbatim}
\end{frame} \end{frame}
...@@ -903,15 +904,15 @@ Op-wise summary: ...@@ -903,15 +904,15 @@ Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %> <% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call> <self seconds> <cumulative seconds> <time per call>
<nb_call> <nb apply> <Op name> <nb_call> <nb apply> <Op name>
82.0% 82.0% 0.021s 0.021s 2.13e-03s 10 1 Gemv{inplace} 87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 1 Gemv{inplace}
14.1% 96.1% 0.004s 0.025s 3.33e-04s 11 2 dot 9.7% 97.0% 2.843s 28.515s 2.84e-04s 10001 2 dot
1.4% 97.5% 0.000s 0.025s 3.63e-05s * 10 1 Elemwise{Composite{ 1.3% 98.2% 0.378s 28.893s 3.78e-05s * 10000 1 Elemwise{Composite{
scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}} scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}
0.6% 98.1% 0.000s 0.026s 1.69e-05s 10 1 Alloc 0.4% 98.7% 0.127s 29.021s 1.27e-05s 10000 1 Alloc
0.4% 98.5% 0.000s 0.026s 1.02e-05s * 10 1 Elemwise{Composite{ 0.3% 99.0% 0.092s 29.112s 9.16e-06s * 10000 1 Elemwise{Composite{
exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)] exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)]
0.2% 99.0% 0.000s 0.026s 2.40e-06s * 21 3 InplaceDimShuffle{x} 0.1% 99.3% 0.033s 29.265s 1.66e-06s * 20001 3 InplaceDimShuffle{x}
... (remaining 11 Apply account for 1.3%(0.00s) of the runtime) ... (remaining 11 Apply account for 0.7%(0.00s) of the runtime)
(*) Op is running a c implementation (*) Op is running a c implementation
\end{Verbatim} \end{Verbatim}
\end{frame} \end{frame}
...@@ -925,15 +926,15 @@ Apply-wise summary: ...@@ -925,15 +926,15 @@ Apply-wise summary:
<% of local_time spent at this position> <cumulative %%> <% of local_time spent at this position> <cumulative %%>
<apply time> <cumulative seconds> <time per call> <apply time> <cumulative seconds> <time per call>
<nb_call> <Apply position> <Apply Op name> <nb_call> <Apply position> <Apply Op name>
82.0% 82.0% 0.021s 0.021s 2.13e-03s 10 15 Gemv{inplace}( 87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 15 Gemv{inplace}(
w, TensorConstant{-0.1}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, TensorConstant{0.998}) w, TensorConstant{-0.01}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, TensorConstant{0.9998})
11.5% 93.4% 0.003s 0.024s 2.99e-04s 10 1 dot(x, w) 9.7% 97.0% 2.843s 28.515s 2.84e-04s 10000 1 dot(x, w)
2.6% 96.1% 0.001s 0.025s 6.81e-04s 1 1 dot(x, w) 1.3% 98.2% 0.378s 28.893s 3.78e-05s 10000 9 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}(y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0)
1.4% 97.5% 0.000s 0.025s 3.63e-05s 10 9 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}(y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0) 0.4% 98.7% 0.127s 29.020s 1.27e-05s 10000 10 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
0.6% 98.1% 0.000s 0.026s 1.69e-05s 10 10 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0) 0.3% 99.0% 0.092s 29.112s 9.16e-06s 10000 13 Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)](Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)].0, Alloc.0, y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, InplaceDimShuffle{x}.0)
0.4% 98.5% 0.000s 0.026s 1.02e-05s 10 13 Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)](Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)].0, Alloc.0, y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, InplaceDimShuffle{x}.0) 0.3% 99.3% 0.080s 29.192s 7.99e-06s 10000 11 Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)](Elemwise{neg,no_inplace}.0)
... (remaining 14 Apply instances account for ... (remaining 14 Apply instances account for
1.5%(0.00s) of the runtime) 0.7%(0.00s) of the runtime)
\end{Verbatim} \end{Verbatim}
\end{frame} \end{frame}
...@@ -979,6 +980,7 @@ Test them first, as they are not guaranteed to always provide a speedup. ...@@ -979,6 +980,7 @@ Test them first, as they are not guaranteed to always provide a speedup.
\begin{itemize} \begin{itemize}
\item In the last exercises, do you see a speed up with the GPU? \item In the last exercises, do you see a speed up with the GPU?
\item Where does it come from? (Use ProfileMode) \item Where does it come from? (Use ProfileMode)
\item Is there something we can do to speed up the GPU version?
\end{itemize} \end{itemize}
\end{frame} \end{frame}
...@@ -1167,7 +1169,8 @@ print calculate_polynomial(test_coeff, 3) ...@@ -1167,7 +1169,8 @@ print calculate_polynomial(test_coeff, 3)
\frame{ \frame{
\frametitle{Exercises 5} \frametitle{Exercises 5}
\begin{itemize} \begin{itemize}
\item Scan: modify the polynomial example to have the reduction done by scan \item Run the example in the file scan\_pow.py and scan\_poly.py
\item Modify and execute the polynomial example to have the reduction done by scan
\end{itemize} \end{itemize}
} }
...@@ -1335,9 +1338,9 @@ multiply_them( ...@@ -1335,9 +1338,9 @@ multiply_them(
} }
\begin{frame} \begin{frame}
\frametitle{PyCUDA Exercises} \frametitle{Exercises 6}
\begin{itemize} \begin{itemize}
\item Run the example \item Run the example in the file pycuda\_simple.py
\item Modify and execute it to work for a matrix of 20 $\times$ 10 \item Modify and execute it to work for a matrix of 20 $\times$ 10
\end{itemize} \end{itemize}
\end{frame} \end{frame}
...@@ -1429,6 +1432,18 @@ print out ...@@ -1429,6 +1432,18 @@ print out
\end{Verbatim} \end{Verbatim}
\end{frame} \end{frame}
\begin{frame}
\frametitle{Exercises 7}
\begin{itemize}
\item Run the code in the file double\_op.py.
\item Modify and execute to compute: $x * y$
\item Modify and execute the example to return 2 outputs: $x + y$ and $x - y$
\begin{itemize}
\item Our current elemwise fusion generates computations with only 1 output
\end{itemize}
\end{itemize}
\end{frame}
\subsection{Theano+PyCUDA} \subsection{Theano+PyCUDA}
\begin{frame}[fragile] \begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example} \frametitle{Theano+PyCUDA Op Example}
...@@ -1501,8 +1516,9 @@ print numpy.asarray(f(xv)) ...@@ -1501,8 +1516,9 @@ print numpy.asarray(f(xv))
\end{frame} \end{frame}
\begin{frame} \begin{frame}
\frametitle{Theano + PyCUDA Exercises} \frametitle{Exercises 8}
\begin{itemize} \begin{itemize}
\item Run the example in the file pycuda\_double\_op.py
\item Modify and execute the example to multiple two matrix: $x * y$ \item Modify and execute the example to multiple two matrix: $x * y$
\item Modify and execute the example to return 2 outputs: $x + y$ and $x - y$ \item Modify and execute the example to return 2 outputs: $x + y$ and $x - y$
\begin{itemize} \begin{itemize}
......
# pycuda_simple.py -- minimal PyCUDA example (Python 2 syntax).
# JIT-compiles a CUDA kernel, multiplies two 400-element float32 vectors
# element-wise on the GPU, and verifies the result against NumPy.
import pycuda.autoinit
import pycuda.driver as drv
import numpy
from pycuda.compiler import SourceModule
# CUDA C kernel: one thread per element; threadIdx.x selects the slot.
# (String passed verbatim to nvcc -- do not edit formatting.)
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
# Look up the compiled kernel as a callable Python object.
multiply_them = mod.get_function("multiply_them")
# Two random input vectors; float32 to match the kernel's float* args.
a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
# Output buffer, same shape/dtype as the inputs.
dest = numpy.zeros_like(a)
# Launch: drv.In copies host->device, drv.Out copies device->host after
# the kernel runs. One block of 400 threads covers all 400 elements.
multiply_them(
drv.Out(dest), drv.In(a), drv.In(b),
block=(400,1,1), grid=(1,1))
# Check the GPU result against the CPU reference computation.
assert numpy.allclose(dest, a*b)
print dest
# scan_poly.py -- evaluate a polynomial with theano.scan (Python 2 syntax).
# Builds sum_i coefficients[i] * x**i symbolically, compiles it, and
# evaluates 1 + 0*x + 2*x**2 at x = 3 (expected: 19.0).
import numpy
import theano
import theano.tensor as T
# Symbolic inputs: the coefficient vector and the evaluation point.
coefficients = theano.tensor.vector("coefficients")
x = T.scalar("x"); max_coefficients_supported = 10000
# Generate the components of the polynomial
# scan iterates over (coeff, power) pairs; iteration stops at the end of
# the shortest sequence, so only len(coefficients) terms are computed
# even though full_range has max_coefficients_supported entries.
full_range=theano.tensor.arange(max_coefficients_supported)
# fn computes one term coeff * x**power; x is passed as a non-sequence
# (the same value every iteration).
components, updates = theano.scan(fn=lambda coeff, power, free_var:
coeff * (free_var ** power),
outputs_info=None,
sequences=[coefficients, full_range],
non_sequences=x)
# Sum the per-term results into the polynomial's value.
polynomial = components.sum()
# Compile the symbolic graph into a callable function.
calculate_polynomial = theano.function(inputs=[coefficients, x],
outputs=polynomial)
test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
print calculate_polynomial(test_coeff, 3)
# 19.0
# scan_pow.py -- compute A**k element-wise with theano.scan (Python 2 syntax).
# Repeatedly multiplies an accumulator by A, k times, and keeps only the
# final iteration's value.
import theano
import theano.tensor as T
# Symbolic inputs: the exponent k and the base vector A.
k = T.iscalar("k"); A = T.vector("A")
# One scan step: multiply the running product by A.
def inner_fct(prior_result, A): return prior_result * A
# Symbolic description of the result
# outputs_info seeds the accumulator with ones (A**0); scan feeds each
# step's output back in as prior_result for n_steps = k iterations.
result, updates = theano.scan(fn=inner_fct,
outputs_info=T.ones_like(A),
non_sequences=A, n_steps=k)
# Scan has provided us with A**1 through A**k. Keep only the last
# value. Scan notices this and does not waste memory saving them.
final_result = result[-1]
# Compile; updates from scan must be passed through to theano.function.
power = theano.function(inputs=[A,k], outputs=final_result,
updates=updates)
print power(range(10),2)
...@@ -21,7 +21,9 @@ since 2007. But it is also approachable enough to be used in the classroom ...@@ -21,7 +21,9 @@ since 2007. But it is also approachable enough to be used in the classroom
:scale: 75% :scale: 75%
:align: left :align: left
**NEW!** You can watch a quick (20 minute) introduction to Theano given as a talk at `SciPy 2010 <http://conference.scipy.org/scipy2010/>`_ via streaming (or downloaded) video: **NEW!** `HPCS 2011 Tutorial <http://www.iro.umontreal.ca/~lisa/pointeurs/tutorial_hpcs2011_fixed.pdf>`_. I included a few fix discovered while doing the Tutorial.
You can watch a quick (20 minute) introduction to Theano given as a talk at `SciPy 2010 <http://conference.scipy.org/scipy2010/>`_ via streaming (or downloaded) video:
`Transparent GPU Computing With Theano`_. `Transparent GPU Computing With Theano`_.
James Bergstra, SciPy 2010, June 30, 2010. James Bergstra, SciPy 2010, June 30, 2010.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论