提交 2ac55778 authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged -- no conflict

......@@ -6,7 +6,7 @@ rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats).astype(theano.config.floatX), rng.randint(size=N,low=0, high=2).astype(theano.config.floatX))
training_steps = 10
training_steps = 10000
# Declare Theano symbolic variables
x = T.matrix("x")
......@@ -15,8 +15,8 @@ w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w")
b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
print "Initial model:"
print w.get_value(), b.get_value()
#print "Initial model:"
#print w.get_value(), b.get_value()
# Construct Theano expression graph
......@@ -30,15 +30,25 @@ gw,gb = T.grad(cost, [w,b])
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
updates={w:w-0.1*gw, b:b-0.1*gb},
updates={w:w-0.01*gw, b:b-0.01*gb},
name = "train")
predict = theano.function(inputs=[x], outputs=prediction,
name = "predict")
if any( [x.op.__class__.__name__=='Gemv' for x in train.maker.env.toposort()]):
print 'Used the cpu'
elif any( [x.op.__class__.__name__=='GpuGemm' for x in train.maker.env.toposort()]):
print 'Used the gpu'
else:
print 'ERROR, not able to tell if theano used the cpu or the gpu'
print train.maker.env.toposort()
for i in range(training_steps):
pred, err = train(D[0], D[1])
print "Final model:"
print w.get_value(), b.get_value()
#print "Final model:"
#print w.get_value(), b.get_value()
print "target values for D"
print D[1]
......
......@@ -171,10 +171,10 @@ HPCS 2011, Montr\'eal
\item Real example
% More info on T.grad
% Where are the optimization in the example?
% Exercises 2
% Exercises 2: logreg\_example.py
\item Theano Flags
\item GPU
% Exercises 3
% Exercises 3: logreg\_example.py on the gpu
\item Symbolic Variables
\item Differentiation Details
\item Benchmarks % MLP, Convolution, Elemwise
......@@ -193,10 +193,11 @@ HPCS 2011, Montr\'eal
\item Compilation Pipeline
\item Inplace Optimization
\item Profiling
%exercises 4
%exercises 4: ProfileMode on logreg\_example, CPU vs GPU
\item Drawing/Printing Theano Graph
\item Debugging
\item Scan (For-Loop generalization)
%exercises 5: about scan
\item Known Limitations
\end{itemize} %& \includegraphics[width=1.in]{pics/theano_logo.png}
\begin{tabular}{lcr}
......@@ -213,7 +214,7 @@ HPCS 2011, Montr\'eal
\begin{itemize}
\item Introduction
\item Example
% PyCUDA Exercices
% Exercises 6: pycuda_simple.py
\end{itemize}
\item CUDA Overview
\item Extending Theano
......@@ -221,8 +222,9 @@ HPCS 2011, Montr\'eal
\item Theano Graph
\item Op Contract
\item Op Example
% Exercises 7: double.py
\item Theano + PyCUDA
% Theano+PyCUDA Exercises
% Exercises 8: pycuda_double_op.py
\end{itemize}
\item GpuNdArray
\item Conclusion
......@@ -522,7 +524,7 @@ rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats), rng.randint(size=N,low=0, high=2))
training_steps = 10
training_steps = 10000
\end{Verbatim}
\end{frame}
......@@ -657,7 +659,7 @@ Theano can be configured with flags. They can be defined in two ways
python logreg_example.py
\end{Verbatim}
\vfill
Modify and execute the example to run on CPU with floatX=float32
Modify and execute the example in the file logreg\_example.py to run on CPU with floatX=float32
* You will need to use: theano.config.floatX and ndarray.astype("str")
\end{frame}
......@@ -715,7 +717,6 @@ Computers in the class
\begin{itemize}
\item Modify and execute the code to run with floatX=float32 on GPU
\item Run the code on the GPU
\item Time with: \texttt{time python file.py}
\end{itemize}
\end{frame}
......@@ -752,7 +753,7 @@ Computers in the class
\end{itemize}
\vfill
\begin{itemize}
\item Broadcastability must be specified when creating the variable.
\item Broadcastability must be specified when creating the variable
\item The only shortcuts with broadcastable dimensions are: {\bf T.row} and {\bf T.col}
\item For all others: T.tensor(dtype, broadcastable={\bf ([False or True])*nd})
\end{itemize}
......@@ -849,16 +850,16 @@ To replace the default mode with this mode, use the Theano flags \texttt{mode=Pr
To enable the memory profiling use the flags \texttt{ProfileMode.profile\_memory=True}
\begin{Verbatim}
Time since import 2.697s
Theano compile time: 1.046s (38.8% since import)
Optimization time: 0.804s
Linker time: 0.230s
Theano fct call 0.028s (1.0% since import)
Theano Op time 0.026s 1.0%(since import) 93.7%(of fct call)
Theano function overhead in ProfileMode 0.002s 0.1%(since import)
6.3%(of fct call)
11 Theano fct call, 0.003s per call
Rest of the time since import 1.623s 60.2%
Time since import 33.456s
Theano compile time: 1.023s (3.1% since import)
Optimization time: 0.789s
Linker time: 0.221s
Theano fct call 30.878s (92.3% since import)
Theano Op time 29.411s 87.9%(since import) 95.3%(of fct call)
Theano function overhead in ProfileMode 1.466s 4.4%(since import)
4.7%(of fct call)
10001 Theano fct call, 0.003s per call
Rest of the time since import 1.555s 4.6%
\end{Verbatim}
\end{frame}
......@@ -869,8 +870,8 @@ Theano outputs:
\begin{Verbatim}
Theano fct summary:
<% total fct time> <total time> <time per call> <nb call> <fct name>
97.2% 0.027s 2.70e-03s 10 train
2.8% 0.001s 7.84e-04s 1 predict
100.0% 30.877s 3.09e-03s 10000 train
0.0% 0.000s 4.06e-04s 1 predict
\end{Verbatim}
\end{frame}
......@@ -883,13 +884,13 @@ Single Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call> <nb_call>
<nb_op> <nb_apply> <Op name>
82.0% 82.0% 0.021s 0.021s 2.13e-03s 10 1 1 <Gemv>
14.1% 96.1% 0.004s 0.025s 3.33e-04s 11 1 2 <Dot>
2.9% 98.9% 0.001s 0.026s 8.24e-06s * 91 10 10 <Elemwise>
0.6% 99.6% 0.000s 0.026s 1.69e-05s 10 1 1 <Alloc>
0.3% 99.9% 0.000s 0.026s 2.43e-06s * 31 2 4 <DimShuffle>
0.1% 100.0% 0.000s 0.026s 1.91e-06s * 10 1 1 <Sum>
0.0% 100.0% 0.000s 0.026s 1.19e-06s * 10 1 1 <Shape_i>
87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 1 1 <Gemv>
9.7% 97.0% 2.843s 28.515s 2.84e-04s 10001 1 2 <Dot>
2.4% 99.3% 0.691s 29.206s 7.68e-06s * 90001 10 10 <Elemwise>
0.4% 99.7% 0.127s 29.334s 1.27e-05s 10000 1 1 <Alloc>
0.2% 99.9% 0.053s 29.386s 1.75e-06s * 30001 2 4 <DimShuffle>
0.0% 100.0% 0.014s 29.400s 1.40e-06s * 10000 1 1 <Sum>
0.0% 100.0% 0.011s 29.411s 1.10e-06s * 10000 1 1 <Shape_i>
(*) Op is running a c implementation
\end{Verbatim}
\end{frame}
......@@ -903,15 +904,15 @@ Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call>
<nb_call> <nb apply> <Op name>
82.0% 82.0% 0.021s 0.021s 2.13e-03s 10 1 Gemv{inplace}
14.1% 96.1% 0.004s 0.025s 3.33e-04s 11 2 dot
1.4% 97.5% 0.000s 0.025s 3.63e-05s * 10 1 Elemwise{Composite{
87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 1 Gemv{inplace}
9.7% 97.0% 2.843s 28.515s 2.84e-04s 10001 2 dot
1.3% 98.2% 0.378s 28.893s 3.78e-05s * 10000 1 Elemwise{Composite{
scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}
0.6% 98.1% 0.000s 0.026s 1.69e-05s 10 1 Alloc
0.4% 98.5% 0.000s 0.026s 1.02e-05s * 10 1 Elemwise{Composite{
0.4% 98.7% 0.127s 29.021s 1.27e-05s 10000 1 Alloc
0.3% 99.0% 0.092s 29.112s 9.16e-06s * 10000 1 Elemwise{Composite{
exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)]
0.2% 99.0% 0.000s 0.026s 2.40e-06s * 21 3 InplaceDimShuffle{x}
... (remaining 11 Apply account for 1.3%(0.00s) of the runtime)
0.1% 99.3% 0.033s 29.265s 1.66e-06s * 20001 3 InplaceDimShuffle{x}
... (remaining 11 Apply account for 0.7%(0.00s) of the runtime)
(*) Op is running a c implementation
\end{Verbatim}
\end{frame}
......@@ -925,15 +926,15 @@ Apply-wise summary:
<% of local_time spent at this position> <cumulative %%>
<apply time> <cumulative seconds> <time per call>
<nb_call> <Apply position> <Apply Op name>
82.0% 82.0% 0.021s 0.021s 2.13e-03s 10 15 Gemv{inplace}(
w, TensorConstant{-0.1}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, TensorConstant{0.998})
11.5% 93.4% 0.003s 0.024s 2.99e-04s 10 1 dot(x, w)
2.6% 96.1% 0.001s 0.025s 6.81e-04s 1 1 dot(x, w)
1.4% 97.5% 0.000s 0.025s 3.63e-05s 10 9 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}(y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0)
0.6% 98.1% 0.000s 0.026s 1.69e-05s 10 10 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
0.4% 98.5% 0.000s 0.026s 1.02e-05s 10 13 Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)](Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)].0, Alloc.0, y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, InplaceDimShuffle{x}.0)
87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 15 Gemv{inplace}(
w, TensorConstant{-0.01}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, TensorConstant{0.9998})
9.7% 97.0% 2.843s 28.515s 2.84e-04s 10000 1 dot(x, w)
1.3% 98.2% 0.378s 28.893s 3.78e-05s 10000 9 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}(y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0)
0.4% 98.7% 0.127s 29.020s 1.27e-05s 10000 10 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
0.3% 99.0% 0.092s 29.112s 9.16e-06s 10000 13 Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)](Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)].0, Alloc.0, y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, InplaceDimShuffle{x}.0)
0.3% 99.3% 0.080s 29.192s 7.99e-06s 10000 11 Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)](Elemwise{neg,no_inplace}.0)
... (remaining 14 Apply instances account for
1.5%(0.00s) of the runtime)
0.7%(0.00s) of the runtime)
\end{Verbatim}
\end{frame}
......@@ -979,6 +980,7 @@ Test them first, as they are not guaranteed to always provide a speedup.
\begin{itemize}
\item In the last exercises, do you see a speed up with the GPU?
\item Where does it come from? (Use ProfileMode)
\item Is there something we can do to speed up the GPU version?
\end{itemize}
\end{frame}
......@@ -1167,7 +1169,8 @@ print calculate_polynomial(test_coeff, 3)
\frame{
\frametitle{Exercises 5}
\begin{itemize}
\item Scan: modify the polynomial example to have the reduction done by scan
\item Run the example in the file scan\_pow.py and scan\_poly.py
\item Modify and execute the polynomial example to have the reduction done by scan
\end{itemize}
}
......@@ -1335,9 +1338,9 @@ multiply_them(
}
\begin{frame}
\frametitle{PyCUDA Exercises}
\frametitle{Exercises 6}
\begin{itemize}
\item Run the example
\item Run the example in the file pycuda\_simple.py
\item Modify and execute it to work for a matrix of 20 $\times$ 10
\end{itemize}
\end{frame}
......@@ -1429,6 +1432,18 @@ print out
\end{Verbatim}
\end{frame}
\begin{frame}
\frametitle{Exercises 7}
\begin{itemize}
\item Run the code in the file double\_op.py.
\item Modify and execute to compute: $x * y$
\item Modify and execute the example to return 2 outputs: $x + y$ and $x - y$
\begin{itemize}
\item Our current elemwise fusion generates computations with only 1 output
\end{itemize}
\end{itemize}
\end{frame}
\subsection{Theano+PyCUDA}
\begin{frame}[fragile]
\frametitle{Theano+PyCUDA Op Example}
......@@ -1501,8 +1516,9 @@ print numpy.asarray(f(xv))
\end{frame}
\begin{frame}
\frametitle{Theano + PyCUDA Exercises}
\frametitle{Exercises 8}
\begin{itemize}
\item Run the example in the file pycuda\_double\_op.py
\item Modify and execute the example to multiply two matrices: $x * y$
\item Modify and execute the example to return 2 outputs: $x + y$ and $x - y$
\begin{itemize}
......
# pycuda_simple.py -- minimal PyCUDA example: elementwise multiply of two
# vectors on the GPU, then check the result against NumPy on the host.
# NOTE: Python 2 tutorial code (print statement); requires a CUDA-capable GPU.
import pycuda.autoinit
import pycuda.driver as drv
import numpy
from pycuda.compiler import SourceModule
# Compile a CUDA kernel from source at runtime. Each thread handles one
# element, indexed by its position within the block.
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
# Retrieve a Python-callable handle to the compiled kernel.
multiply_them = mod.get_function("multiply_them")
# Host-side inputs; float32 matches the kernel's `float` parameters.
a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)
# drv.In/drv.Out copy the arrays to/from the device around the launch.
# One block of 400 threads covers the 400 elements (one thread per element).
multiply_them(
drv.Out(dest), drv.In(a), drv.In(b),
block=(400,1,1), grid=(1,1))
# Verify the GPU result matches the host computation.
assert numpy.allclose(dest, a*b)
print dest
# scan_poly.py -- evaluate a polynomial with theano.scan: given coefficient
# vector c and scalar x, compute sum_i c[i] * x**i.
# NOTE: Python 2 tutorial code (print statement).
import numpy
import theano
import theano.tensor as T
# Symbolic inputs: the coefficient vector and the free variable x.
coefficients = theano.tensor.vector("coefficients")
x = T.scalar("x"); max_coefficients_supported = 10000
# Generate the components of the polynomial
# full_range supplies the exponent for each coefficient; scan stops at the
# shorter sequence (coefficients), so the large upper bound is harmless.
full_range=theano.tensor.arange(max_coefficients_supported)
# One scan step per coefficient: compute coeff * x**power. outputs_info=None
# means this is a map (no recurrence); x is passed unchanged each step.
components, updates = theano.scan(fn=lambda coeff, power, free_var:
coeff * (free_var ** power),
outputs_info=None,
sequences=[coefficients, full_range],
non_sequences=x)
# Reduce the per-term results to the polynomial's value.
polynomial = components.sum()
calculate_polynomial = theano.function(inputs=[coefficients, x],
outputs=polynomial)
# 1 + 0*x + 2*x**2 at x=3 -> 1 + 18 = 19.
test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
print calculate_polynomial(test_coeff, 3)
# 19.0
# scan_pow.py -- compute elementwise A**k with theano.scan by multiplying
# the running result by A for k steps.
# NOTE: Python 2 tutorial code (print statement).
import theano
import theano.tensor as T
# Symbolic inputs: integer exponent k and vector A.
k = T.iscalar("k"); A = T.vector("A")
# Recurrence step: multiply the previous result by A.
def inner_fct(prior_result, A): return prior_result * A
# Symbolic description of the result
# outputs_info=ones_like(A) seeds the recurrence at A**0; n_steps=k iterates
# the step k times, yielding A**1 ... A**k.
result, updates = theano.scan(fn=inner_fct,
outputs_info=T.ones_like(A),
non_sequences=A, n_steps=k)
# Scan has provided us with A**1 through A**k. Keep only the last
# value. Scan notices this and does not waste memory saving them.
final_result = result[-1]
power = theano.function(inputs=[A,k], outputs=final_result,
updates=updates)
# [0..9] squared elementwise: [0, 1, 4, ..., 81].
print power(range(10),2)
......@@ -21,7 +21,9 @@ since 2007. But it is also approachable enough to be used in the classroom
:scale: 75%
:align: left
**NEW!** You can watch a quick (20 minute) introduction to Theano given as a talk at `SciPy 2010 <http://conference.scipy.org/scipy2010/>`_ via streaming (or downloaded) video:
**NEW!** `HPCS 2011 Tutorial <http://www.iro.umontreal.ca/~lisa/pointeurs/tutorial_hpcs2011_fixed.pdf>`_. It includes a few fixes discovered while giving the tutorial.
You can watch a quick (20 minute) introduction to Theano given as a talk at `SciPy 2010 <http://conference.scipy.org/scipy2010/>`_ via streaming (or downloaded) video:
`Transparent GPU Computing With Theano`_.
James Bergstra, SciPy 2010, June 30, 2010.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论