Commit 582b6a67, authored by Razvan Pascanu

merge; no conflicts

.. _advanced_theano:
***************
Advanced Theano
***************
Compilation pipeline
--------------------
.. image:: pics/pipeline.png
   :width: 400 px
Inplace optimization
--------------------
- Two types of inplace operations:
- An op that returns a view of its inputs (e.g. reshape, inplace transpose)
- An op that writes its output into the memory space of its inputs
- This enables memory optimizations
- Ops must tell Theano whether they work inplace
- Inplace Ops add constraints on the order of execution
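The two kinds can be illustrated with NumPy, which has the same view/inplace distinction; this is only a sketch of the concept, not Theano's internal mechanism:

```python
import numpy

a = numpy.arange(6)

# 1. A view: reshape returns a new array that shares a's memory.
v = a.reshape(2, 3)
assert v.base is a            # no copy was made

# 2. An inplace (destructive) write into the input's memory space.
numpy.multiply(a, 2, out=a)   # overwrites a's buffer
assert v[0, 1] == 2           # the view sees the change
```

This also shows why inplace ops constrain execution order: any consumer that needed the *original* value of ``a`` must run before the inplace write.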
Profiling
---------
- To replace the default mode with this mode, use the Theano flag ``mode=ProfileMode``
- To enable memory profiling, also use the flag ``ProfileMode.profile_memory=True``
Theano output:
.. code-block:: python
"""
Time since import 33.456s
Theano compile time: 1.023s (3.1% since import)
Optimization time: 0.789s
Linker time: 0.221s
Theano fct call 30.878s (92.3% since import)
Theano Op time 29.411s 87.9%(since import) 95.3%(of fct call)
Theano function overhead in ProfileMode 1.466s 4.4%(since import) 4.7%(of fct call)
10001 Theano fct call, 0.003s per call
Rest of the time since import 1.555s 4.6%
Theano fct summary:
<% total fct time> <total time> <time per call> <nb call> <fct name>
100.0% 30.877s 3.09e-03s 10000 train
0.0% 0.000s 4.06e-04s 1 predict
Single Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call> <nb_call>
<nb_op> <nb_apply> <Op name>
87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 1 1 <Gemv>
9.7% 97.0% 2.843s 28.515s 2.84e-04s 10001 1 2 <Dot>
2.4% 99.3% 0.691s 29.206s 7.68e-06s * 90001 10 10 <Elemwise>
0.4% 99.7% 0.127s 29.334s 1.27e-05s 10000 1 1 <Alloc>
0.2% 99.9% 0.053s 29.386s 1.75e-06s * 30001 2 4 <DimShuffle>
0.0% 100.0% 0.014s 29.400s 1.40e-06s * 10000 1 1 <Sum>
0.0% 100.0% 0.011s 29.411s 1.10e-06s * 10000 1 1 <Shape_i>
(*) Op is running a c implementation
Op-wise summary:
<% of local_time spent on this kind of Op> <cumulative %>
<self seconds> <cumulative seconds> <time per call>
<nb_call> <nb apply> <Op name>
87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 1 Gemv{inplace}
9.7% 97.0% 2.843s 28.515s 2.84e-04s 10001 2 dot
1.3% 98.2% 0.378s 28.893s 3.78e-05s * 10000 1 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}
0.4% 98.7% 0.127s 29.021s 1.27e-05s 10000 1 Alloc
0.3% 99.0% 0.092s 29.112s 9.16e-06s * 10000 1 Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)]
0.1% 99.3% 0.033s 29.265s 1.66e-06s * 20001 3 InplaceDimShuffle{x}
... (remaining 11 Apply account for 0.7%(0.00s) of the runtime)
(*) Op is running a c implementation
Apply-wise summary:
<% of local_time spent at this position> <cumulative %%>
<apply time> <cumulative seconds> <time per call>
<nb_call> <Apply position> <Apply Op name>
87.3% 87.3% 25.672s 25.672s 2.57e-03s 10000 15 Gemv{inplace}(w, TensorConstant{-0.01}, InplaceDimShuffle{1,0}.0, Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0, 0)].0, TensorConstant{0.9998})
9.7% 97.0% 2.843s 28.515s 2.84e-04s 10000 1 dot(x, w)
1.3% 98.2% 0.378s 28.893s 3.78e-05s 10000 9 Elemwise{Composite{scalar_softplus,{mul,scalar_softplus,{neg,mul,sub}}}}(y, Elemwise{Composite{neg,sub}}[(0, 0)].0, Elemwise{sub,no_inplace}.0, Elemwise{neg,no_inplace}.0)
0.4% 98.7% 0.127s 29.020s 1.27e-05s 10000 10 Alloc(Elemwise{inv,no_inplace}.0, Shape_i{0}.0)
0.3% 99.0% 0.092s 29.112s 9.16e-06s 10000 13 Elemwise{Composite{exp,{mul,{true_div,neg,{add,mul}}}}}[(0,0)](Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)].0, Alloc.0, y, Elemwise{Composite{neg,sub}}[(0,0)].0, Elemwise{sub,no_inplace}.0, InplaceDimShuffle{x}.0)
0.3% 99.3% 0.080s 29.192s 7.99e-06s 10000 11 Elemwise{ScalarSigmoid{output_types_preference=transfer_type{0}, _op_use_c_code=True}}[(0, 0)](Elemwise{neg,no_inplace}.0)
... (remaining 14 Apply instances account for 0.7%(0.00s) of the runtime)
Profile of Theano functions memory:
(This checks only the outputs of each apply node. It doesn't check the temporary memory used by the op inside the apply node.)
Theano fct: train
Max without gc, inplace and view (KB) 2481
Max FAST_RUN_NO_GC (KB) 16
Max FAST_RUN (KB) 16
Memory saved by view (KB) 2450
Memory saved by inplace (KB) 15
Memory saved by GC (KB) 0
<Sum apply outputs (bytes)> <Apply outputs memory size(bytes)>
<created/inplace/view> <Apply node>
<created/inplace/view> is taken from the op declaration, not ...
2508800B [2508800] v InplaceDimShuffle{1,0}(x)
6272B [6272] i Gemv{inplace}(w, ...)
3200B [3200] c Elemwise{Composite{...}}(y, ...)
Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list).
Test them first, as they are not guaranteed to always provide a speedup.
- Try the Theano flag floatX=float32
"""
Exercise 4
-----------
- In the last exercises, do you see a speed up with the GPU?
- Where does it come from? (Use ProfileMode)
- Is there something we can do to speed up the GPU version?
Printing/Drawing Theano graphs
------------------------------
- Pretty Printing
``theano.printing.pprint(variable)``
>>> theano.printing.pprint(prediction)
gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))),TensorConstant{0.5})
- Debug Print
``theano.printing.debugprint({fct, variable, list of variables})``
>>> theano.printing.debugprint(prediction)
Elemwise{gt,no_inplace} [@181772236] ''
|Elemwise{true_div,no_inplace} [@181746668] ''
| |InplaceDimShuffle{x} [@181746412] ''
| | |TensorConstant{1} [@181745836]
| |Elemwise{add,no_inplace} [@181745644] ''
| | |InplaceDimShuffle{x} [@181745420] ''
| | | |TensorConstant{1} [@181744844]
| | |Elemwise{exp,no_inplace} [@181744652] ''
| | | |Elemwise{sub,no_inplace} [@181744012] ''
| | | | |Elemwise{neg,no_inplace} [@181730764] ''
| | | | | |dot [@181729676] ''
| | | | | | |x [@181563948]
| | | | | | |w [@181729964]
| | | | |InplaceDimShuffle{x} [@181743788] ''
| | | | | |b [@181730156]
|InplaceDimShuffle{x} [@181771788] ''
| |TensorConstant{0.5} [@181771148]
>>> theano.printing.debugprint(predict)
Elemwise{Composite{neg,{sub,{{scalar_sigmoid,GT},neg}}}} [@183160204] '' 2
|dot [@183018796] '' 1
| |x [@183000780]
| |w [@183000812]
|InplaceDimShuffle{x} [@183133580] '' 0
| |b [@183000876]
|TensorConstant{[ 0.5]} [@183084108]
- Picture Printing of Graphs
>>> theano.printing.pydotprint_variables(prediction)
.. image:: pics/logreg_pydotprint_prediction.png
:width: 800 px
All pydotprint* functions require Graphviz and pydot
>>> theano.printing.pydotprint(predict)
.. image:: pics/logreg_pydotprint_predic.png
:width: 800 px
>>> theano.printing.pydotprint(train) # This is a small train example!
.. image:: pics/logreg_pydotprint_train.png
:width: 1500 px
Debugging
---------
- Run with the flag ``mode=DebugMode``
- 100-1000x slower
- Tests all optimization steps from the original graph to the final graph
- Checks many things that Ops should/shouldn't do
- Executes both the Python and C code versions
- Run with the Theano flag ``compute_test_value={off, ignore, warn, raise}``
- Runs the code as you build the graph
- Allows you to find bugs earlier (e.g. shape mismatch)
- Makes it easier to identify where the problem is in *your* code
- Uses the values of constants and shared variables directly
- For pure symbolic variables, set ``x.tag.test_value = numpy.random.rand(5, 10)``
- Run with the flag ``mode=FAST_COMPILE``
- Few optimizations
- Runs Python code (better error messages; can be debugged interactively in the Python debugger)
Loops
-----
**Scan**
- General form of **recurrence**, which can be used for looping
- **Reduction** and **map** (loop over the leading dimensions) are special cases of Scan
- You 'scan' a function along some input sequence, producing an output at each time-step
- The function can see the **previous K time-steps** of its output
- ``sum()`` could be computed by scanning the ``z + x(i)`` function over a list, given an initial state of ``z=0``
- Often a for loop can be expressed as a ``scan()`` operation; ``scan`` is the closest Theano comes to looping
- Advantages of ``scan`` over Python for loops:
- The number of iterations can be part of the symbolic graph
- Minimizes GPU transfers if a GPU is involved
- Computes gradients through sequential steps
- Slightly faster than a Python for loop around a compiled Theano function
- Can lower overall memory usage by detecting the actual amount of memory needed
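The reduction idea above (``sum()`` as a scan of ``z + x(i)`` with initial state ``z = 0``) can be sketched in plain Python. This shows only the *semantics* of ``scan``, not its implementation:

```python
def scan_sketch(fn, sequence, init):
    """Apply fn(element, previous_state) along sequence, keeping every state."""
    states = []
    state = init
    for element in sequence:
        state = fn(element, state)
        states.append(state)
    return states

# sum() as a scan: fn is z + x(i), with initial state z = 0
partial_sums = scan_sketch(lambda x, z: z + x, [1, 2, 3, 4], 0)
assert partial_sums == [1, 3, 6, 10]      # all intermediate states
assert partial_sums[-1] == sum([1, 2, 3, 4])  # keep only the last: reduction
```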
**Scan Example: Computing pow(A,k)**
.. code-block:: python

    import theano
    import theano.tensor as T

    k = T.iscalar("k")
    A = T.vector("A")

    def inner_fct(prior_result, A):
        return prior_result * A

    # Symbolic description of the result
    result, updates = theano.scan(fn=inner_fct,
                                  outputs_info=T.ones_like(A),
                                  non_sequences=A, n_steps=k)

    # Scan has provided us with A**1 through A**k.  Keep only the last
    # value. Scan notices this and does not waste memory saving them.
    final_result = result[-1]

    power = theano.function(inputs=[A, k], outputs=final_result,
                            updates=updates)

    print power(range(10), 2)
    # [ 0.  1.  4.  9. 16. 25. 36. 49. 64. 81.]
**Scan Example: Calculating a Polynomial**
.. code-block:: python

    import numpy
    import theano
    import theano.tensor as T

    coefficients = T.vector("coefficients")
    x = T.scalar("x")
    max_coefficients_supported = 10000

    # Generate the components of the polynomial
    full_range = T.arange(max_coefficients_supported)
    components, updates = theano.scan(fn=lambda coeff, power, free_var:
                                      coeff * (free_var ** power),
                                      outputs_info=None,
                                      sequences=[coefficients, full_range],
                                      non_sequences=x)
    polynomial = components.sum()
    calculate_polynomial = theano.function(inputs=[coefficients, x],
                                           outputs=polynomial)

    test_coeff = numpy.asarray([1, 0, 2], dtype=numpy.float32)
    print calculate_polynomial(test_coeff, 3)
    # 19.0
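If Theano is not installed, the scan-based polynomial can be cross-checked with a direct NumPy computation of the same per-step components, using the same test values:

```python
import numpy

coefficients = numpy.asarray([1, 0, 2], dtype=numpy.float32)
x = 3.0
powers = numpy.arange(len(coefficients))

# Same per-step computation the scan performs: coeff * (free_var ** power)
components = coefficients * x ** powers
polynomial = components.sum()
assert polynomial == 19.0   # 1*3**0 + 0*3**1 + 2*3**2
```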
Exercise 5
-----------
- Run both examples
- Modify and execute the polynomial example to have the reduction done by scan
Known limitations
-----------------
- Compilation phase distinct from execution phase
- Compilation time can be significant
- Amortize it with functions over big input or reuse functions
- Execution overhead
- Needs a certain number of operations to be useful
- We have started working on this in a branch
- Compilation time superlinear in the size of the graph.
- A few hundred nodes is fine
- Disabling a few optimizations can speed up compilation
- Usually too many nodes indicates a problem with the graph
- Lazy evaluation in a branch (We will try to merge this summer)
.. _extending_theano:
****************
Extending Theano
****************
Theano graphs
-------------
- Theano works with symbolic graphs
- Those graphs are bipartite graphs (graphs with two types of nodes)
- The two node types are Apply nodes and Variable nodes
- The inputs and outputs of an Apply node are lists of Theano variables
.. image:: pics/apply_node.png
:width: 500 px
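A minimal sketch of that bipartite structure, using hypothetical classes (not Theano's actual ``Variable``/``Apply`` implementation), just to show how the two node types link to each other:

```python
class Variable(object):
    def __init__(self, name, owner=None):
        self.name = name
        self.owner = owner          # the Apply node that produced it, or None

class Apply(object):
    def __init__(self, op_name, inputs):
        self.op_name = op_name
        self.inputs = inputs        # list of Variable nodes
        self.outputs = [Variable(op_name + ".out", owner=self)]

# Build the graph for z = add(x, y): Variable -> Apply -> Variable
x, y = Variable("x"), Variable("y")
add = Apply("add", [x, y])
z = add.outputs[0]

# Edges only ever connect the two node types, never Apply to Apply
assert z.owner is add and add.inputs == [x, y]
```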
Op contract
-----------
.. code-block:: python

    import theano

    class MyOp(theano.Op):
        def __eq__(self, other):
        def __hash__(self):
        def __str__(self):
        def make_node(self, x):

        # Python implementation:
        def perform(self, node, inputs_storage, output_storage):

        # C implementation: [see the Theano web site]
        # other implementations (PyCUDA, ...):
        def make_thunk(self, node, storage_map, _, _2):

        # optional:
        def __init__(self, ...):
        def grad(self, inputs, g):
        def infer_shape(self, node, input_shapes):
Op example
----------
.. code-block:: python

    import theano

    class DoubleOp(theano.Op):
        def __eq__(self, other):
            return type(self) == type(other)

        def __hash__(self):
            return hash(type(self))

        def __str__(self):
            return self.__class__.__name__

        def make_node(self, x):
            x = theano.tensor.as_tensor_variable(x)
            return theano.Apply(self, [x], [x.type()])

        def perform(self, node, inputs, output_storage):
            x = inputs[0]
            z = output_storage[0]
            z[0] = x * 2
Test it!
>>> x = theano.tensor.matrix()
>>> f = theano.function([x],DoubleOp()(x))
>>> import numpy
>>> inp = numpy.random.rand(5,5)
>>> out = f(inp)
>>> assert numpy.allclose(inp*2, out)
>>> print inp
>>> print out
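How ``perform`` communicates its result: ``output_storage`` is a list of single-element lists (storage cells) that the Op writes into. A NumPy-only sketch of that calling convention, without Theano:

```python
import numpy

class DoubleOpSketch(object):
    # Same signature shape as perform above; node is unused here.
    def perform(self, node, inputs, output_storage):
        x = inputs[0]
        output_storage[0][0] = x * 2    # write into the storage cell

storage = [[None]]                      # one output, not yet computed
DoubleOpSketch().perform(None, [numpy.arange(3)], storage)
assert list(storage[0][0]) == [0, 2, 4]
```

Reusing the cell is what lets Theano's runtime preallocate or garbage-collect output buffers between calls.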
Exercises 7
-----------
- Run the code in the file double_op.py.
- Modify and execute it to compute x * y
- Modify and execute the example to return two outputs: x + y and x - y
- Our current elemwise fusion only generates computations with a single output
Theano + PyCUDA
---------------
.. code-block:: python

    import numpy, theano
    import theano.misc.pycuda_init
    from pycuda.compiler import SourceModule
    import theano.sandbox.cuda as cuda

    class PyCUDADoubleOp(theano.Op):
        def __eq__(self, other):
            return type(self) == type(other)

        def __hash__(self):
            return hash(type(self))

        def __str__(self):
            return self.__class__.__name__

        def make_node(self, inp):
            inp = cuda.basic_ops.gpu_contiguous(
                cuda.basic_ops.as_cuda_ndarray_variable(inp))
            assert inp.dtype == "float32"
            return theano.Apply(self, [inp], [inp.type()])

        def make_thunk(self, node, storage_map, _, _2):
            mod = SourceModule("""
            __global__ void my_fct(float * i0, float * o0, int size) {
                int i = blockIdx.x*blockDim.x + threadIdx.x;
                if (i < size) {
                    o0[i] = i0[i] * 2;
                }
            }""")
            pycuda_fct = mod.get_function("my_fct")
            inputs = [storage_map[v] for v in node.inputs]
            outputs = [storage_map[v] for v in node.outputs]

            def thunk():
                z = outputs[0]
                if z[0] is None or z[0].shape != inputs[0][0].shape:
                    z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
                grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
                pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                           block=(512, 1, 1), grid=grid)
            return thunk
Test it!
>>> x = theano.tensor.fmatrix()
>>> f = theano.function([x], PyCUDADoubleOp()(x))
>>> xv=numpy.ones((4,5), dtype="float32")
>>> assert numpy.allclose(f(xv), xv*2)
>>> print numpy.asarray(f(xv))
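The grid computation in ``make_thunk`` above, ``int(numpy.ceil(size / 512.))``, just rounds the element count up to a whole number of 512-thread blocks; the kernel's ``if (i < size)`` guard discards the surplus threads. A quick check of that arithmetic:

```python
import numpy

def n_blocks(size, threads_per_block=512):
    # Enough blocks so that n_blocks * threads_per_block >= size
    return int(numpy.ceil(size / float(threads_per_block)))

assert n_blocks(512) == 1   # exact fit
assert n_blocks(513) == 2   # one extra block for the remainder
assert n_blocks(20) == 1    # the guard skips the 492 idle threads
```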
Exercises 8
-----------
- Run the above example
- Modify and execute the example to multiply two matrices: x * y
- Modify and execute the example to return two outputs: x + y and x - y
- Our current elemwise fusion only generates computations with a single output
- Modify and execute the example to support strides (don't force the input to be C-contiguous)
.. _gpundarray:
**********
GpuNdArray
**********
Why a common GPU ndarray?
- Currently there are at least 4 different GPU array data structures in use by Python packages
- CudaNdarray (Theano), GPUArray (PyCUDA), CUDAMatrix (cudamat), GPUArray (PyOpenCL), ...
- There are even more if we include other languages
- All of them are a subset of the functionality of ``numpy.ndarray`` on the GPU
- Lots of duplicated effort
- GPU code is harder/slower to write **correctly** and **fast** than CPU/Python code
- Lack of a common array API makes it harder to port/reuse code
- Also harder to find/distribute code
- Divides development work
Design Goals
- Make it VERY similar to ``numpy.ndarray``
- Be compatible with both CUDA and OpenCL
- Have the base object accessible from C to allow collaboration with more projects, across high-level languages
- We want people using C, C++, Ruby, R, ... to all use the same base GPU N-dimensional array
Final GpuNdArray Note
- Under development
- Will be the next GPU array container for Theano (this summer!)
- Probably also for PyCUDA, PyOpenCL
- Mailing list: http://lists.tiker.net/listinfo/gpundarray
.. _index:
=========================
GPU programming made Easy
=========================
.. toctree::
introduction
theano
advanced_theano
pyCUDA
extending_theano
gpundarray
.. _introduction:
************
Introduction
************
Theano motivations
------------------
Theano tries to be the **holy grail** of computing: *easy to code* and *fast to execute*!
It works only on mathematical expressions, so you won't have:
- Function calls inside a Theano function
- Structures or enums
- Dynamic typing (Theano is fully typed)
Unfortunately it doesn't make coffee... yet.
.. image:: pics/Caffeine_Machine_no_background_red.png
Theano status
-------------
Why you can rely on Theano:
- Theano has been developed and used since January 2008 (3.5 yrs old)
- Core technology for a funded Silicon-Valley startup
- Driven over 40 research papers in the last few years
- Good user documentation
- Active mailing list with participants from outside our lab
- Many contributors (some from outside our lab)
- Used to teach IFT6266 for two years
- Used by everyone in our lab (~30 people)
- Deep Learning Tutorials
- Unofficial RPMs for Mandriva
- Downloads (June 8 2011, since last January): PyPI: 780, MLOSS: 483, Assembla ("bleeding edge" repository): unknown
Why scripting for GPUs ?
------------------------
**GPUs?**
- Faster, cheaper, more efficient power usage
- How much faster? I have seen numbers from 100x slower to 1000x faster.
- It depends on the algorithms
- How the benchmark is done
- Quality of implementation
- How much time was spent optimizing CPU vs GPU code
- In Theory:
- Intel Core i7 980 XE (107Gf/s float64) 6 cores
- NVIDIA C2050 (515 Gf/s float64, 1Tf/s float32) 480 cores
- NVIDIA GTX580 (1.5Tf/s float32) 512 cores
- Theano goes up to 100x faster on the GPU because we don't use multiple cores on the CPU
- Theano can be linked with multi-core capable BLAS (GEMM and GEMV)
- If you see 1000x, it probably means the benchmark is not fair
**Scripting for GPUs?**
GPUs and scripting languages *complement each other*:
- GPUs are everything that scripting/high level languages are not
- Highly parallel
- Very architecture-sensitive
- Built for maximum FP/memory throughput
- CPU: largely restricted to control
- Optimized for sequential code and low latency (rather than high throughput)
- Tasks (1000/sec)
- Scripting fast enough
Theano vs PyCUDA vs PyOpenCL vs CUDA
------------------------------------
- Theano
- Mathematical expression compiler
- Generates custom C and CUDA code
- Uses Python code when performance is not critical
- CUDA
- C extension by NVIDIA that allows programming and using the GPU
- PyCUDA (Python + CUDA)
- Python interface to CUDA
- Memory management of GPU objects
- Compilation of code for the low-level driver
- PyOpenCL (Python + OpenCL)
- PyCUDA for OpenCL
Python
------
- Interpreted language
- General-purpose high-level programming language
- OO and scripting language
- Emphasizes code readability
- Large and comprehensive standard library
- Indentation for block delimiters
- Dynamic typing and memory management
- Dictionary ``d={'var1':'value1', 'var2':42, ...}``
- List comprehension: ``[i+3 for i in range(10)]``
NumPy
-----
- Base scientific computing package in Python on the CPU
- A powerful N-dimensional array object
- ndarray.{ndim, shape, size, dtype, itemsize, strides}
- Sophisticated broadcasting functions
- ``numpy.random.rand(4,5) * numpy.random.rand(1,5)`` -> mat(4,5)
- ``numpy.random.rand(4,5) * numpy.random.rand(4,1)`` -> mat(4,5)
- ``numpy.random.rand(4,5) * numpy.random.rand(5)`` -> mat(4,5)
- Tools for integrating C/C++ and Fortran code
- Linear algebra, Fourier transform and pseudorandom number generation
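The three broadcasting cases above can be verified directly; each pairing yields a (4, 5) result:

```python
import numpy

a = numpy.random.rand(4, 5)
assert (a * numpy.random.rand(1, 5)).shape == (4, 5)  # row broadcast down the rows
assert (a * numpy.random.rand(4, 1)).shape == (4, 5)  # column broadcast across columns
assert (a * numpy.random.rand(5)).shape == (4, 5)     # a 1-d array acts as a row
```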
.. _pyCUDA:
******
PyCUDA
******
Introduction
------------
Author: Andreas Klöckner
- PyCUDA can access Nvidia's CUDA parallel computation API from Python
- Object cleanup tied to lifetime of objects (RAII, Resource Acquisition Is Initialization).
- Makes it much easier to write correct, leak- and crash-free code
- PyCUDA knows about dependencies (e.g. it won't detach from a context before all memory allocated in it is freed)
- Convenience
- Abstractions to compile CUDA code from Python: ``pycuda.driver.SourceModule``
- A GPU memory buffer: ``pycuda.gpuarray.GPUArray``
- Completeness
- Binding to all of CUDA's driver API
- Automatic Error Checking
- All CUDA errors are automatically translated into Python exceptions
- Speed
- PyCUDA's base layer is written in C++
- Helpful documentation
Example
-------
.. code-block:: python

    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(400).astype(numpy.float32)
    b = numpy.random.randn(400).astype(numpy.float32)
    dest = numpy.zeros_like(a)

    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(400, 1, 1), grid=(1, 1))

    assert numpy.allclose(dest, a*b)
    print dest
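What the kernel does per thread: with a single block of 400 threads, thread ``i`` handles element ``i``. A pure-Python sketch of that index mapping, assuming the same one-block launch as above:

```python
import numpy

a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)

block_dim = 400                      # block=(400, 1, 1), grid=(1, 1)
for thread_idx in range(block_dim):  # each loop iteration = one GPU thread
    dest[thread_idx] = a[thread_idx] * b[thread_idx]

assert numpy.allclose(dest, a * b)
```

On the GPU, of course, the 400 "iterations" run concurrently rather than sequentially.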
Exercise 6
----------
- Run the above example
- Modify and execute it to work on a 20 x 10 matrix
.. _theano:
******
Theano
******
Pointers
--------
- http://deeplearning.net/software/theano/
- Announcements mailing list: http://groups.google.com/group/theano-announce
- User mailing list: http://groups.google.com/group/theano-users
- Deep Learning Tutorials: http://www.deeplearning.net/tutorial/
- Installation: https://deeplearning.net/software/theano/install.html
Description
-----------
- Mathematical symbolic expression compiler
- Dynamic C/CUDA code generation
- Efficient symbolic differentiation
- Theano computes derivatives of functions with one or many inputs.
- Speed and stability optimizations
- Gives the right answer for ``log(1+x)`` even if x is really tiny.
- Works on Linux, Mac and Windows
- Transparent use of a GPU
- float32 only for now (working on other data types)
- Doesn't work on Windows for now
- On GPU data-intensive calculations are typically between 6.5x and 44x faster. We've seen speedups up to 140x
- Extensive unit-testing and self-verification
- Detects and diagnoses many types of errors
- On CPU, common machine learning algorithms are 1.6x to 7.5x faster than competitive alternatives
- including specialized implementations in C/C++, NumPy, SciPy, and Matlab
- Expressions mimic NumPy's syntax & semantics
- Statically typed and purely functional
- Some sparse operations (CPU only)
- The project was started by James Bergstra and Olivier Breuleux
- For the past 1-2 years, I have replaced Olivier as lead contributor
Why is Theano better?
----------------------
Executing the code is faster because Theano:
- Rearranges high-level expressions
- Produces customized low-level code
- Uses a variety of backend technologies (GPU,...)
Writing the code is faster because:
- The high-level language allows you to **concentrate on the algorithm**
- Theano does **automatic optimization**
- No need to manually optimize each algorithm you want to test
- Theano does **automatic, efficient symbolic differentiation**
- No need to manually differentiate your functions (tedious & error-prone for complicated expressions!)
Simple example
--------------
>>> import theano
>>> a = theano.tensor.vector("a") # declare symbolic variable
>>> b = a + a**10 # build symbolic expression
>>> f = theano.function([a], b) # compile function
>>> print f([0,1,2]) # prints `array([0,2,1026])`
================================== ==================================
Unoptimized graph Optimized graph
================================== ==================================
.. image:: pics/f_unoptimized.png .. image:: pics/f_optimized.png
================================== ==================================
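If Theano is unavailable, the expected output can still be checked with NumPy alone, since the compiled function computes ``a + a**10`` elementwise:

```python
import numpy

a = numpy.array([0, 1, 2])
b = a + a ** 10
assert list(b) == [0, 2, 1026]   # 0+0**10, 1+1**10, 2+2**10
```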
Symbolic programming
- Paradigm shift: people need to use it to understand it
Exercise 1
-----------
.. code-block:: python
import theano
a = theano.tensor.vector("a") # declare variable
b = a + a**10 # build symbolic expression
f = theano.function([a], b) # compile function
print f([0,1,2])
# prints `array([0,2,1026])`
theano.printing.pydotprint_variables(b, outfile="f_unoptimized.png", var_with_name_simple=True)
theano.printing.pydotprint(f, outfile="f_optimized.png", var_with_name_simple=True)
Modify and execute the example to do this expression: a**2 + b**2 + 2*a*b
Real example
------------
**Logistic Regression**
- GPU-ready
- Symbolic differentiation
- Speed optimizations
- Stability optimizations
.. code-block:: python
import numpy
import theano
import theano.tensor as T
rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats), rng.randint(size=N,low=0, high=2))
training_steps = 10000
# Declare Theano symbolic variables
x = T.matrix("x")
y = T.vector("y")
w = theano.shared(rng.randn(feats), name="w")
b = theano.shared(0., name="b")
print "Initial model:"
print w.get_value(), b.get_value()
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b)) # Probability that target = 1
prediction = p_1 > 0.5 # The prediction thresholded
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1) # Cross-entropy loss function
cost = xent.mean() + 0.01*(w**2).sum() # The cost to minimize
gw,gb = T.grad(cost, [w,b])
# Compile
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
updates={w:w-0.1*gw, b:b-0.1*gb})
predict = theano.function(inputs=[x], outputs=prediction)
# Train
for i in range(training_steps):
pred, err = train(D[0], D[1])
print "Final model:"
print w.get_value(), b.get_value()
print "target values for D:", D[1]
print "prediction on D:", predict(D[0])
**Optimizations:**
.. code-block:: python
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b))
# 1 / (1 + T.exp(var)) -> sigmoid(var)
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1)
# Log(1-sigmoid(var)) -> -sigmoid(var)
prediction = p_1 > 0.5
cost = xent.mean() + 0.01*(w**2).sum()
gw,gb = T.grad(cost, [w,b])
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
# w-0.1*gw: GEMV with the dot in the grad
updates={w:w-0.1*gw, b:b-0.1*gb})
Where are those optimizations applied?
- ``log(1+exp(x))``
- ``1 / (1 + T.exp(var))`` (sigmoid)
- ``log(1-sigmoid(var))`` (softplus, stabilisation)
- GEMV (matrix-vector multiply from BLAS)
- Loop fusion
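Why the stability rewrites matter: evaluated naively in floating point, ``log(1+exp(x))`` overflows for large x, while an equivalent softplus formulation stays finite. A NumPy demonstration of the identity ``log(1+exp(x)) = log1p(exp(-|x|)) + max(x, 0)``:

```python
import numpy

x = 800.0

# Naive: exp(800) overflows to inf, so log(1 + exp(x)) comes out as inf
with numpy.errstate(over='ignore'):
    naive = numpy.log(1 + numpy.exp(x))
assert numpy.isinf(naive)

# Stable softplus: for large x, log(1 + exp(x)) ~= x
def softplus(x):
    return numpy.log1p(numpy.exp(-abs(x))) + max(x, 0)

assert abs(softplus(x) - 800.0) < 1e-6
assert abs(softplus(0.0) - numpy.log(2)) < 1e-12
```

The same mechanism stabilizes ``log(1 - sigmoid(x))``, which equals ``-softplus(x)``.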
Theano flags
------------
Theano can be configured with flags. They can be defined in two ways:
- With an environment variable: ``THEANO_FLAGS="mode=ProfileMode,ProfileMode.profile_memory=True"``
- With a configuration file, which defaults to ``~/.theanorc``
Exercise 2
-----------
.. code-block:: python
import numpy
import theano
import theano.tensor as T
rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats).astype(theano.config.floatX),
rng.randint(size=N,low=0, high=2).astype(theano.config.floatX))
training_steps = 10000
# Declare Theano symbolic variables
x = T.matrix("x")
y = T.vector("y")
w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w")
b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
#print "Initial model:"
#print w.get_value(), b.get_value()
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b)) # Probability of having a one
prediction = p_1 > 0.5 # The prediction that is done: 0 or 1
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1) # Cross-entropy
cost = xent.mean() + 0.01*(w**2).sum() # The cost to optimize
gw,gb = T.grad(cost, [w,b])
# Compile expressions to functions
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
updates={w:w-0.01*gw, b:b-0.01*gb},
name = "train")
predict = theano.function(inputs=[x], outputs=prediction,
name = "predict")
if any( [x.op.__class__.__name__=='Gemv' for x in
train.maker.env.toposort()]):
print 'Used the cpu'
elif any( [x.op.__class__.__name__=='GpuGemm' for x in
train.maker.env.toposort()]):
print 'Used the gpu'
else:
print 'ERROR, not able to tell if theano used the cpu or the gpu'
print train.maker.env.toposort()
for i in range(training_steps):
pred, err = train(D[0], D[1])
#print "Final model:"
#print w.get_value(), b.get_value()
print "target values for D"
print D[1]
print "prediction on D"
print predict(D[0])
# Print the graph used in the slides
theano.printing.pydotprint(predict,
outfile="pics/logreg_pydotprint_predic.png",
var_with_name_simple=True)
theano.printing.pydotprint_variables(prediction,
outfile="pics/logreg_pydotprint_prediction.png",
var_with_name_simple=True)
theano.printing.pydotprint(train,
outfile="pics/logreg_pydotprint_train.png",
var_with_name_simple=True)
Modify and execute the example to run on CPU with floatX=float32
* You will need to use: ``theano.config.floatX`` and ``ndarray.astype("str")``
GPU
---
- Only 32 bit floats are supported (being worked on)
- Only 1 GPU per process
- Use the Theano flag ``device=gpu`` to tell Theano to use the GPU
- Use ``device=gpu{0, 1, ...}`` to specify which GPU if you have more than one
- Shared variables with float32 dtype are by default moved to the GPU memory space
- Use the Theano flag ``floatX=float32``
- Be sure to use ``floatX`` (``theano.config.floatX``) in your code
- Cast inputs before putting them into a shared variable
- Cast "problem": int32 with float32 to float64
- A new casting mechanism is being developed
- Insert manual cast in your code or use [u]int{8,16}
- Insert manual cast around the mean operator (which involves a division by the length, which is an int64!)
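The cast "problem" in NumPy terms: combining int32 with float32 promotes the result to float64, which silently pushes the computation off the float32-only GPU path. For example:

```python
import numpy

i = numpy.arange(4, dtype=numpy.int32)
f = numpy.ones(4, dtype=numpy.float32)

assert (i * f).dtype == numpy.float64                        # promoted off float32
assert (i.astype(numpy.float32) * f).dtype == numpy.float32  # manual cast fixes it
assert (i.astype(numpy.int16) * f).dtype == numpy.float32    # int16 fits in float32
```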
Exercise 3
-----------
- Modify and execute the example of `Exercise 2`_ to run with floatX=float32 on GPU
- Time with: ``time python file.py``
Symbolic variables
------------------
- Number of dimensions
- T.scalar, T.vector, T.matrix, T.tensor3, T.tensor4
- Dtype
- T.[fdczbwil]vector (float32, float64, complex64, complex128, int8, int16, int32, int64)
- ``T.vector`` defaults to the floatX dtype
- floatX: configurable dtype that can be float32 or float64.
- Custom variable
- All are shortcuts to: ``T.tensor(dtype, broadcastable=[False]*nd)``
- Other dtype: uint[8,16,32,64], floatX
Creating symbolic variables: Broadcastability
- Remember what I said about broadcasting?
- How to add a row to all rows of a matrix?
- How to add a column to all columns of a matrix?
- Broadcastability must be specified when creating the variable
- The only shortcuts with broadcastable dimensions are **T.row** and **T.col**
- For all others: ``T.tensor(dtype, broadcastable=([False or True])*nd)``
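The two questions above, answered with NumPy (the behaviour that Theano's broadcastable ``row``/``col`` patterns mirror):

```python
import numpy

m = numpy.zeros((3, 4))
row = numpy.arange(4).reshape(1, 4)   # shape (1, 4): a "row"
col = numpy.arange(3).reshape(3, 1)   # shape (3, 1): a "col"

# The length-1 dimension is broadcast across the matrix
assert ((m + row)[2] == numpy.arange(4)).all()     # row added to every row
assert ((m + col)[:, 3] == numpy.arange(3)).all()  # col added to every column
```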
Differentiation details
-----------------------
>>> gw,gb = T.grad(cost, [w,b])
- T.grad works symbolically: takes and returns a Theano variable
- T.grad can be compared to a macro: it can be applied multiple times
- T.grad takes scalar costs only
- Simple recipe allows to compute efficiently vector x Jacobian and vector x Hessian
- We are working on the missing optimizations to efficiently compute the full Jacobian and Hessian, and Jacobian x vector
Benchmarks
----------
Example:
- Multi-layer perceptron
- Convolutional Neural Networks
- Misc Elemwise operations
Competitors: NumPy + SciPy, MATLAB, EBLearn, Torch5, numexpr
- EBLearn, Torch5: specialized libraries written by practitioners specifically for these tasks
- numexpr: similar to Theano, 'virtual machine' for elemwise expressions
**Multi-Layer Perceptron**:
60x784 matrix times 784x500 matrix, tanh, times 500x10 matrix, elemwise, then all in reverse for backpropagation
.. image:: pics/mlp.png
**Convolutional Network**:
256x256 images convolved with 6 7x7 filters,
downsampled to 6x50x50, tanh, convolution with 16 6x7x7 filter, elementwise
tanh, matrix multiply, softmax elementwise, then in reverse
.. image:: pics/conv.png
**Elemwise**
- All on CPU
- Solid blue: Theano
- Dashed Red: numexpr (without MKL)
.. image:: pics/multiple_graph.png
...@@ -71,6 +71,14 @@ def test_pycuda_memory_to_theano(): ...@@ -71,6 +71,14 @@ def test_pycuda_memory_to_theano():
print "gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y) print "gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y)
assert sys.getrefcount(y)==3 assert sys.getrefcount(y)==3
assert (numpy.asarray(z) == 0).all() assert (numpy.asarray(z) == 0).all()
assert z.base is y
# Test that we can take a view from this cuda view on pycuda memory
zz = z.view()
assert sys.getrefcount(y) == 4
assert zz.base is y
del zz
assert sys.getrefcount(y) == 3
cuda_ones = cuda_ndarray.CudaNdarray(numpy.asarray([[[1]]],dtype='float32')) cuda_ones = cuda_ndarray.CudaNdarray(numpy.asarray([[[1]]],dtype='float32'))
z += cuda_ones z += cuda_ones
......
...@@ -50,13 +50,7 @@ class HostFromGpu(Op): ...@@ -50,13 +50,7 @@ class HostFromGpu(Op):
z[0] = numpy.asarray(x) z[0] = numpy.asarray(x)
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
-if isinstance(gz, tensor.TensorType):
-    # This would only happen if you call Lop, and provide a tensor
-    # that is not cuda
-    # This might require another look to be sure
-    return [gpu_from_host(gz)]
-else:
-    return [gz]
+return [gpu_from_host(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
@@ -85,13 +79,7 @@ class GpuFromHost(Op):
z[0] = type_support_filter(theano._asarray(x, dtype='float32'), tuple([0]*x.ndim), 0, z[0])
def grad(self, inputs, grads):
gz, = grads
-if isinstance(gz, CudaNdarrayType):
-    # This would only happen if you call Lop, and provide a tensor
-    # that is not cuda
-    # This might require another look to be sure
-    return [host_from_gpu(gz)]
-else:
-    return [gz]
+return [host_from_gpu(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
...
@@ -2585,13 +2585,10 @@ int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * bas
// Get the original base object (base.base.base...)
PyObject * orig_base = base;
// base is not always a CudaNdarray. It can be a GpuArray from pycuda, ...
-if (orig_base && CudaNdarray_Check(orig_base))
-{
-    while (((CudaNdarray*) orig_base)->base)
-    {
-        // base_base is itself a view
-        orig_base = ((CudaNdarray*) orig_base)->base;
-    }
-}
+while (orig_base && CudaNdarray_Check(orig_base) && ((CudaNdarray*) orig_base)->base)
+{
+    // base_base is itself a view
+    orig_base = ((CudaNdarray*) orig_base)->base;
+}
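The rewritten loop walks the base chain to its root in a single pass; the same logic as a Python sketch (`View` here is a hypothetical stand-in for CudaNdarray, not an actual Theano class):

```python
class View(object):
    # hypothetical stand-in for CudaNdarray: base is None for a memory owner
    def __init__(self, base=None):
        self.base = base

def orig_base(obj):
    # follow base.base.base... until we reach the object that owns the memory
    while isinstance(obj, View) and obj.base is not None:
        obj = obj.base
    return obj

owner = View()
v = View(base=owner)
vv = View(base=v)
assert orig_base(vv) is owner
assert orig_base(owner) is owner
```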
//N.B. XDECREF and XINCREF are no-ops for NULL pointers
if (self->base != orig_base)
...
@@ -594,7 +594,7 @@ def local_gpu_advanced_incsubtensor1(node):
gpu_from_host(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1
-if node.op.__class__ is tensor.AdvancedSubtensor1 and node.inputs[0].dtype=="float32":
+if node.op.__class__ is tensor.AdvancedIncSubtensor1 and node.inputs[0].dtype=="float32":
x, y = node.inputs[0:2]
coords = node.inputs[2:]
go_gpu = False
...
@@ -806,6 +806,22 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
def __init__(self, name):
return super(theano.tensor.tests.test_basic.T_subtensor, self).__init__(name)
def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """
shared = cuda.shared_constructor
#shared = tensor.shared
xval = numpy.asarray([[1,2,3], [4,5,6], [7,8,9]],
dtype='float32')
yval = numpy.asarray([[10,10,10], [10,10,10]],
dtype='float32')
x = shared(xval, name = 'x')
y = T.fmatrices('y')
expr = T.advanced_inc_subtensor1(x,y,[0,2])
f=theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op,cuda.GpuAdvancedIncSubtensor1) for node in f.maker.env.toposort() ])==1
assert numpy.allclose(f(yval),[[11.,12.,13.], [4.,5.,6.], [17.,18.,19.]])
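The expected values in this new test follow NumPy's advanced-index increment semantics; the CPU equivalent of `advanced_inc_subtensor1(x, y, [0, 2])` can be checked directly:

```python
import numpy as np

x = np.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype='float32')
y = np.asarray([[10, 10, 10], [10, 10, 10]], dtype='float32')

out = x.copy()
np.add.at(out, [0, 2], y)   # add y's rows into rows 0 and 2 of out

assert np.allclose(out, [[11, 12, 13], [4, 5, 6], [17, 18, 19]])
```

`np.add.at` is used rather than `out[[0, 2]] += y` because it also handles repeated indices correctly, which is the behavior inc_subtensor-style ops need.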
def test_inc_subtensor():
shared = cuda.shared_constructor
#shared = tensor.shared
@@ -832,7 +848,6 @@ def test_set_subtensor():
dtype='float32')
expr = T.set_subtensor(x[:,1:3], y[:,1:3])
f=theano.function([x,y], expr, mode=mode_with_gpu)
-print f.maker.env.toposort()
assert sum([isinstance(node.op,cuda.GpuSubtensor) for node in f.maker.env.toposort() ])==1
assert sum([isinstance(node.op,cuda.GpuIncSubtensor) and node.op.set_instead_of_inc==True for node in f.maker.env.toposort() ])==1
print f(xval,yval)
...
@@ -116,7 +116,7 @@ def test_run_nnet():
rval_gpu, tg = run_nnet(True, n_in=n_in, n_hid=n_hid)
#print "cpu:", rval_cpu
#print "gpu:", rval_gpu
-abs_diff, rel_diff = theano.tensor.basic.numeric_grad.abs_rel_err(rval_gpu,rval_cpu)
+abs_diff, rel_diff = theano.tensor.tensor_grad.numeric_grad.abs_rel_err(rval_gpu,rval_cpu)
max_abs_diff = abs_diff.max()
print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d"%(
max_abs_diff, rel_diff.max(), n_in, n_hid)
...
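The comparison above relies on elementwise absolute and relative errors; a sketch of what `abs_rel_err` computes (the exact formula and epsilon handling in Theano's `numeric_grad` are an assumption here):

```python
import numpy as np

def abs_rel_err(a, b, eps=1e-10):
    # hypothetical reimplementation: elementwise absolute and relative error
    abs_err = np.abs(a - b)
    rel_err = abs_err / np.maximum(np.abs(a) + np.abs(b), eps)
    return abs_err, rel_err

a = np.array([1.0, 2.0, 0.0])
b = np.array([1.0, 2.5, 0.0])
abs_err, rel_err = abs_rel_err(a, b)
assert np.allclose(abs_err, [0.0, 0.5, 0.0])
assert np.isclose(rel_err[1], 0.5 / 4.5)
```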
import numpy
-from theano.gof import Variable, Op, utils, Type, Constant, Value, Apply
+from theano.gof import Op, Apply
from theano.tensor import as_tensor_variable, dot, DimShuffle
from theano import tensor
@@ -174,7 +174,7 @@ def is_positive(v):
print 'is_positive', v
if v.owner and v.owner.op == tensor.pow:
print 'try for pow', v, v.owner.inputs
try:
exponent = tensor.get_constant_value(v.owner.inputs[1])
except TypeError:
return False
@@ -530,5 +530,3 @@ class A_Xinv_b(Op):
gX = -matrix_dot(iX.T, a, gz, b.T, iX.T)
gb = matrix_dot(ix.T, a.T, gz)
return [ga, gX, gb]
@@ -5,7 +5,18 @@ import theano.scipy  # To know if scipy is available.
from theano import tensor, function
from theano.tensor.basic import _allclose
-from theano.sandbox.linalg.ops import *
+# The one in comment are not tested...
from theano.sandbox.linalg.ops import (cholesky,
matrix_inverse,
#solve,
#diag,
#extract_diag,
#alloc_diag,
det,
#PSD_hint,
#trace,
#spectral_radius_bound
)
from nose.plugins.skip import SkipTest
@@ -21,7 +32,7 @@ if 0:
pd = numpy.dot(r,r.T)
x = tensor.matrix()
-chol = Cholesky()(x)
+chol = cholesky(x)
f = function([x], tensor.dot(chol, chol.T)) # an optimization could remove this
ch_f = function([x], chol)
...
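The property this (disabled) test relies on, that `dot(chol, chol.T)` reconstructs the positive-definite input, can be sketched with NumPy's Cholesky:

```python
import numpy as np

rng = np.random.RandomState(0)
r = rng.rand(4, 4)
pd = np.dot(r, r.T) + 4 * np.eye(4)   # symmetric positive definite

L = np.linalg.cholesky(pd)            # lower-triangular factor
assert np.allclose(np.dot(L, L.T), pd)
```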
@@ -136,6 +136,7 @@ def safe_make_node(op, *inputs):
return node[0].owner
else:
return node.owner
def makeTester(name, op, expected, checks = {}, good = {}, bad_build = {},
bad_runtime = {}, grad = {}, mode = None, grad_rtol=None,
eps = 1e-10, skip = False):
@@ -146,7 +147,7 @@ def makeTester(name, op, expected, checks = {}, good = {}, bad_build = {},
class Checker(unittest.TestCase):
-    op = _op
+    op = staticmethod(_op)
expected = staticmethod(_expected)
checks = _checks
good = _good
@@ -999,6 +1000,52 @@ SecondSameRankTester = makeTester(
mode=get_default_mode().excluding('local_fill_to_alloc')
)
### Alloc
AllocTester = makeBroadcastTester(
name = 'AllocTester',
op = alloc,
expected = (lambda x, *shp: numpy.zeros(shp, dtype=x.dtype) + x),
good = dict(
correct02 = (rand(), numpy.int32(4), numpy.int32(7)),
correct12 = (rand(7), numpy.int32(4), numpy.int32(7)),
correct13 = (rand(7), numpy.int32(2), numpy.int32(4), numpy.int32(7)),
correct23 = (rand(4,7), numpy.int32(2), numpy.int32(4), numpy.int32(7)),
),
bad_runtime = dict(
bad_shape12 = (rand(7), numpy.int32(7), numpy.int32(5)),
too_big32 = (rand(6,2,4), numpy.int32(6), numpy.int32(2)),
too_big32b = (rand(6,2,4), numpy.int32(2), numpy.int32(4)),
),
)
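The `expected` callable of the new AllocTester relies on broadcasting: alloc(x, *shp) fills an array of shape shp with x. A NumPy check of the same rule, including why `bad_shape12` fails:

```python
import numpy as np

def alloc_ref(x, *shp):
    # reference semantics of tensor.alloc: broadcast x into shape shp
    return np.zeros(shp, dtype=np.asarray(x).dtype) + x

v = np.arange(7, dtype='float32')
out = alloc_ref(v, 4, 7)          # a (7,) vector broadcast to (4, 7)
assert out.shape == (4, 7)
assert np.allclose(out[0], v)
assert np.allclose(out[3], v)

# bad_shape12: a (7,) vector cannot broadcast into shape (7, 5)
try:
    alloc_ref(v, 7, 5)
except ValueError:
    pass
else:
    raise AssertionError("expected a broadcasting error")
```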
# Since not all inputs of Alloc are differentiable, we need different testers
s1, s2, s3 = randint_ranged(1, 13, (3,))
# alloc a scalar into a vector
Alloc01GradTester = makeBroadcastTester(
name = 'Alloc01GradTester',
#op = (lambda self, x: alloc(x, s1)),
op = (lambda x: alloc(x, s1)),
expected = (lambda x: numpy.zeros((s1,), dtype=x.dtype) + x),
grad = dict(
x1 = (rand(),),
x2 = (rand(),),
x3 = (rand(),),
),
)
# alloc a vector into a tensor3
Alloc13GradTester = makeBroadcastTester(
name = 'Alloc13GradTester',
#op = (lambda self, x: alloc(x, s1, s2, s3)),
op = (lambda x: alloc(x, s1, s2, s3)),
expected = (lambda x: numpy.zeros((s1, s2, s3), dtype=x.dtype) + x),
grad = dict(
x1 = (rand(s3),),
x2 = (rand(s3),),
x3 = (rand(s3),),
),
)
def test_eye():
def check(dtype, N, M_=None, k=0):
# Theano does not accept None as a tensor.
...
@@ -4,15 +4,31 @@ This is a REALLY PARTIAL TEST.
I did them to help debug stuff.
"""
import logging
import StringIO
import theano
import theano.tensor as tensor
def test_pydotprint_cond_highlight():
assert len(theano.theano_logger.handlers) == 1
x = tensor.dvector()
f = theano.function([x], x*2)
f([1,2,3,4])
-theano.printing.pydotprint(f, cond_highlight = True)
+s = StringIO.StringIO()
new_handler = logging.StreamHandler(s)
new_handler.setLevel(logging.DEBUG)
orig_handler = theano.theano_logger.handlers[0]
theano.theano_logger.removeHandler(orig_handler)
theano.theano_logger.addHandler(new_handler)
try:
theano.printing.pydotprint(f, cond_highlight = True)
finally:
theano.theano_logger.addHandler(orig_handler)
theano.theano_logger.removeHandler(new_handler)
assert s.getvalue() == 'pydotprint: cond_highlight is set but there is no IfElse node in the graph\n'
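The handler swap in the test above is a standard way to capture logger output; a self-contained sketch with a hypothetical logger name (Python 3's `io.StringIO` replaces the old `StringIO` module used in the diff):

```python
import io
import logging

logger = logging.getLogger('demo')    # hypothetical logger name
logger.setLevel(logging.DEBUG)

s = io.StringIO()
handler = logging.StreamHandler(s)    # writes records into the buffer
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)
try:
    logger.info('cond_highlight is set but there is no IfElse node in the graph')
finally:
    logger.removeHandler(handler)

# the default Formatter emits just the message followed by a newline
assert s.getvalue() == 'cond_highlight is set but there is no IfElse node in the graph\n'
```

Restoring the original handler in a `finally` block, as the test does, keeps the logger usable even if the call under test raises.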