Commit 582b6a67 authored by Razvan Pascanu

merge; no conflicts

.. _extending_theano:
****************
Extending Theano
****************
Theano graphs
-------------
- Theano works with symbolic graphs
- Those graphs are bipartite graphs (graphs with two types of nodes)
- The two node types are Apply nodes and Variable nodes
- Inputs and outputs are lists of Theano variables
.. image:: pics/apply_node.png
:width: 500 px
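The bipartite structure pictured above can be sketched with plain Python classes. This is a hypothetical mock for illustration only, not Theano's actual implementation (the real ``Variable`` and ``Apply`` classes carry much more machinery):

```python
# Minimal mock of Theano's bipartite graph: Variable and Apply nodes.
class Variable(object):
    def __init__(self, name, owner=None):
        self.name = name
        self.owner = owner  # the Apply node that produced it, or None for inputs

class Apply(object):
    def __init__(self, op, inputs):
        self.op = op              # here just a string standing in for an Op
        self.inputs = inputs      # list of Variable nodes
        self.outputs = [Variable(op + "_out", owner=self)]

# Build the graph for z = add(x, y): Variables and Apply nodes alternate.
x, y = Variable("x"), Variable("y")
node = Apply("add", [x, y])
z = node.outputs[0]
assert z.owner is node
assert node.inputs == [x, y]
```

Walking ``owner`` and ``inputs`` pointers back from an output Variable is exactly how Theano traverses a graph.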
Op contract
-----------
.. code-block:: python

    import theano

    class MyOp(theano.Op):
        def __eq__(self, other): ...
        def __hash__(self): ...
        def __str__(self): ...
        def make_node(self, x): ...

        # Python implementation:
        def perform(self, node, inputs_storage, output_storage): ...

        # C implementation: [see the Theano web site]
        # Other implementations (PyCUDA, ...):
        def make_thunk(self, node, storage_map, _, _2): ...

        # Optional:
        def __init__(self, ...): ...
        def grad(self, inputs, g): ...
        def infer_shape(self, node, input_shapes): ...
Op example
----------
.. code-block:: python

    import theano

    class DoubleOp(theano.Op):
        def __eq__(self, other):
            return type(self) == type(other)
        def __hash__(self):
            return hash(type(self))
        def __str__(self):
            return self.__class__.__name__
        def make_node(self, x):
            x = theano.tensor.as_tensor_variable(x)
            return theano.Apply(self, [x], [x.type()])
        def perform(self, node, inputs, output_storage):
            x = inputs[0]
            z = output_storage[0]
            z[0] = x * 2
Test it!
>>> x = theano.tensor.matrix()
>>> f = theano.function([x],DoubleOp()(x))
>>> import numpy
>>> inp = numpy.random.rand(5,5)
>>> out = f(inp)
>>> assert numpy.allclose(inp*2, out)
>>> print inp
>>> print out
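The ``perform`` contract (inputs arrive as a list of numpy arrays, and the result is written into pre-allocated one-element storage cells) can be exercised without Theano at all. This numpy-only sketch mimics what Theano does when it calls ``perform`` on ``DoubleOp``:

```python
import numpy

def double_perform(inputs, output_storage):
    # Theano passes inputs as a list of numpy arrays and expects the
    # result to be written into output_storage[i][0], not returned.
    x = inputs[0]
    output_storage[0][0] = x * 2

storage = [[None]]                 # one output cell, as Theano would allocate
inp = numpy.random.rand(5, 5)
double_perform([inp], storage)
assert numpy.allclose(storage[0][0], inp * 2)
```

Writing into the storage cell (rather than returning a value) is what lets Theano reuse output buffers between calls.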
Exercises 7
-----------
- Run the code in the file double_op.py.
- Modify and execute it to compute: ``x * y``
- Modify and execute the example to return two outputs: ``x + y`` and ``x - y``
- Our current elemwise fusion generates computations with only one output
Theano + PyCUDA
---------------
.. code-block:: python

    import numpy, theano
    import theano.misc.pycuda_init
    from pycuda.compiler import SourceModule
    import theano.sandbox.cuda as cuda

    class PyCUDADoubleOp(theano.Op):
        def __eq__(self, other):
            return type(self) == type(other)
        def __hash__(self):
            return hash(type(self))
        def __str__(self):
            return self.__class__.__name__
        def make_node(self, inp):
            inp = cuda.basic_ops.gpu_contiguous(
                cuda.basic_ops.as_cuda_ndarray_variable(inp))
            assert inp.dtype == "float32"
            return theano.Apply(self, [inp], [inp.type()])
        def make_thunk(self, node, storage_map, _, _2):
            mod = SourceModule("""
            __global__ void my_fct(float * i0, float * o0, int size) {
                int i = blockIdx.x * blockDim.x + threadIdx.x;
                if (i < size) {
                    o0[i] = i0[i] * 2;
                }
            }""")
            pycuda_fct = mod.get_function("my_fct")
            inputs = [storage_map[v] for v in node.inputs]
            outputs = [storage_map[v] for v in node.outputs]
            def thunk():
                z = outputs[0]
                if z[0] is None or z[0].shape != inputs[0][0].shape:
                    z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
                grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
                pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                           block=(512, 1, 1), grid=grid)
            return thunk
Test it!
>>> x = theano.tensor.fmatrix()
>>> f = theano.function([x], PyCUDADoubleOp()(x))
>>> xv=numpy.ones((4,5), dtype="float32")
>>> assert numpy.allclose(f(xv), xv*2)
>>> print numpy.asarray(f(xv))
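The buffer-reuse logic inside ``thunk`` (reallocate the output only when it is missing or mis-shaped) does not depend on CUDA. Here is a numpy-only sketch of the same caching pattern, assuming the one-element storage-cell convention used above:

```python
import numpy

def make_double_thunk(input_cell, output_cell):
    def thunk():
        x = input_cell[0]
        z = output_cell[0]
        # Reallocate the output buffer only if missing or mis-shaped,
        # mirroring the CudaNdarray.zeros call in the GPU version.
        if z is None or z.shape != x.shape:
            z = numpy.zeros_like(x)
            output_cell[0] = z
        numpy.multiply(x, 2, out=z)
    return thunk

inp, out = [None], [None]
thunk = make_double_thunk(inp, out)
inp[0] = numpy.ones((4, 5), dtype="float32")
thunk()
first = out[0]
thunk()
assert out[0] is first               # buffer reused when the shape matches
assert numpy.allclose(out[0], 2.0)
```

Caching the output buffer this way avoids a (GPU) allocation on every call, which matters far more on the device than on the host.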
Exercises 8
-----------
- Run the above example
- Modify and execute the example to multiply two matrices: ``x * y``
- Modify and execute the example to return two outputs: ``x + y`` and ``x - y``
- Our current elemwise fusion generates computations with only one output
- Modify and execute the example to support strides (don't force the input to be C contiguous)
.. _gpundarray:
**********
GpuNdArray
**********
Why a common GPU ndarray?
- Currently there are at least 4 different GPU array data structures in use by Python packages
- CudaNdarray (Theano), GPUArray (PyCUDA), CUDAMatrix (cudamat), GPUArray (PyOpenCL), ...
- There are even more if we include other languages
- All of them implement a subset of the functionality of ``numpy.ndarray`` on the GPU
- Lots of duplicated effort
- GPU code is harder/slower to write **correctly** and **fast** than CPU/Python code
- The lack of a common array API makes it harder to port/reuse code
- Also harder to find/distribute code
- Divides development work
Design Goals
- Make it VERY similar to ``numpy.ndarray``
- Be compatible with both CUDA and OpenCL
- Have the base object accessible from C to allow collaboration with more projects, across high-level languages
- We want people using C, C++, Ruby, R, ... to all use the same base GPU N-dimensional array
Final GpuNdArray Note
- Under development
- Will be the next GPU array container for Theano (this summer!)
- Probably also for PyCUDA, PyOpenCL
- Mailing list: http://lists.tiker.net/listinfo/gpundarray
.. _index:
=========================
GPU programming made Easy
=========================
.. toctree::
introduction
theano
advanced_theano
pyCUDA
extending_theano
gpundarray
.. _introduction:
************
Introduction
************
Theano motivations
------------------
Theano tries to be the **holy grail** of computing: *easy to code* and *fast to execute*!
It works only on mathematical expressions, so you won't have:
- Function calls inside a Theano function
- Structures or enums
- Dynamic types (Theano is fully typed)
Unfortunately it doesn't make coffee... yet.
.. image:: pics/Caffeine_Machine_no_background_red.png
Theano status
-------------
Why you can rely on Theano:
- Theano has been developed and used since January 2008 (3.5 yrs old)
- Core technology for a funded Silicon-Valley startup
- Driven over 40 research papers in the last few years
- Good user documentation
- Active mailing list with participants from outside our lab
- Many contributors (some from outside our lab)
- Used to teach IFT6266 for two years
- Used by everyone in our lab (~30 people)
- Deep Learning Tutorials
- Unofficial RPMs for Mandriva
- Downloads (June 8 2011, since last January): PyPI 780, MLOSS 483, Assembla ("bleeding edge" repository): unknown
Why scripting for GPUs ?
------------------------
**GPUs?**
- Faster, cheaper, more efficient power usage
- How much faster? I have seen numbers from 100x slower to 1000x faster.
- It depends on the algorithms
- How the benchmark is done
- Quality of implementation
- How much time was spent optimizing CPU vs GPU code
- In Theory:
- Intel Core i7 980 XE (107Gf/s float64) 6 cores
- NVIDIA C2050 (515 Gf/s float64, 1Tf/s float32) 480 cores
- NVIDIA GTX580 (1.5Tf/s float32) 512 cores
- Theano goes up to 100x faster on the GPU because we don't use multiple cores on the CPU
- Theano can be linked with multi-core capable BLAS (GEMM and GEMV)
- If you see 1000x, it probably means the benchmark is not fair
**Scripting for GPUs?**
They *complement each other*:
- GPUs are everything that scripting/high level languages are not
- Highly parallel
- Very architecture-sensitive
- Built for maximum FP/memory throughput
- CPU: largely restricted to control
- Optimized for sequential code and low latency (rather than high throughput)
- Tasks (1000/sec)
- Scripting is fast enough for that
Theano vs PyCUDA vs PyOpenCL vs CUDA
------------------------------------
- Theano
- Mathematical expression compiler
- Generates custom C and CUDA code
- Uses Python code when performance is not critical
- CUDA
- A C language extension by NVIDIA that allows writing code for, and running it on, the GPU
- PyCUDA (Python + CUDA)
- Python interface to CUDA
- Memory management of GPU objects
- Compilation of code for the low-level driver
- PyOpenCL (Python + OpenCL)
- PyCUDA for OpenCL
Python
------
- Interpreted language
- General-purpose high-level programming language
- OO and scripting language
- Emphasizes code readability
- Large and comprehensive standard library
- Indentation for block delimiters
- Dynamic type and memory management
- Dictionary ``d={'var1':'value1', 'var2':42, ...}``
- List comprehensions: ``[i+3 for i in range(10)]``
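The two features on the last bullets can be checked in a couple of lines:

```python
# Dictionary and list comprehension, as on the slide.
d = {'var1': 'value1', 'var2': 42}
squares = [i + 3 for i in range(10)]
assert d['var2'] == 42
assert squares[:4] == [3, 4, 5, 6]
```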
NumPy
-----
- Base scientific computing package in Python on the CPU
- A powerful N-dimensional array object
- ndarray.{ndim, shape, size, dtype, itemsize, stride}
- Sophisticated broadcasting functions
- ``numpy.random.rand(4,5) * numpy.random.rand(1,5)`` -> mat(4,5)
- ``numpy.random.rand(4,5) * numpy.random.rand(4,1)`` -> mat(4,5)
- ``numpy.random.rand(4,5) * numpy.random.rand(5)`` -> mat(4,5)
- Tools for integrating C/C++ and Fortran code
- Linear algebra, Fourier transform and pseudorandom number generation
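The three broadcasting cases listed above can be verified directly:

```python
import numpy

a = numpy.random.rand(4, 5)
# A (1, 5) row, a (4, 1) column and a plain (5,) vector all broadcast
# against a (4, 5) matrix, giving a (4, 5) result in each case.
assert (a * numpy.random.rand(1, 5)).shape == (4, 5)
assert (a * numpy.random.rand(4, 1)).shape == (4, 5)
assert (a * numpy.random.rand(5)).shape == (4, 5)
```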
.. _pyCUDA:
******
PyCUDA
******
Introduction
------------
Author: Andreas Klöckner
- PyCUDA can access Nvidia's CUDA parallel computation API from Python
- Object cleanup tied to lifetime of objects (RAII, Resource Acquisition Is Initialization).
- Makes it much easier to write correct, leak- and crash-free code
- PyCUDA knows about dependencies (e.g. it won't detach from a context before all memory allocated in it is also freed)
- Convenience
- Abstractions to compile CUDA code from Python: ``pycuda.driver.SourceModule``
- A GPU memory buffer: ``pycuda.gpuarray.GPUArray``
- Completeness
- Binding to all of CUDA's driver API
- Automatic Error Checking
- All CUDA errors are automatically translated into Python exceptions
- Speed
- PyCUDA's base layer is written in C++
- Helpful documentation
Example
-------
.. code-block:: python

    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(400).astype(numpy.float32)
    b = numpy.random.randn(400).astype(numpy.float32)
    dest = numpy.zeros_like(a)
    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(400, 1, 1), grid=(1, 1))

    assert numpy.allclose(dest, a * b)
    print dest
Exercise 6
----------
- Run the above example
- Modify and execute it to work on a 20 x 10 matrix
......@@ -71,6 +71,14 @@ def test_pycuda_memory_to_theano():
print "gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y)
assert sys.getrefcount(y)==3
assert (numpy.asarray(z) == 0).all()
assert z.base is y
# Test that we can take a view from this cuda view on pycuda memory
zz = z.view()
assert sys.getrefcount(y) == 4
assert zz.base is y
del zz
assert sys.getrefcount(y) == 3
cuda_ones = cuda_ndarray.CudaNdarray(numpy.asarray([[[1]]],dtype='float32'))
z += cuda_ones
......
......@@ -50,13 +50,7 @@ class HostFromGpu(Op):
z[0] = numpy.asarray(x)
def grad(self, inputs, grads):
gz, = grads
if isinstance(gz, tensor.TensorType):
# This would only happen if you call Lop, and provide a tensor
# that is not cuda
# This might require another look to be sure
return [gpu_from_host(gz)]
else:
return [gz]
return [gpu_from_host(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
......@@ -85,13 +79,7 @@ class GpuFromHost(Op):
z[0] = type_support_filter(theano._asarray(x, dtype='float32'), tuple([0]*x.ndim), 0, z[0])
def grad(self, inputs, grads):
gz, = grads
if isinstance(gz,CudaNdarrayType):
# This would only happen if you call Lop, and provide a tensor
# that is not cuda
# This might require another look to be sure
return [host_from_gpu(gz)]
else:
return [gz]
return [host_from_gpu(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
......
......@@ -2585,13 +2585,10 @@ int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * bas
// Get the original base object (base.base.base...)
PyObject * orig_base = base;
// base is not always a CudaNdarray. It can be a GpuArray from pycuda, ...
if (orig_base && CudaNdarray_Check(orig_base))
while (orig_base && CudaNdarray_Check(orig_base) && ((CudaNdarray*) orig_base)->base)
{
while (((CudaNdarray*) orig_base)->base)
{
// base_base is itself a view
orig_base = ((CudaNdarray*) orig_base)->base;
}
// base_base is itself a view
orig_base = ((CudaNdarray*) orig_base)->base;
}
//N.B. XDECREF and XINCREF are no-ops for NULL pointers
if (self->base != orig_base)
......
......@@ -594,7 +594,7 @@ def local_gpu_advanced_incsubtensor1(node):
gpu_from_host(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1
if node.op.__class__ is tensor.AdvancedSubtensor1 and node.inputs[0].dtype=="float32":
if node.op.__class__ is tensor.AdvancedIncSubtensor1 and node.inputs[0].dtype=="float32":
x, y = node.inputs[0:2]
coords = node.inputs[2:]
go_gpu = False
......
......@@ -806,6 +806,22 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
def __init__(self, name):
return super(theano.tensor.tests.test_basic.T_subtensor, self).__init__(name)
def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """
shared = cuda.shared_constructor
#shared = tensor.shared
xval = numpy.asarray([[1,2,3], [4,5,6], [7,8,9]],
dtype='float32')
yval = numpy.asarray([[10,10,10], [10,10,10]],
dtype='float32')
x = shared(xval, name = 'x')
y = T.fmatrices('y')
expr = T.advanced_inc_subtensor1(x,y,[0,2])
f=theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op,cuda.GpuAdvancedIncSubtensor1) for node in f.maker.env.toposort() ])==1
assert numpy.allclose(f(yval),[[11.,12.,13.], [4.,5.,6.], [17.,18.,19.]])
def test_inc_subtensor():
shared = cuda.shared_constructor
#shared = tensor.shared
......@@ -832,7 +848,6 @@ def test_set_subtensor():
dtype='float32')
expr = T.set_subtensor(x[:,1:3], y[:,1:3])
f=theano.function([x,y], expr, mode=mode_with_gpu)
print f.maker.env.toposort()
assert sum([isinstance(node.op,cuda.GpuSubtensor) for node in f.maker.env.toposort() ])==1
assert sum([isinstance(node.op,cuda.GpuIncSubtensor) and node.op.set_instead_of_inc==True for node in f.maker.env.toposort() ])==1
print f(xval,yval)
......
......@@ -116,7 +116,7 @@ def test_run_nnet():
rval_gpu, tg = run_nnet(True, n_in=n_in, n_hid=n_hid)
#print "cpu:", rval_cpu
#print "gpu:", rval_gpu
abs_diff, rel_diff = theano.tensor.basic.numeric_grad.abs_rel_err(rval_gpu,rval_cpu)
abs_diff, rel_diff = theano.tensor.tensor_grad.numeric_grad.abs_rel_err(rval_gpu,rval_cpu)
max_abs_diff = abs_diff.max()
print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d"%(
max_abs_diff, rel_diff.max(), n_in, n_hid)
......
import numpy
from theano.gof import Variable, Op, utils, Type, Constant, Value, Apply
from theano.gof import Op, Apply
from theano.tensor import as_tensor_variable, dot, DimShuffle
from theano import tensor
......@@ -174,7 +174,7 @@ def is_positive(v):
print 'is_positive', v
if v.owner and v.owner.op == tensor.pow:
print 'try for pow', v, v.owner.inputs
try:
try:
exponent = tensor.get_constant_value(v.owner.inputs[1])
except TypeError:
return False
......@@ -530,5 +530,3 @@ class A_Xinv_b(Op):
gX = -matrix_dot(iX.T, a, gz, b.T, iX.T)
gb = matrix_dot(ix.T, a.T, gz)
return [ga, gX, gb]
......@@ -5,7 +5,18 @@ import theano.scipy # To know if scipy is available.
from theano import tensor, function
from theano.tensor.basic import _allclose
from theano.sandbox.linalg.ops import *
# The one in comment are not tested...
from theano.sandbox.linalg.ops import (cholesky,
matrix_inverse,
#solve,
#diag,
#extract_diag,
#alloc_diag,
det,
#PSD_hint,
#trace,
#spectral_radius_bound
)
from nose.plugins.skip import SkipTest
......@@ -21,7 +32,7 @@ if 0:
pd = numpy.dot(r,r.T)
x = tensor.matrix()
chol = Cholesky()(x)
chol = cholesky(x)
f = function([x], tensor.dot(chol, chol.T)) # an optimization could remove this
ch_f = function([x], chol)
......
......@@ -136,6 +136,7 @@ def safe_make_node(op, *inputs):
return node[0].owner
else:
return node.owner
def makeTester(name, op, expected, checks = {}, good = {}, bad_build = {},
bad_runtime = {}, grad = {}, mode = None, grad_rtol=None,
eps = 1e-10, skip = False):
......@@ -146,7 +147,7 @@ def makeTester(name, op, expected, checks = {}, good = {}, bad_build = {},
class Checker(unittest.TestCase):
op = _op
op = staticmethod(_op)
expected = staticmethod(_expected)
checks = _checks
good = _good
......@@ -999,6 +1000,52 @@ SecondSameRankTester = makeTester(
mode=get_default_mode().excluding('local_fill_to_alloc')
)
### Alloc
AllocTester = makeBroadcastTester(
name = 'AllocTester',
op = alloc,
expected = (lambda x, *shp: numpy.zeros(shp, dtype=x.dtype) + x),
good = dict(
correct02 = (rand(), numpy.int32(4), numpy.int32(7)),
correct12 = (rand(7), numpy.int32(4), numpy.int32(7)),
correct13 = (rand(7), numpy.int32(2), numpy.int32(4), numpy.int32(7)),
correct23 = (rand(4,7), numpy.int32(2), numpy.int32(4), numpy.int32(7)),
),
bad_runtime = dict(
bad_shape12 = (rand(7), numpy.int32(7), numpy.int32(5)),
too_big32 = (rand(6,2,4), numpy.int32(6), numpy.int32(2)),
too_big32b = (rand(6,2,4), numpy.int32(2), numpy.int32(4)),
),
)
# Since not all inputs of Alloc are differentiable, we need different testers
s1, s2, s3 = randint_ranged(1, 13, (3,))
# alloc a scalar into a vector
Alloc01GradTester = makeBroadcastTester(
name = 'Alloc01GradTester',
#op = (lambda self, x: alloc(x, s1)),
op = (lambda x: alloc(x, s1)),
expected = (lambda x: numpy.zeros((s1,), dtype=x.dtype) + x),
grad = dict(
x1 = (rand(),),
x2 = (rand(),),
x3 = (rand(),),
),
)
# alloc a vector into a tensor3
Alloc13GradTester = makeBroadcastTester(
name = 'Alloc13GradTester',
#op = (lambda self, x: alloc(x, s1, s2, s3)),
op = (lambda x: alloc(x, s1, s2, s3)),
expected = (lambda x: numpy.zeros((s1, s2, s3), dtype=x.dtype) + x),
grad = dict(
x1 = (rand(s3),),
x2 = (rand(s3),),
x3 = (rand(s3),),
),
)
def test_eye():
def check(dtype, N, M_=None, k=0):
# Theano does not accept None as a tensor.
......
......@@ -4,15 +4,31 @@ This is a REALLY PARTIAL TEST.
I did them to help debug stuff.
"""
import logging
import StringIO
import theano
import theano.tensor as tensor
def test_pydotprint_cond_highlight():
assert len(theano.theano_logger.handlers) == 1
x = tensor.dvector()
f = theano.function([x], x*2)
f([1,2,3,4])
theano.printing.pydotprint(f, cond_highlight = True)
s = StringIO.StringIO()
new_handler = logging.StreamHandler(s)
new_handler.setLevel(logging.DEBUG)
orig_handler = theano.theano_logger.handlers[0]
theano.theano_logger.removeHandler(orig_handler)
theano.theano_logger.addHandler(new_handler)
try:
theano.printing.pydotprint(f, cond_highlight = True)
finally:
theano.theano_logger.addHandler(orig_handler)
theano.theano_logger.removeHandler(new_handler)
assert s.getvalue() == 'pydotprint: cond_highlight is set but there is no IfElse node in the graph\n'