提交 cd9e62a0 authored 作者: James Bergstra's avatar James Bergstra

merge NC

差异被折叠。
.. _extending_theano:
****************
Extending Theano
****************
Theano graphs
-------------
- Theano works with symbolic graphs
- Those graphs are bi-partite graphs (graph with 2 types of nodes)
- Those 2 nodes types are Apply and Variable nodes
Inputs and Outputs are lists of Theano variables
.. image:: pics/apply_node.png
:width: 500 px
Op contract
-----------
.. code-block:: python
import theano
class MyOp(Op):
def __eq__(self, other):
def __hash__(self):
def __str__(self):
def make_node(self, x):
# Python implementation:
def perform(self, node, inputs_storage, output_storage):
# C implementation: [see theano web site]
# others implementation (pycuda, ...):
def make_thunk(self, node, storage_map, _, _2):
# optional:
def __init__(self, ...):
def grad(self, inputs, g):
def infer_shape(node, (i0_shapes, ...))
Op example
----------
.. code-block:: python
import theano
class DoubleOp(theano.Op):
    """Element-wise Op returning its input multiplied by two.

    Minimal example of the Theano Op contract with a pure-Python
    implementation (``perform``). Indentation restored — the extracted
    slide had flattened it, which made the snippet non-runnable.
    """

    # __eq__/__hash__: Ops that compare equal may be merged during
    # graph optimization; this Op has no parameters, so type identity
    # is enough.
    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x):
        # Accept anything convertible to a tensor variable; the output
        # has the same type as the input.
        x = theano.tensor.as_tensor_variable(x)
        return theano.Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage):
        # Python implementation: write the result into the output
        # storage cell (a one-element list).
        x = inputs[0]
        z = output_storage[0]
        z[0] = x * 2
Test it!
>>> x = theano.tensor.matrix()
>>> f = theano.function([x],DoubleOp()(x))
>>> import numpy
>>> inp = numpy.random.rand(5,5)
>>> out = f(inp)
>>> assert numpy.allclose(inp*2, out)
>>> print inp
>>> print out
Exercises 7
-----------
- Run the code in the file double_op.py.
- Modify and execute to compute: x * y
- Modify and execute the example to return 2 outputs: x + y and x - y
- Our current elemwise fusion generates computations with only 1 output
Theano + PyCUDA
---------------
.. code-block:: python
import numpy, theano
import theano.misc.pycuda_init
from pycuda.compiler import SourceModule
import theano.sandbox.cuda as cuda
class PyCUDADoubleOp(theano.Op):
    """Double a float32 CUDA ndarray using a hand-written PyCUDA kernel.

    Demonstrates ``make_thunk``: the CUDA kernel is compiled once when
    the thunk is built, then launched on every call. Indentation
    restored — the extracted slide had flattened it.
    """

    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, inp):
        # The kernel indexes memory linearly, so force a C-contiguous
        # GPU array; only float32 is supported here.
        inp = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(inp))
        assert inp.dtype == "float32"
        return theano.Apply(self, [inp], [inp.type()])

    def make_thunk(self, node, storage_map, _, _2):
        # Compile the CUDA kernel once, at thunk-construction time.
        mod = SourceModule("""
        __global__ void my_fct(float * i0, float * o0, int size) {
            int i = blockIdx.x*blockDim.x + threadIdx.x;
            if(i<size){
                o0[i] = i0[i]*2;
            }
        }""")
        pycuda_fct = mod.get_function("my_fct")
        # storage_map maps each Variable to its one-element storage cell.
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            z = outputs[0]
            # (Re)allocate the output only when its shape changed.
            if z[0] is None or z[0].shape != inputs[0][0].shape:
                z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
            # One thread per element, 512 threads per block.
            grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
            pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                       block=(512, 1, 1), grid=grid)
        return thunk
Test it!
>>> x = theano.tensor.fmatrix()
>>> f = theano.function([x], PyCUDADoubleOp()(x))
>>> xv=numpy.ones((4,5), dtype="float32")
>>> assert numpy.allclose(f(xv), xv*2)
>>> print numpy.asarray(f(xv))
Exercises 8
-----------
- Run the above example
- Modify and execute the example to multiply two matrices: x * y
- Modify and execute the example to return 2 outputs: x + y and x - y
- Our current elemwise fusion generates computations with only 1 output
- Modify and execute the example to support stride? (Don't force the input to be c contiguous)
.. _gpundarray:
**********
GpuNdArray
**********
Why a common GPU ndarray?
- Currently there are at least 4 different GPU array data structures in use by Python packages
- CudaNdarray (Theano), GPUArray (PyCUDA), CUDAMatrix (cudamat), GPUArray (PyOpenCL), ...
- There are even more if we include other languages
- All of them are a subset of the functionality of ``numpy.ndarray`` on the GPU
- Lots of duplicated effort
- GPU code is harder/slower to do **correctly** and **fast** than CPU/Python code
- Lack of a common array API makes it harder to port/reuse code
- Also harder to find/distribute code
- Divides development work
Design Goals
- Make it VERY similar to ``numpy.ndarray``
- Be compatible with both CUDA and OpenCL
- Have the base object accessible from C to allow collaboration with more projects, across high-level languages
- We want people from C, C++, Ruby, R, ... all use the same base GPU N-dimensional array
Final GpuNdArray Note
- Under development
- Will be the next GPU array container for Theano (this summer!)
- Probably also for PyCUDA, PyOpenCL
- Mailing list: http://lists.tiker.net/listinfo/gpundarray
.. _index:
=========================
GPU programming made Easy
=========================
.. toctree::
introduction
theano
advanced_theano
pyCUDA
extending_theano
gpundarray
.. _introduction:
************
Introduction
************
Theano motivations
------------------
Theano tries to be the **holy grail** of computing: *easy to code* and *fast to execute*!
it works only on mathematical expressions, so you won't have:
- Function call inside a theano function
- Structure, enum
- Dynamic type (Theano is Fully typed)
Unfortunately it doesn't do coffee... yet.
.. image:: pics/Caffeine_Machine_no_background_red.png
Theano status
-------------
Why you can rely on Theano:
- Theano has been developed and used since January 2008 (3.5 yrs old)
- Core technology for a funded Silicon-Valley startup
- Driven over 40 research papers in the last few years
- Good user documentation
- Active mailing list with participants from outside our lab
- Many contributors (some from outside our lab)
- Used to teach IFT6266 for two years
- Used by everyone in our lab (~30 people)
- Deep Learning Tutorials
- Unofficial RPMs for Mandriva
- Downloads (June 8 2011, since last January): Pypi 780, MLOSS: 483, Assembla (``bleeding edge'' repository): unknown
Why scripting for GPUs ?
------------------------
**GPUs?**
- Faster, cheaper, more efficient power usage
- How much faster? I have seen numbers from 100x slower to 1000x faster.
- It depends on the algorithms
- How the benchmark is done
- Quality of implementation
- How much time was spent optimizing CPU vs GPU code
- In Theory:
- Intel Core i7 980 XE (107Gf/s float64) 6 cores
- NVIDIA C2050 (515 Gf/s float64, 1Tf/s float32) 480 cores
- NVIDIA GTX580 (1.5Tf/s float32) 512 cores
- Theano goes up to 100x faster on the GPU because we don't use multiple cores on the CPU
- Theano can be linked with multi-core capable BLAS (GEMM and GEMV)
- If you see 1000x, it probably means the benchmark is not fair
**Scripting for GPUs?**
They *Complement each other*
- GPUs are everything that scripting/high level languages are not
- Highly parallel
- Very architecture-sensitive
- Built for maximum FP/memory throughput
- CPU: largely restricted to control
- Optimized for sequential code and low latency (rather than high throughput)
- Tasks (1000/sec)
- Scripting fast enough
Theano vs PyCUDA vs PyOpenCL vs CUDA
------------------------------------
- Theano
- Mathematical expression compiler
- Generates custom C and CUDA code
- Uses Python code when performance is not critical
- CUDA
- C extension by NVIDIA that allows one to write code for and use the GPU
- PyCUDA (Python + CUDA)
- Python interface to CUDA
- Memory management of GPU objects
- Compilation of code for the low-level driver
- PyOpenCL (Python + OpenCL)
- PyCUDA for OpenCL
Python
------
- Interpreted language
- General-purpose high-level programming language
- OO and scripting language
- Emphasizes code readability
- Large and comprehensive standard library
- Indentation for block delimiters
- Dynamic type and memory management
- Dictionary ``d={'var1':'value1', 'var2':42, ...}``
- List comprehension: ``[i+3 for i in range(10)]``
NumPy
-----
- Base scientific computing package in Python on the CPU
- A powerful N-dimensional array object
- ndarray.{ndim, shape, size, dtype, itemsize, stride}
- Sophisticated broadcasting functions
- ``numpy.random.rand(4,5) * numpy.random.rand(1,5)`` -> mat(4,5)
- ``numpy.random.rand(4,5) * numpy.random.rand(4,1)`` -> mat(4,5)
- ``numpy.random.rand(4,5) * numpy.random.rand(5)`` -> mat(4,5)
- Tools for integrating C/C++ and Fortran code
- Linear algebra, Fourier transform and pseudorandom number generation
.. _pyCUDA:
******
PyCUDA
******
Introduction
------------
Author: Andreas Klöckner
- PyCUDA can access Nvidia's CUDA parallel computation API from Python
- Object cleanup tied to lifetime of objects (RAII, Resource Acquisition Is Initialization).
- Makes it much easier to write correct, leak- and crash-free code
- PyCUDA knows about dependencies (e.g. it won't detach from a context before all memory allocated in it is also freed)
- Convenience
- Abstractions to compile CUDA code from Python: ``pycuda.driver.SourceModule``
- A GPU memory buffer: ``pycuda.gpuarray.GPUArray``
- Completeness
- Binding to all of CUDA's driver API
- Automatic Error Checking
- All CUDA errors are automatically translated into Python exceptions
- Speed
- PyCUDA's base layer is written in C++
- Helpful documentation
Example
-------
.. code-block:: python
import pycuda.autoinit
import pycuda.driver as drv
import numpy
from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")
a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)
multiply_them(
drv.Out(dest), drv.In(a), drv.In(b),
block=(400,1,1), grid=(1,1))
assert numpy.allclose(dest, a*b)
print dest
Exercise 6
----------
- Run the above example
- Modify and execute it to work for a matrix of 20 x 10
差异被折叠。
......@@ -1183,36 +1183,46 @@ class _Linker(gof.link.LocalLinker):
thunks_py = [] #python thunks
thunks_c = [] #c thunks
compute_map = {}
for k in storage_map:
compute_map[k] = [k.owner is None]
for node in order:
node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs]
try:
if not self.maker.mode.check_c_code:
raise utils.MethodNotDefined()
e = Env(*graph.clone(node.inputs, node.outputs))
e.toposort = lambda: e.nodes #WARNING: STOCHASTIC ORDER
# Specifically... e.nodes is a set, but of only 1 element
cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs, node.outputs) if r2 in no_recycling])
thunk, node_input_filters, node_output_filters = cl.make_thunk(
input_storage = node_input_storage,
output_storage = node_output_storage)
thunk.inputs = node_input_storage
thunk.outputs = node_output_storage
thunks_c.append(thunk)
except (NotImplementedError, utils.MethodNotDefined):
thunks_c.append(None)
if hasattr(node.op, '_op_use_c_code'):
old_value = node.op._op_use_c_code
else:
old_value = False
try:
# ! Problem ! We do not know if make_thunk succedded into
# generating a cthunk, or if it reverted back to a python
# thunk, or if it is none of the above ...
node.op._op_use_c_code = True
tmp_thunk = node.op.make_thunk(node,
storage_map,
compute_map,
no_recycling)
if hasattr(tmp_thunk, 'cthunk'):
# Arbritrary check to see if it has a C implementation
thunks_c.append(tmp_thunk)
else:
thunks_c.append(None)
finally:
node.op._op_use_c_code = old_value
if self.maker.mode.check_py_code or thunks_c[-1] is None:
p = node.op.perform
thunk = (lambda p = p, i = node_input_storage, o = node_output_storage, n =
node: p(n, [x[0] for x in i], o))
thunk.inputs = node_input_storage
thunk.outputs = node_output_storage
thunk.perform = p
thunks_py.append(thunk)
try:
node.op._op_use_c_code = False
thunks_py += [node.op.make_thunk(node,
storage_map,
compute_map,
no_recycling)]
finally:
node.op._op_use_c_code = old_value
else:
thunks_py.append(None)
......@@ -1233,6 +1243,11 @@ class _Linker(gof.link.LocalLinker):
# This is the function that runs when you evaluate the graph
#####
def f():
####
# Note: `f` ignores the compute_map and evaluates the nodes in
# topological order. In some sense, this is ok, and can be used
# for now.
#####
_logger.debug("starting a DebugMode call")
for x in no_recycling:
x[0] = None
......
......@@ -401,7 +401,9 @@ class PerformLinker(LocalLinker):
for node in order:
# Maker sure we don't use C version of the code, but rather only
# the python version
old_value = node.op._op_use_c_code
# Note : ops that implement their own make thunk don't usually
# have this attribute defiend !!
old_value = getattr(node.op, '_op_use_c_code', False)
try:
node.op._op_use_c_code = False
thunks += [node.op.make_thunk(node,
......
......@@ -1063,6 +1063,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
start_from = env.outputs
changed = True
max_use_abort = False
opt_name = None
process_count = {}
while changed and not max_use_abort:
......@@ -1099,6 +1100,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
process_count.setdefault(lopt, 0)
if process_count[lopt] > max_use:
max_use_abort = True
opt_name = lopt.name
else:
lopt_change = self.process_node(env, node, lopt)
if lopt_change:
......@@ -1110,7 +1112,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.detach_updater(env, u)
self.detach_updater(env, u) #TODO: erase this line, it's redundant at best
if max_use_abort:
_logger.error("EquilibriumOptimizer max'ed out")
_logger.error("EquilibriumOptimizer max'ed out by "+opt_name)
def print_summary(self, stream=sys.stdout, level=0):
print >> stream, "%s%s id=%i" %(' '*level, self.__class__.__name__, id(self))
......
......@@ -168,7 +168,10 @@ class EquilibriumDB(DB):
opts = super(EquilibriumDB, self).query(*tags, **kwtags)
return opt.EquilibriumOptimizer(opts,
max_depth=5,
max_use_ratio=11,#upgraded to 11 to don't generated useless output in test.
max_use_ratio=50,#upgraded to 50 to avoid equibriumOptimizer
# to be max'ed out by constant folding (can
# I increase the max ratio only for
# constant folding somehow?
failure_callback=opt.NavigatorOptimizer.warn_inplace)
......
......@@ -71,6 +71,14 @@ def test_pycuda_memory_to_theano():
print "gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y)
assert sys.getrefcount(y)==3
assert (numpy.asarray(z) == 0).all()
assert z.base is y
# Test that we can take a view from this cuda view on pycuda memory
zz = z.view()
assert sys.getrefcount(y) == 4
assert zz.base is y
del zz
assert sys.getrefcount(y) == 3
cuda_ones = cuda_ndarray.CudaNdarray(numpy.asarray([[[1]]],dtype='float32'))
z += cuda_ones
......
......@@ -50,13 +50,7 @@ class HostFromGpu(Op):
z[0] = numpy.asarray(x)
def grad(self, inputs, grads):
gz, = grads
if isinstance(gz, tensor.TensorType):
# This would only happen if you call Lop, and provide a tensor
# that is not cuda
# This might require another look to be sure
return [gpu_from_host(gz)]
else:
return [gz]
return [gpu_from_host(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
......@@ -85,13 +79,7 @@ class GpuFromHost(Op):
z[0] = type_support_filter(theano._asarray(x, dtype='float32'), tuple([0]*x.ndim), 0, z[0])
def grad(self, inputs, grads):
gz, = grads
if isinstance(gz,CudaNdarrayType):
# This would only happen if you call Lop, and provide a tensor
# that is not cuda
# This might require another look to be sure
return [host_from_gpu(gz)]
else:
return [gz]
return [host_from_gpu(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
......
......@@ -2585,13 +2585,10 @@ int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * bas
// Get the original base object (base.base.base...)
PyObject * orig_base = base;
// base is not always a CudaNdarray. It can be a GpuArray from pycuda, ...
if (orig_base && CudaNdarray_Check(orig_base))
while (orig_base && CudaNdarray_Check(orig_base) && ((CudaNdarray*) orig_base)->base)
{
while (((CudaNdarray*) orig_base)->base)
{
// base_base is itself a view
orig_base = ((CudaNdarray*) orig_base)->base;
}
// base_base is itself a view
orig_base = ((CudaNdarray*) orig_base)->base;
}
//N.B. XDECREF and XINCREF are no-ops for NULL pointers
if (self->base != orig_base)
......
......@@ -590,7 +590,7 @@ def local_gpu_advanced_incsubtensor1(node):
gpu_from_host(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1
if node.op.__class__ is tensor.AdvancedSubtensor1 and node.inputs[0].dtype=="float32":
if node.op.__class__ is tensor.AdvancedIncSubtensor1 and node.inputs[0].dtype=="float32":
x, y = node.inputs[0:2]
coords = node.inputs[2:]
go_gpu = False
......
......@@ -806,6 +806,22 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
def __init__(self, name):
return super(theano.tensor.tests.test_basic.T_subtensor, self).__init__(name)
def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """
shared = cuda.shared_constructor
#shared = tensor.shared
xval = numpy.asarray([[1,2,3], [4,5,6], [7,8,9]],
dtype='float32')
yval = numpy.asarray([[10,10,10], [10,10,10]],
dtype='float32')
x = shared(xval, name = 'x')
y = T.fmatrices('y')
expr = T.advanced_inc_subtensor1(x,y,[0,2])
f=theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op,cuda.GpuAdvancedIncSubtensor1) for node in f.maker.env.toposort() ])==1
assert numpy.allclose(f(yval),[[11.,12.,13.], [4.,5.,6.], [17.,18.,19.]])
def test_inc_subtensor():
shared = cuda.shared_constructor
#shared = tensor.shared
......@@ -832,7 +848,6 @@ def test_set_subtensor():
dtype='float32')
expr = T.set_subtensor(x[:,1:3], y[:,1:3])
f=theano.function([x,y], expr, mode=mode_with_gpu)
print f.maker.env.toposort()
assert sum([isinstance(node.op,cuda.GpuSubtensor) for node in f.maker.env.toposort() ])==1
assert sum([isinstance(node.op,cuda.GpuIncSubtensor) and node.op.set_instead_of_inc==True for node in f.maker.env.toposort() ])==1
print f(xval,yval)
......
......@@ -116,7 +116,7 @@ def test_run_nnet():
rval_gpu, tg = run_nnet(True, n_in=n_in, n_hid=n_hid)
#print "cpu:", rval_cpu
#print "gpu:", rval_gpu
abs_diff, rel_diff = theano.tensor.basic.numeric_grad.abs_rel_err(rval_gpu,rval_cpu)
abs_diff, rel_diff = theano.tensor.tensor_grad.numeric_grad.abs_rel_err(rval_gpu,rval_cpu)
max_abs_diff = abs_diff.max()
print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d"%(
max_abs_diff, rel_diff.max(), n_in, n_hid)
......
......@@ -41,4 +41,4 @@ __contact__ = "Razvan Pascanu <r.pascanu@gmail>"
import scan_opt
from scan import scan
from scan_views import map, reduce, foldl, foldr
from scan_utils import clone
from scan_utils import clone, until
差异被折叠。
......@@ -102,31 +102,18 @@ def reduce( fn
:param name: See ``scan``.
"""
# Makes sure the outputs_info is a list.
if not isinstance(outputs_info, (list,tuple)):
outs_info = [outputs_info]
else:
outs_info = list(outputs_info)
for i,out_info in enumerate(outs_info):
if out_info:
if not isinstance(out_info, dict):
# Specifies that it should return only the last step.
outs_info[i] = dict(
initial = out_info, return_steps = 1)
else:
# Specifies that it should return only the last step.
outs_info[i]['return_steps'] = 1
# NOTE : If the user asks for more then the last step,
# it means he does not understand ``reduce``. We could
# issue a warning in that case
return scan.scan( fn = fn
rval = scan.scan( fn = fn
, sequences = sequences
, outputs_info = outs_info
, outputs_info = outputs_info
, non_sequences = non_sequences
, go_backwards = go_backwards
, truncate_gradient = -1
, mode = mode
, name = name )
if isinstance(rval[0], (list,tuple)):
return [ x[-1] for x in rval[0]], rval[1]
else:
return rval[0][-1], rval[1]
# The ``foldl`` view of Scan Op.
......
......@@ -2838,7 +2838,7 @@ def extract_constant(x):
if x.owner and isinstance(x.owner.op, ScalarFromTensor):
x = x.owner.inputs[0]
else:
x = tensor.tensor_from_scalar(x)
x = tensor_from_scalar(x)
return x
......
......@@ -1245,32 +1245,59 @@ def local_useless_subtensor(node):
shape_of = node.env.shape_feature.shape_of
node_input_idx = 1
for pos, idx in enumerate(node.op.idx_list):
if not isinstance(idx, slice):
# If idx is not a slice, this means we remove this dimension
# from the output, so the subtensor is not useless
return False
if idx.start not in [0,None]:
# If the start of the slice is different from 0, or is a
# variable, then we assume the subtensor is not useless
return False
if idx.step not in [1, None]:
# If we are going backwards, or skipping elements, then this
# is not a useless subtensor
return False
length_pos_data = sys.maxint
length_pos_shape_i = None
try:
length_pos = shape_of[node.inputs[0]][pos]
if isinstance(length_pos, theano.tensor.basic.TensorConstant):
length_pos_data = length_pos.data
else:
length_pos_shape_i = node.inputs[node_input_idx].owner.inputs[0]
try:
length_pos_data = get_constant_value(length_pos)
except TypeError:
pass
if isinstance(idx.stop, theano.scalar.Scalar):
if isinstance(node.inputs[node_input_idx].owner.op,
T.ScalarFromTensor):
length_pos_shape_i = node.inputs[node_input_idx].owner.inputs[0]
else:
length_pos_shape_i = node.inputs[node_input_idx]
assert length_pos_shape_i.type == idx.stop
# We already know that start and step are not variables
# and so they don't appear in the input of the node
node_input_idx += 1
# Catch exception from shape_of
except Exception, e:
length_pos = None
if ( isinstance(idx,slice) and
idx.start in [0,None] and
idx.step in [1,None] and
(idx.stop in [sys.maxint, None, length_pos_data] or
(isinstance(idx.stop, int) and idx.stop>=length_pos_data) or
(isinstance(idx.stop, theano.scalar.Scalar) and
length_pos==length_pos_shape_i)
)):
if isinstance(idx.stop, int):
if idx.stop < length_pos_data:
return False
elif isinstance(idx.stop, theano.scalar.Scalar):
if length_pos_shape_i is None:
return False
if length_pos is None:
return False
if length_pos_shape_i != length_pos:
return False
elif idx.stop is None:
pass
else:
return False
if isinstance(idx, slice):
node_input_idx += sum([isinstance(idx.start, theano.scalar.Scalar),
isinstance(idx.stop, theano.scalar.Scalar),
isinstance(idx.step, theano.scalar.Scalar)])
return [node.inputs[0]]
......
......@@ -136,6 +136,7 @@ def safe_make_node(op, *inputs):
return node[0].owner
else:
return node.owner
def makeTester(name, op, expected, checks = {}, good = {}, bad_build = {},
bad_runtime = {}, grad = {}, mode = None, grad_rtol=None,
eps = 1e-10, skip = False):
......@@ -146,7 +147,7 @@ def makeTester(name, op, expected, checks = {}, good = {}, bad_build = {},
class Checker(unittest.TestCase):
op = _op
op = staticmethod(_op)
expected = staticmethod(_expected)
checks = _checks
good = _good
......@@ -999,6 +1000,52 @@ SecondSameRankTester = makeTester(
mode=get_default_mode().excluding('local_fill_to_alloc')
)
### Alloc
AllocTester = makeBroadcastTester(
name = 'AllocTester',
op = alloc,
expected = (lambda x, *shp: numpy.zeros(shp, dtype=x.dtype) + x),
good = dict(
correct02 = (rand(), numpy.int32(4), numpy.int32(7)),
correct12 = (rand(7), numpy.int32(4), numpy.int32(7)),
correct13 = (rand(7), numpy.int32(2), numpy.int32(4), numpy.int32(7)),
correct23 = (rand(4,7), numpy.int32(2), numpy.int32(4), numpy.int32(7)),
),
bad_runtime = dict(
bad_shape12 = (rand(7), numpy.int32(7), numpy.int32(5)),
too_big32 = (rand(6,2,4), numpy.int32(6), numpy.int32(2)),
too_big32b = (rand(6,2,4), numpy.int32(2), numpy.int32(4)),
),
)
# Since not all inputs of Alloc are differentiable, we need different testers
s1, s2, s3 = randint_ranged(1, 13, (3,))
# alloc a scalar into a vector
Alloc01GradTester = makeBroadcastTester(
name = 'Alloc01GradTester',
#op = (lambda self, x: alloc(x, s1)),
op = (lambda x: alloc(x, s1)),
expected = (lambda x: numpy.zeros((s1,), dtype=x.dtype) + x),
grad = dict(
x1 = (rand(),),
x2 = (rand(),),
x3 = (rand(),),
),
)
# alloc a vector into a tensor3
Alloc13GradTester = makeBroadcastTester(
name = 'Alloc13GradTester',
#op = (lambda self, x: alloc(x, s1, s2, s3)),
op = (lambda x: alloc(x, s1, s2, s3)),
expected = (lambda x: numpy.zeros((s1, s2, s3), dtype=x.dtype) + x),
grad = dict(
x1 = (rand(s3),),
x2 = (rand(s3),),
x3 = (rand(s3),),
),
)
def test_eye():
def check(dtype, N, M_=None, k=0):
# Theano does not accept None as a tensor.
......
......@@ -13,30 +13,33 @@ ops without:
Prod
MulwithoutZeros
ProdWithoutZeros
CAReduce(for max,... done for MaxAndArgmax op)
list of ops that support R-op:
* with test
* SpecifyShape
* MaxAndArgmax
* Subtensor
* IncSubtensor set_subtensor too
* Alloc
* Dot
* Elemwise
* Sum
* Softmax
* Shape
* Join
* without test
* Split
* ARange
* ScalarFromTensor
* Shape
* SpecifyShape
* MaxAndArgmax
* Subtensor
* IncSubtensor
* Rebroadcast
* Join
* Reshape
* Flatten
* AdvancedSubtensor1
* AdvancedIncSubtensor1
* AdvancedIncSubtensor
* Dot
* DimShuffle
* Elemwise
* Sum
* Softmax
* Scan
......@@ -183,11 +186,17 @@ class test_RopLop(unittest.TestCase):
self.in_shape)
def test_max_argmax(self):
def test_max(self):
## If we call max directly, we will return an CAReduce object
## and he don't have R_op implemented!
#self.check_mat_rop_lop(TT.max(self.mx, axis=[0,1])[0],
# ())
self.check_mat_rop_lop(TT.max(self.mx, axis=0),
(self.mat_in_shape[1],))
self.check_mat_rop_lop(TT.max(self.mx, axis=1),
(self.mat_in_shape[0],))
def test_max_argmax(self):
def test_argmax(self):
self.check_nondiff_rop(TT.argmax(self.mx,axis=1))
def test_subtensor(self):
......@@ -201,7 +210,7 @@ class test_RopLop(unittest.TestCase):
self.check_rop_lop(out, self.in_shape)
def test_incsubtensor1(self):
def test_incsubtensor2(self):
tv = numpy.asarray( self.rng.uniform(size=(10,)),
theano.config.floatX)
t = theano.shared(tv)
......@@ -217,7 +226,7 @@ class test_RopLop(unittest.TestCase):
self.check_rop_lop(out, self.in_shape)
def test_setsubtensor1(self):
def test_setsubtensor2(self):
tv = numpy.asarray( self.rng.uniform(size=(10,)),
theano.config.floatX)
t = theano.shared(tv)
......
......@@ -4,15 +4,31 @@ This is a REALLY PARTIAL TEST.
I did them to help debug stuff.
"""
import logging
import StringIO
import theano
import theano.tensor as tensor
def test_pydotprint_cond_highlight():
assert len(theano.theano_logger.handlers) == 1
x = tensor.dvector()
f = theano.function([x], x*2)
f([1,2,3,4])
theano.printing.pydotprint(f, cond_highlight = True)
s = StringIO.StringIO()
new_handler = logging.StreamHandler(s)
new_handler.setLevel(logging.DEBUG)
orig_handler = theano.theano_logger.handlers[0]
theano.theano_logger.removeHandler(orig_handler)
theano.theano_logger.addHandler(new_handler)
try:
theano.printing.pydotprint(f, cond_highlight = True)
finally:
theano.theano_logger.addHandler(orig_handler)
theano.theano_logger.removeHandler(new_handler)
assert s.getvalue() == 'pydotprint: cond_highlight is set but there is no IfElse node in the graph\n'
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论