Commit 9bbf919e authored by Yann N. Dauphin
......@@ -17,6 +17,7 @@ install:
# So we test with 0.8. Our internal buildbot has 0.7.2.
# We install it later only for the PART that needs it.
# - "pip install -q scipy==0.8 --use-mirrors"
- "pip install nose-timelimit --use-mirrors"
- "pip install . --no-deps --use-mirrors"
# command to run tests
env:
......@@ -36,7 +37,7 @@ script:
- df -h
- ulimit -a
- echo $PART
- theano-nose -v $PART
- theano-nose --with-timelimit -v $PART
#after_script:
......
......@@ -558,9 +558,9 @@ default, it will recompile the c code for each process.
Shape and Shape_i
=================
We have 2 generic Ops, Shape and Shape_i, that return the shape of any
Theano Variable that has a shape attribute (Shape_i returns only one of
the elements of the shape).
.. code-block:: python
......@@ -568,5 +568,5 @@ the element of the shape.
theano.compile.ops.register_shape_c_code(YOUR_TYPE_CLASS, THE_C_CODE, version=())
theano.compile.ops.register_shape_i_c_code(YOUR_TYPE_CLASS, THE_C_CODE, version=())
The C code works as the ViewOp. Shape_i has the additional ``i`` parameter
that you can use with ``%(i)s``.
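The substitution of the ``i`` parameter into the registered C code follows Python's %-formatting, which can be illustrated without Theano (``iname`` below is a hypothetical placeholder name, not the actual template Theano uses):

```python
# Illustrative only: the registered C code behaves like a Python
# %-format template; Shape_i's extra i parameter is filled in via %(i)s.
c_code_template = "npy_intp shape_i = PyArray_DIMS(%(iname)s)[%(i)s];"
rendered = c_code_template % {"iname": "x", "i": 0}
print(rendered)  # npy_intp shape_i = PyArray_DIMS(x)[0];
```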
......@@ -20,7 +20,9 @@ since 2007. But it is also approachable enough to be used in the classroom
News
====
* Ian Goodfellow did a `12h class with exercises on Theano <https://github.com/goodfeli/theano_exercises>`_.
* Theano 0.6 was released. Everybody is encouraged to update.
* New technical report on Theano: `Theano: new features and speed improvements <http://arxiv.org/abs/1211.5590>`_.
However, please keep citing the other paper below in scientific work involving Theano.
......
......@@ -234,8 +234,8 @@ From here, the easiest way to get started is (this requires setuptools_ or distr
.. note::
"python setup.py develop ..." does not work on Python 3 as it does not call
the converter from Python 2 code to Python 3 code.
This will install a ``.pth`` file in your ``site-packages`` directory that
tells Python where to look for your Theano installation (i.e. in the
......
......@@ -3,8 +3,8 @@
Easy Installation of an optimized Theano on Ubuntu
==================================================
These instructions were written for Ubuntu 11.04, 11.10, 12.04, 12.10, 13.04
and 13.10. You can probably do something similar on older versions.
.. note::
......@@ -49,7 +49,7 @@ probably do something similar on older computer.
Installation steps
~~~~~~~~~~~~~~~~~~
Ubuntu 11.10/12.04/12.10/13.04/13.10:
1) ``sudo apt-get install python-numpy python-scipy python-dev python-pip python-nose g++ libopenblas-dev git``
2) ``sudo pip install Theano``
......@@ -167,7 +167,7 @@ yourself. Here is some code that will help you.
make FC=gfortran
sudo make PREFIX=/usr/local/ install
# Tell Theano to use OpenBLAS.
# This works only for the current user.
# Each Theano user on that computer should run that line.
echo -e "\n[blas]\nldflags = -lopenblas\n" >> ~/.theanorc
......@@ -236,15 +236,4 @@ Test GPU configuration
Ubuntu 12.10: default gcc version 4.7.2; gcc 4.4.7, 4.5.4 and 4.6.3 available.
Ubuntu 13.10: default gcc version 4.8.1; gcc 4.4.7, 4.6.4 and 4.7.3 available.
......@@ -16,6 +16,7 @@
shared
function
io
ops
mode
module
debugmode
......
==================================================
:mod:`ops` -- Some Common Ops and extra Ops stuff
==================================================
.. automodule:: theano.compile.ops
:members:
......@@ -216,7 +216,7 @@ import theano and print the config variable, as in:
Positive int value, default: 200000.
This specifies the minimum vector size for which elemwise ops
use openmp, if openmp is enabled.
.. attribute:: cast_policy
......
......@@ -607,6 +607,27 @@ dimensions, see :meth:`_tensor_py_operators.dimshuffle`.
have shape (2, 60).
.. function:: tile(x, reps, ndim=None)
Tiles its input according to `reps`. The length of `reps` is the
number of dimensions of `x`, and it contains the number of times to
tile `x` in each dimension.
:see: `numpy.tile
<http://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html>`_
documentation for examples.
:see: :func:`theano.tensor.extra_ops.repeat
<theano.tensor.extra_ops.repeat>`
:note: Currently, `reps` must be a constant, `x.ndim` and
`len(reps)` must be equal and, if specified, `ndim` must be
equal to both.
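A numpy-only sketch of the same semantics (no Theano required), since the documented behavior mirrors `numpy.tile` when ``len(reps) == x.ndim``:

```python
import numpy as np

x = np.array([[1, 2],
              [3, 4]])
# len(reps) == x.ndim: repeat twice along rows, once along columns
print(np.tile(x, (2, 1)))
# [[1 2]
#  [3 4]
#  [1 2]
#  [3 4]]
```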
Creating Tensor
===============
......@@ -1542,6 +1563,86 @@ Gradient / Differentiation
:rtype: variable or list of variables (matching `wrt`)
:returns: gradients of the cost with respect to each of the `wrt` terms
.. function:: subgraph_grad(wrt, end, start=None, cost=None, details=False)
With respect to `wrt`, computes gradients of cost and/or from existing
`start` gradients, up to the `end` variables of a symbolic digraph.
In other words, computes gradients for a subgraph of the
symbolic theano function. Ignores all disconnected inputs.
This can be useful when one needs to perform the gradient descent
iteratively (e.g. one layer at a time in an MLP), or when a particular
operation is not differentiable in theano (e.g. stochastic sampling
from a multinomial). In the latter case, the gradient of the
non-differentiable process could be approximated by a user-defined
formula, which could be calculated using the gradients of a cost
with respect to samples (0s and 1s). These gradients are obtained
by performing a subgraph_grad from the `cost` or previously known gradients
(`start`) up to the outputs of the stochastic process (`end`).
A dictionary mapping variables to gradients obtained from the
user-defined differentiation of the process could then be fed into
another subgraph_grad as `start` with any other `cost` (e.g. weight decay).
In an MLP, we could use subgraph_grad to iteratively backpropagate:
>>> x, t = theano.tensor.fvector('x'), theano.tensor.fvector('t')
>>> w1 = theano.shared(np.random.randn(3,4))
>>> w2 = theano.shared(np.random.randn(4,2))
>>> a1 = theano.tensor.tanh(theano.tensor.dot(x,w1))
>>> a2 = theano.tensor.tanh(theano.tensor.dot(a1,w2))
>>> cost2 = theano.tensor.sqr(a2 - t).sum()
>>> cost2 += theano.tensor.sqr(w2.sum())
>>> cost1 = theano.tensor.sqr(w1.sum())
>>> params = [[w2],[w1]]
>>> costs = [cost2,cost1]
>>> grad_ends = [[a1], [x]]
>>> next_grad = None
>>> param_grads = []
>>> for i in xrange(2):
...     param_grad, next_grad = theano.subgraph_grad(
...         wrt=params[i], end=grad_ends[i],
...         start=next_grad, cost=costs[i]
...     )
...     next_grad = dict(zip(grad_ends[i], next_grad))
...     param_grads.extend(param_grad)
:type wrt: list of Variables.
:param wrt: Gradients are computed with respect to `wrt`.

:type end: list of Variables.
:param end: Theano variables at which to end gradient descent
    (they are considered constant in theano.grad).
    For convenience, the gradients with respect to these variables
    are also returned.

:type start: dictionary of Variables.
:param start: If not None, a dictionary mapping variables to
their gradients. This is useful when the gradient on some
variables are known. These are used to compute the gradients
backwards up to the variables in `end`
(they are used as known_grad in theano.grad).
:type cost: Scalar (0-dimensional) Variable.
:param cost:
Additional costs for which to compute the gradients.
For example, these could be weight decay, an l1 constraint,
MSE, NLL, etc. May optionally be None if start is provided.
Warning: if the gradients of `cost` with respect to any
of the `start` variables are already part of the `start`
dictionary, then they may be counted twice with respect to `wrt`
and `end`.
:type details: bool.
:param details: When True, additionally returns the
list of gradients from `start` and of `cost`, respectively,
with respect to `wrt` (not `end`).
:rtype: Tuple of 2 or 4 Lists of Variables
:return: Returns lists of gradients with respect to `wrt` and `end`,
respectively.
.. _R_op_list:
......
......@@ -15,7 +15,7 @@ Scan
- ``sum()`` could be computed by scanning the *z + x(i)* function over a list, given an initial state of *z=0*.
- Often a *for* loop can be expressed as a ``scan()`` operation, and ``scan`` is the closest that Theano comes to looping.
- Advantages of using ``scan`` over *for* loops:
- The number of iterations can be part of the symbolic graph.
- Minimizes GPU transfers (if GPU is involved).
- Computes gradients through sequential steps.
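For intuition, the ``sum()`` example above can be written as an ordinary Python reduce: the scanned function is *z + x(i)* with initial state *z=0*:

```python
from functools import reduce

# scan the function z + x(i) over a list, starting from z = 0
xs = [1, 2, 3, 4]
total = reduce(lambda z, x: z + x, xs, 0)
print(total)  # 10
```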
......@@ -24,7 +24,246 @@ Scan
The full documentation can be found in the library: :ref:`Scan <lib_scan>`.
**Scan Example: Computing pow(A,k)**
**Scan Example: Computing tanh(x(t).dot(W) + b) elementwise**
.. code-block:: python
import theano
import theano.tensor as T
import numpy as np
# defining the tensor variables
X = T.matrix("X")
W = T.matrix("W")
b_sym = T.vector("b_sym")
results, updates = theano.scan(lambda v: T.tanh(T.dot(v, W) + b_sym), sequences=X)
compute_elementwise = theano.function(inputs=[X, W, b_sym], outputs=[results])
# test values
x = np.eye(2, dtype=theano.config.floatX)
w = np.ones((2, 2), dtype=theano.config.floatX)
b = np.ones((2), dtype=theano.config.floatX)
b[1] = 2
print compute_elementwise(x, w, b)[0]
# comparison with numpy
print np.tanh(x.dot(w) + b)
**Scan Example: Computing the sequence x(t) = tanh(x(t - 1).dot(W) + y(t).dot(U) + p(T - t).dot(V))**
.. code-block:: python
import theano
import theano.tensor as T
import numpy as np
# define tensor variables
X = T.vector("X")
W = T.matrix("W")
b_sym = T.vector("b_sym")
U = T.matrix("U")
Y = T.matrix("Y")
V = T.matrix("V")
P = T.matrix("P")
results, updates = theano.scan(lambda y, p, x_tm1: T.tanh(T.dot(x_tm1, W) + T.dot(y, U) + T.dot(p, V)),
sequences=[Y, P[::-1]], outputs_info=[X])
compute_seq = theano.function(inputs=[X, W, Y, U, P, V], outputs=[results])
# test values
x = np.zeros((2), dtype=theano.config.floatX)
x[1] = 1
w = np.ones((2, 2), dtype=theano.config.floatX)
y = np.ones((5, 2), dtype=theano.config.floatX)
y[0, :] = -3
u = np.ones((2, 2), dtype=theano.config.floatX)
p = np.ones((5, 2), dtype=theano.config.floatX)
p[0, :] = 3
v = np.ones((2, 2), dtype=theano.config.floatX)
print compute_seq(x, w, y, u, p, v)[0]
# comparison with numpy
x_res = np.zeros((5, 2), dtype=theano.config.floatX)
x_res[0] = np.tanh(x.dot(w) + y[0].dot(u) + p[4].dot(v))
for i in range(1, 5):
x_res[i] = np.tanh(x_res[i - 1].dot(w) + y[i].dot(u) + p[4-i].dot(v))
print x_res
**Scan Example: Computing norms of lines of X**
.. code-block:: python
import theano
import theano.tensor as T
import numpy as np
# define tensor variable
X = T.matrix("X")
results, updates = theano.scan(lambda x_i: T.sqrt((x_i ** 2).sum()), sequences=[X])
compute_norm_lines = theano.function(inputs=[X], outputs=[results])
# test value
x = np.diag(np.arange(1, 6, dtype=theano.config.floatX), 1)
print compute_norm_lines(x)[0]
# comparison with numpy
print np.sqrt((x ** 2).sum(1))
**Scan Example: Computing norms of columns of X**
.. code-block:: python
import theano
import theano.tensor as T
import numpy as np
# define tensor variable
X = T.matrix("X")
results, updates = theano.scan(lambda x_i: T.sqrt((x_i ** 2).sum()), sequences=[X.T])
compute_norm_cols = theano.function(inputs=[X], outputs=[results])
# test value
x = np.diag(np.arange(1, 6, dtype=theano.config.floatX), 1)
print compute_norm_cols(x)[0]
# comparison with numpy
print np.sqrt((x ** 2).sum(0))
**Scan Example: Computing trace of X**
.. code-block:: python
import theano
import theano.tensor as T
import numpy as np
floatX = "float32"
# define tensor variable
X = T.matrix("X")
results, updates = theano.scan(lambda i, j, t_f: T.cast(X[i, j] + t_f, floatX),
sequences=[T.arange(X.shape[0]), T.arange(X.shape[1])],
outputs_info=np.asarray(0., dtype=floatX))
result = results[-1]
compute_trace = theano.function(inputs=[X], outputs=[result])
# test value
x = np.eye(5, dtype=theano.config.floatX)
x[0] = np.arange(5, dtype=theano.config.floatX)
print compute_trace(x)[0]
# comparison with numpy
print np.diagonal(x).sum()
**Scan Example: Computing the sequence x(t) = x(t - 2).dot(U) + x(t - 1).dot(V) + tanh(x(t - 1).dot(W) + b)**
.. code-block:: python
import theano
import theano.tensor as T
import numpy as np
# define tensor variables
X = T.matrix("X")
W = T.matrix("W")
b_sym = T.vector("b_sym")
U = T.matrix("U")
V = T.matrix("V")
n_sym = T.iscalar("n_sym")
results, updates = theano.scan(lambda x_tm2, x_tm1: T.dot(x_tm2, U) + T.dot(x_tm1, V) + T.tanh(T.dot(x_tm1, W) + b_sym),
n_steps=n_sym, outputs_info=[dict(initial=X, taps=[-2, -1])])
compute_seq2 = theano.function(inputs=[X, U, V, W, b_sym, n_sym], outputs=[results])
# test values
x = np.zeros((2, 2), dtype=theano.config.floatX) # the initial value must be able to return x[-2]
x[1, 1] = 1
w = 0.5 * np.ones((2, 2), dtype=theano.config.floatX)
u = 0.5 * (np.ones((2, 2), dtype=theano.config.floatX) - np.eye(2, dtype=theano.config.floatX))
v = 0.5 * np.ones((2, 2), dtype=theano.config.floatX)
n = 10
b = np.ones((2), dtype=theano.config.floatX)
print compute_seq2(x, u, v, w, b, n)
# comparison with numpy
x_res = np.zeros((10, 2))
x_res[0] = x[0].dot(u) + x[1].dot(v) + np.tanh(x[1].dot(w) + b)
x_res[1] = x[1].dot(u) + x_res[0].dot(v) + np.tanh(x_res[0].dot(w) + b)
x_res[2] = x_res[0].dot(u) + x_res[1].dot(v) + np.tanh(x_res[1].dot(w) + b)
for i in range(2, 10):
x_res[i] = (x_res[i - 2].dot(u) + x_res[i - 1].dot(v) +
np.tanh(x_res[i - 1].dot(w) + b))
print x_res
**Scan Example: Computing the Jacobian of y = tanh(v.dot(A)) wrt x**
.. code-block:: python
import theano
import theano.tensor as T
import numpy as np
# define tensor variables
v = T.vector()
A = T.matrix()
y = T.tanh(T.dot(v, A))
results, updates = theano.scan(lambda i: T.grad(y[i], v), sequences=[T.arange(y.shape[0])])
compute_jac_t = theano.function([A, v], [results], allow_input_downcast=True) # shape (d_out, d_in)
# test values
x = np.eye(5, dtype=theano.config.floatX)[0]
w = np.eye(5, 3, dtype=theano.config.floatX)
w[2] = np.ones((3), dtype=theano.config.floatX)
print compute_jac_t(w, x)[0]
# compare with numpy
print ((1 - np.tanh(x.dot(w)) ** 2) * w).T
Note that we need to iterate over the indices of ``y``, not over its elements. The reason is that scan creates a placeholder variable for its internal function, and this placeholder variable does not have the same dependencies as the variables that will replace it.
**Scan Example: Accumulating the number of loop iterations during a scan**
.. code-block:: python
import theano
import theano.tensor as T
import numpy as np
# define shared variables
k = theano.shared(0)
n_sym = T.iscalar("n_sym")
results, updates = theano.scan(lambda: {k: (k + 1)}, n_steps=n_sym)
accumulator = theano.function([n_sym], [], updates=updates, allow_input_downcast=True)
k.get_value()
accumulator(5)
k.get_value()
**Scan Example: Computing tanh(v.dot(W) + b) * d where d is binomial**
.. code-block:: python
import theano
import theano.tensor as T
import numpy as np
# define tensor variables
X = T.matrix("X")
W = T.matrix("W")
b_sym = T.vector("b_sym")
# define shared random stream
trng = T.shared_randomstreams.RandomStreams(1234)
d = trng.binomial(size=W[1].shape)
results, updates = theano.scan(lambda v: T.tanh(T.dot(v, W) + b_sym) * d, sequences=X)
compute_with_bnoise = theano.function(inputs=[X, W, b_sym], outputs=[results],
updates=updates, allow_input_downcast=True)
x = np.eye(10, 2, dtype=theano.config.floatX)
w = np.ones((2, 2), dtype=theano.config.floatX)
b = np.ones((2), dtype=theano.config.floatX)
print compute_with_bnoise(x, w, b)
Note that if you want to use a random variable ``d`` that will not be updated through scan loops, you should pass this variable as a ``non_sequences`` argument.
**Scan Example: Computing pow(A, k)**
.. code-block:: python
......@@ -46,11 +285,11 @@ The full documentation can be found in the library: :ref:`Scan <lib_scan>`.
# Scan has provided us with A ** 1 through A ** k. Keep only the last
# value. Scan notices this and does not waste memory saving them.
final_result = result[-1]
power = theano.function(inputs=[A, k], outputs=final_result,
updates=updates)
print power(range(10), 2)
#[ 0. 1. 4. 9. 16. 25. 36. 49. 64. 81.]
......
......@@ -17,8 +17,8 @@ those operations will run in parallel in Theano.
The most frequent way to control the number of threads used is via the
``OMP_NUM_THREADS`` environment variable. Set it to the number of
threads you want to use before starting the Python process. Some BLAS
implementations support other environment variables.
Parallel element wise ops with OpenMP
......@@ -35,9 +35,9 @@ tensor size for which the operation is parallelized because for short
tensors using OpenMP can slow down the operation. The default value is
``200000``.
For simple (fast) operations you can obtain a speed-up with very large
tensors while for more complex operations you can obtain a good speed-up
also for smaller tensors.
There is a script ``elemwise_openmp_speedup.py`` in ``theano/misc/``
which you can use to tune the value of ``openmp_elemwise_minsize`` for
......@@ -47,4 +47,4 @@ without OpenMP and shows the time difference between the cases.
The only way to control the number of threads used is via the
``OMP_NUM_THREADS`` environment variable. Set it to the number of threads
you want to use before starting the Python process.
......@@ -61,7 +61,7 @@ from theano.compile import \
Component, External, Member, Method, \
Composite, ComponentList, ComponentDict, Module, \
ProfileMode, ProfileStats, \
Param, shared, as_op
from theano.misc.safe_asarray import _asarray
......@@ -79,7 +79,7 @@ from theano.updates import Updates, OrderedUpdates
#we don't import by default as we don't want to force having scipy installed.
#import sparse
from theano.gradient import Rop, Lop, grad, subgraph_grad
if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
import theano.sandbox.cuda
......
......@@ -2,7 +2,8 @@ from theano.compile.ops import (
DeepCopyOp, deep_copy_op, register_deep_copy_op_c_code,
Shape, shape, register_shape_c_code,
Shape_i, register_shape_i_c_code,
ViewOp, view_op, register_view_op_c_code, FromFunctionOp,
as_op)
from theano.compile.function_module import *
......
......@@ -1077,6 +1077,7 @@ class FunctionMaker(object):
self.mode = mode
self.accept_inplace = accept_inplace
self.function_builder = function_builder
self.on_unused_input = on_unused_input  # Used only for pickling
self.required = [(i.value is None) for i in self.inputs]
self.refeed = [
......@@ -1215,6 +1216,7 @@ def _pickle_FunctionMaker(self):
accept_inplace=self.accept_inplace,
function_builder=self.function_builder,
profile=self.profile,
on_unused_input=self.on_unused_input,
)
return (_constructor_FunctionMaker, (kwargs,))
......
......@@ -189,9 +189,6 @@ optdb.register('Print1.51', PrintCurrentFunctionGraph('Post-stabilize'),
optdb.register('specialize', gof.EquilibriumDB(),
2, 'fast_run')
optdb.register('Print2.01', PrintCurrentFunctionGraph('Post-specialize'),
2.01,) # 'fast_run', 'fast_compile')
# misc special cases for speed that break canonicalization
optdb.register('uncanonicalize', gof.EquilibriumDB(),
3, 'fast_run')
......
"""This file contains auxiliary Ops used during the compilation phase,
as well as an Op-building class (:class:`FromFunctionOp`) and a
decorator (:func:`as_op`) that help create new Ops more rapidly.
"""
import copy
import warnings
......@@ -364,3 +368,96 @@ def register_shape_i_c_code(typ, code, version=()):
# List of Theano Types to which one can add an extra dimension and
# which Scan can deal with.
expandable_types = ()
class FromFunctionOp(gof.Op):
"""
Build a basic Theano Op around a function.
Since the resulting Op is very basic and is missing most of the
optional functionalities, some optimizations may not apply. If you
want to help, you can supply an infer_shape function that computes
the shapes of the output given the shapes of the inputs.
Also the gradient is undefined in the resulting op and Theano will
raise an error if you attempt to get the gradient of a graph
containing this op.
"""
def __init__(self, fn, itypes, otypes, infer_shape):
self.__fn = fn
self.itypes = itypes
self.otypes = otypes
self.__infer_shape = infer_shape
if self.__infer_shape is not None:
self.infer_shape = self._infer_shape
def __eq__(self, other):
return (type(self) == type(other) and
self.__fn == other.__fn)
def __hash__(self):
return hash(type(self)) ^ hash(self.__fn)
def __str__(self):
return 'FromFunctionOp{%s}' % self.__fn.__name__
def make_node(self, *inputs):
assert len(inputs) == len(self.itypes)
assert all(inp.type == it for inp, it in zip(inputs, self.itypes))
return theano.Apply(self, inputs, [o() for o in self.otypes])
def perform(self, node, inputs, outputs):
outs = self.__fn(*inputs)
if not isinstance(outs, (list, tuple)):
outs = (outs,)
assert len(outs) == len(outputs)
for i in range(len(outs)):
outputs[i][0] = outs[i]
def _infer_shape(self, node, input_shapes):
return self.__infer_shape(node, input_shapes)
def as_op(itypes, otypes, infer_shape=None):
"""
Decorator that converts a function into a basic Theano op that
will call the supplied function as its implementation.
It takes an optional infer_shape parameter that should be a
callable with this signature:
def infer_shape(node, input_shapes):
...
return output_shapes
Here `input_shapes` and `output_shapes` are lists of tuples that
represent the shape of the corresponding inputs/outputs.
This should not be used when performance is a concern since the
very basic nature of the resulting Op may interfere with certain
graph optimizations.
Example usage:
@as_op(itypes=[theano.tensor.fmatrix, theano.tensor.fmatrix],
otypes=[theano.tensor.fmatrix])
def numpy_dot(a, b):
return numpy.dot(a, b)
"""
if not isinstance(itypes, (list, tuple)):
itypes = [itypes]
if any(not isinstance(t, theano.Type) for t in itypes):
raise TypeError("itypes has to be a list of Theano types")
if not isinstance(otypes, (list, tuple)):
otypes = [otypes]
if any(not isinstance(t, theano.Type) for t in otypes):
raise TypeError("otypes has to be a list of Theano types")
# make sure they are lists and not tuples
itypes = list(itypes)
otypes = list(otypes)
if infer_shape is not None and not callable(infer_shape):
raise TypeError("infer_shape needs to be a callable")
def make_op(fn):
return FromFunctionOp(fn, itypes, otypes, infer_shape)
return make_op
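The decorator-with-arguments pattern used by ``as_op`` can be seen in isolation with a pure-Python sketch (no Theano; attaching metadata to the function stands in for building a FromFunctionOp):

```python
# Hypothetical simplified analogue of as_op: the outer call normalizes
# the declared types and returns the actual decorator.
def as_op_sketch(itypes, otypes):
    if not isinstance(itypes, (list, tuple)):
        itypes = [itypes]
    if not isinstance(otypes, (list, tuple)):
        otypes = [otypes]
    def make_op(fn):
        fn.itypes = list(itypes)  # attach metadata instead of building an Op
        fn.otypes = list(otypes)
        return fn
    return make_op

@as_op_sketch('dmatrix', 'dvector')
def diag(x):
    return x  # placeholder body

print(diag.itypes, diag.otypes)  # ['dmatrix'] ['dvector']
```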
......@@ -507,13 +507,22 @@ class ProfileStats(object):
print >> file, header_str
topos = {} # Only do the topo once per fct.
atimes = []
for a, t in self.apply_time.items():
if a.fgraph not in topos:
topo = a.fgraph.toposort()
topos[a.fgraph] = topo
else:
topo = topos[a.fgraph]
atimes.append((
t * 100 / local_time,
t,
a,
topo.index(a),
self.apply_callcount[a]))
del topos
atimes.sort()
atimes.reverse()
tot = 0
......
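The per-graph caching introduced above can be sketched in isolation: compute an expensive per-graph ordering once (cf. ``topos``) and reuse it for every apply node of that graph (hypothetical data, strings standing in for graphs and nodes):

```python
def index_nodes(nodes, expensive_order):
    """Compute an expensive ordering once per graph, reuse it per node."""
    orders = {}   # one ordering per graph, cf. `topos` above
    indexed = []
    for graph, node in nodes:
        if graph not in orders:
            orders[graph] = expensive_order(graph)
        indexed.append(orders[graph].index(node))
    return indexed

calls = []
def order(graph):
    calls.append(graph)       # record each (expensive) ordering call
    return sorted(graph)

result = index_nodes([("ba", "a"), ("ba", "b"), ("ba", "b")], order)
print(result)      # [0, 1, 1]
print(len(calls))  # 1 -- the ordering was computed only once
```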
"""
Tests for the Op decorator
"""
import numpy as np
from theano.tests import unittest_tools as utt
from theano import function
import theano
from theano import tensor
from theano.tensor import dmatrix, dvector
from numpy import allclose
from theano.compile import as_op
class OpDecoratorTests(utt.InferShapeTester):
def test_1arg(self):
x = dmatrix('x')
@as_op(dmatrix, dvector)
def diag(x):
return np.diag(x)
fn = function([x], diag(x))
r = fn([[1.5, 5], [2, 2]])
r0 = np.array([1.5, 2])
assert allclose(r, r0), (r, r0)
def test_2arg(self):
x = dmatrix('x')
x.tag.test_value = np.zeros((2, 2))
y = dvector('y')
y.tag.test_value = [0, 0]
@as_op([dmatrix, dvector], dvector)
def diag_mult(x, y):
return np.diag(x) * y
fn = function([x, y], diag_mult(x, y))
r = fn([[1.5, 5], [2, 2]], [1, 100])
r0 = np.array([1.5, 200])
assert allclose(r, r0), (r, r0)
def test_infer_shape(self):
x = dmatrix('x')
x.tag.test_value = np.zeros((2, 2))
y = dvector('y')
y.tag.test_value = [0, 0]
def infer_shape(node, shapes):
x, y = shapes
return [y]
@as_op([dmatrix, dvector], dvector, infer_shape)
def diag_mult(x, y):
return np.diag(x) * y
self._compile_and_check([x, y], [diag_mult(x, y)],
[[[1.5, 5], [2, 2]], [1, 100]],
diag_mult.__class__, warn=False)
......@@ -117,19 +117,10 @@ AddConfigVar('mode',
enum = EnumStr("g++", "")
# Test whether or not g++ is present: disable C code if it is not.
try:
rc = call_subprocess_Popen(['g++', '-v'])
except OSError:
rc = 1
if rc == 0:
# Keep the default linker the same as the one for the mode FAST_RUN
AddConfigVar('linker',
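The simplified detection above boils down to: try to invoke the compiler and treat OSError as absence. A standalone sketch of that pattern (using `sys.executable` so it runs anywhere; the command names are illustrative):

```python
import subprocess
import sys

def command_available(cmd):
    """Return True if `cmd` runs and exits with status 0,
    False if it fails or cannot be found (OSError)."""
    try:
        return subprocess.call(cmd, stdout=subprocess.DEVNULL,
                               stderr=subprocess.DEVNULL) == 0
    except OSError:
        return False

print(command_available([sys.executable, '--version']))   # True
print(command_available(['no-such-compiler-hopefully']))  # False
```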
......@@ -426,8 +417,8 @@ AddConfigVar('compute_test_value_opt',
in_c_key=False)
AddConfigVar('unpickle_function',
("Replace unpickled Theano functions with None. "
"This is useful to unpickle old graphs that pickled"
" them when it shouldn't"),
BoolParam(True),
in_c_key=False)
......@@ -492,9 +483,9 @@ AddConfigVar('openmp',
)
AddConfigVar('openmp_elemwise_minsize',
"If OpenMP is enabled, this is the minimum size of vectors "
"for which the openmp parallelization is enabled "
"in element wise ops.",
IntParam(200000),
in_c_key=False,
)
......@@ -57,7 +57,10 @@ from theano.gof.link import \
from theano.gof.op import \
Op, OpenMPOp, PureOp, ops_with_inner_function
from theano.gof.opt import (
Optimizer,
optimizer, inplace_optimizer,
SeqOptimizer,
MergeOptimizer, MergeOptMerge,
LocalOptimizer, local_optimizer, LocalOptGroup,
OpSub, OpRemove, PatternSub,
......
......@@ -29,7 +29,8 @@ from theano.compat.six import b, BytesIO, StringIO
from theano.gof.utils import flatten
from theano.configparser import config
from theano.gof.cc import hash_from_code
from theano.misc.windows import (subprocess_Popen, call_subprocess_Popen,
output_subprocess_Popen)
# we will abuse the lockfile mechanism when reading and writing the registry
from theano.gof import compilelock
......@@ -1438,8 +1439,12 @@ def get_gcc_shared_library_arg():
def std_include_dirs():
numpy_inc_dirs = numpy.distutils.misc_util.get_numpy_include_dirs()
py_inc = distutils.sysconfig.get_python_inc()
py_plat_spec_inc = distutils.sysconfig.get_python_inc(plat_specific=True)
python_inc_dirs = ([py_inc] if py_inc == py_plat_spec_inc
else [py_inc, py_plat_spec_inc])
return numpy_inc_dirs + python_inc_dirs
def std_lib_dirs_and_libs():
......@@ -1512,11 +1517,8 @@ def gcc_llvm():
pass
p = None
try:
p_out = output_subprocess_Popen(['g++', '--version'])
output = p_out[0] + p_out[1]
except OSError:
# Typically means g++ cannot be found.
# So it is not an llvm compiler.
......@@ -1569,11 +1571,11 @@ class GCC_compiler(object):
GCC_compiler.march_flags = []
def get_lines(cmd, parse=True):
p = subprocess_Popen(cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
stdin=subprocess.PIPE,
shell=True)
# For mingw64 with GCC >= 4.7, passing os.devnull
# as stdin (which is the default) results in the process
# waiting forever without returning. For that reason,
......@@ -1713,7 +1715,7 @@ class GCC_compiler(object):
continue
mj, mn, patch = [int(vp) for vp in version]
if (((mj, mn) == (4, 6) and patch < 4) or
((mj, mn) == (4, 7) and patch <= 3) or
((mj, mn) == (4, 8) and patch < 1)):
new_flags[i] = p.rstrip('-avx')
......@@ -1806,26 +1808,20 @@ class GCC_compiler(object):
# Python3 compatibility: try to cast Py3 strings as Py2 strings
try:
src_code = b(src_code)
except Exception:
pass
os.write(fd, src_code)
os.close(fd)
fd = None
p_ret = call_subprocess_Popen(
['g++', path, '-o', exe_path] + flags)
if p_ret != 0:
compilation_ok = False
elif try_run:
# Try to execute the program
try:
p_ret = call_subprocess_Popen([exe_path])
run_ok = (p_ret == 0)
finally:
os.remove(exe_path)
finally:
......@@ -1958,14 +1954,14 @@ class GCC_compiler(object):
print >> sys.stderr, ' '.join(cmd)
try:
p_out = output_subprocess_Popen(cmd)
compile_stderr = decode(p_out[1])
except Exception:
# An exception can occur e.g. if `g++` is not found.
print_command_line_error()
raise
status = p_out[2]
if status:
print '==============================='
......
......@@ -16,27 +16,17 @@ import numpy
import theano
from theano.configparser import config, AddConfigVar, ConfigParam, StrParam
from theano.gof.utils import flatten
from theano.misc.windows import output_subprocess_Popen
_logger = logging.getLogger("theano.gof.compiledir")
try:
p_out = output_subprocess_Popen(['g++', '-dumpversion'])
gcc_version_str = p_out[0].strip().decode()
except OSError:
# Typically means gcc cannot be found.
gcc_version_str = 'GCC_NOT_FOUND'
def local_bitwidth():
......
......@@ -165,8 +165,12 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
my_pid = os.getpid()
no_display = (verbosity == 0)
nb_error = 0
# The number of times we have slept when there were no errors.
# Used to skip the message the first time so it is displayed less
# frequently, and so we don't get as much email about it!
nb_wait = 0
# Acquire lock.
while True:
try:
last_owner = 'no_owner'
......@@ -214,7 +218,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
last_owner = read_owner
time_start = time.time()
no_display = (verbosity == 0)
if not no_display:
if not no_display and nb_wait > 0:
if read_owner == 'failure':
msg = 'unknown process'
else:
......@@ -225,6 +229,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
tmp_dir)
if verbosity <= 1:
no_display = True
nb_wait += 1
time.sleep(random.uniform(min_wait, max_wait))
try:
......
(diff collapsed)
......@@ -179,23 +179,33 @@ class Query(object):
class EquilibriumDB(DB):
""" A set of potential optimizations which should be applied in an
"""A set of potential optimizations which should be applied in an
arbitrary order until equilibrium is reached.
Canonicalize, Stabilize, and Specialize are all equilibrium optimizations.
:param ignore_newtrees: If False, we will also apply local
optimizations to new nodes introduced during local optimization
application. This could result in fewer fgraph iterations, but
that does not mean it will be faster globally.
.. note::
We can put both LocalOptimizer and Optimizer here, as
EquilibriumOptimizer supports both.
"""
def __init__(self, ignore_newtrees=True):
super(EquilibriumDB, self).__init__()
self.ignore_newtrees = ignore_newtrees
def query(self, *tags, **kwtags):
opts = super(EquilibriumDB, self).query(*tags, **kwtags)
return opt.EquilibriumOptimizer(opts,
max_use_ratio=config.optdb.max_use_ratio,
failure_callback=opt.NavigatorOptimizer.warn_inplace)
return opt.EquilibriumOptimizer(
opts,
max_use_ratio=config.optdb.max_use_ratio,
ignore_newtrees=self.ignore_newtrees,
failure_callback=opt.NavigatorOptimizer.warn_inplace)
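The fixed-point idea behind EquilibriumOptimizer can be sketched without Theano. Below is a toy rewriter on strings (`equilibrium` and `double_neg` are hypothetical stand-ins, not Theano APIs): rules are applied repeatedly until a full pass changes nothing.

```python
def equilibrium(graph, rules, max_passes=100):
    # Apply every rule repeatedly until a full pass changes nothing.
    for _ in range(max_passes):
        changed = False
        for rule in rules:
            new = rule(graph)
            if new != graph:
                graph, changed = new, True
        if not changed:
            break
    return graph

double_neg = lambda g: g.replace("--", "")  # rewrite rule: --x -> x
print(equilibrium("----x", [double_neg]))   # prints "x"
```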
class SequenceDB(DB):
......@@ -238,6 +248,11 @@ class SequenceDB(DB):
position_cutoff = tags[0].position_cutoff
opts = [o for o in opts if self.__position__[o.name] < position_cutoff]
# We want to sort by position and, in case of a tie, by name,
# for deterministic optimization. Since Python 2.2, sort is
# stable, so we sort by name first, then by position. This gives
# the order we want.
opts.sort(key=lambda obj: obj.name)
opts.sort(key=lambda obj: self.__position__[obj.name])
ret = opt.SeqOptimizer(opts, failure_callback=self.failure_callback)
if hasattr(tags[0], 'name'):
......
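The double-sort trick above relies on Python's stable sort; a minimal sketch with (name, position) pairs:

```python
# Sorting by name first, then by position: entries tied on position
# keep alphabetical order, giving a deterministic overall order.
opts = [("b", 1), ("a", 1), ("c", 0)]
opts.sort(key=lambda o: o[0])  # by name
opts.sort(key=lambda o: o[1])  # by position (stable: ties keep name order)
print(opts)  # [('c', 0), ('a', 1), ('b', 1)]
```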
......@@ -544,6 +544,109 @@ def grad(cost, wrt, consider_constant=None,
rval, = rval
return rval
def subgraph_grad(wrt, end, start=None, cost=None, details=False):
'''
With respect to `wrt`, computes gradients of cost and/or from existing
`start` gradients, up to the `end` variables of a symbolic digraph.
In other words, computes gradients for a subgraph of the
symbolic theano function. Ignores all disconnected inputs.
This can be useful when one needs to perform the gradient descent
iteratively (e.g. one layer at a time in an MLP), or when a particular
operation is not differentiable in theano (e.g. stochastic sampling
from a multinomial). In the latter case, the gradient of the
non-differentiable process could be approximated by user-defined
formula, which could be calculated using the gradients of a cost
with respect to samples (0s and 1s). These gradients are obtained
by performing a subgraph_grad from the `cost` or previously known gradients
(`start`) up to the outputs of the stochastic process (`end`).
A dictionary mapping gradients obtained from the user-defined
differentiation of the process, to variables, could then be fed into
another subgraph_grad as `start` with any other `cost` (e.g. weight decay).
:type wrt: List of Variables.
:param wrt: Gradients are computed with respect to `wrt`.
:type end: List of Variables.
:param end: Theano variables at which to stop the gradient
computation (they are considered constant in theano.grad).
For convenience, the gradients with respect to these variables
are also returned.
:type start: Dictionary of Variables.
:param start: If not None, a dictionary mapping variables to
their gradients. This is useful when the gradient on some
variables are known. These are used to compute the gradients
backwards up to the variables in `end`
(they are used as known_grad in theano.grad).
:type cost: Scalar (0-dimensional) Variable.
:param cost:
Additional costs for which to compute the gradients.
For example, these could be weight decay, an l1 constraint,
MSE, NLL, etc. May optionally be None if start is provided.
Warning: If the gradient of `cost` with respect to any
of the `start` variables is already part of the `start`
dictionary, then it may be counted twice with respect to `wrt`
and `end`.
:type details: bool.
:param details: When True, additionally returns the
list of gradients from `start` and of `cost`, respectively,
with respect to `wrt` (not `end`).
:rtype: Tuple of 2 or 4 Lists of Variables
:return: Returns lists of gradients with respect to `wrt` and `end`,
respectively.
'''
assert ((cost is not None) or (start is not None))
assert isinstance(end, list)
assert isinstance(wrt, list)
if start is not None:
assert isinstance(start, dict)
params = list(set(wrt + end))
start_grads = None
cost_grads = None
if start is not None:
start_grads = list(
theano.grad(
cost=None, wrt=params, known_grads=start,
consider_constant=end,
disconnected_inputs='ignore'
)
)
if cost is not None:
cost_grads = list(
theano.grad(
cost=cost, wrt=params,
consider_constant=end,
disconnected_inputs='ignore'
)
)
grads = None
if start is None:
grads = cost_grads
else:
grads = start_grads
if cost_grads is not None:
for i in range(len(grads)):
grads[i] += cost_grads[i]
pgrads = OrderedDict(zip(params, grads))
# separate wrt from end grads:
wrt_grads = list(pgrads[k] for k in wrt)
end_grads = list(pgrads[k] for k in end)
if details:
return wrt_grads, end_grads, start_grads, cost_grads
return wrt_grads, end_grads
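The chain-rule bookkeeping that subgraph_grad automates can be illustrated on a one-variable toy in plain Python (no Theano; `f` and `df_dw` are made-up stand-ins for a subgraph and its local derivative):

```python
def f(w):
    # Toy subgraph: end = w ** 2
    return w * w

def df_dw(w):
    # Its local derivative d(end)/dw
    return 2.0 * w

w = 3.0
start = {"end": 5.0}                # known gradient d(cost)/d(end)
wrt_grad = start["end"] * df_dw(w)  # chain rule back through the subgraph
print(wrt_grad)                     # 30.0
```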
def _node_to_pattern(node):
""" given an apply node, obtain its connection pattern
......
......@@ -203,6 +203,7 @@ if __name__ == "__main__":
cuda version 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu
K6000/NOECC 0.06s
K20m/ECC 0.07s
K20/NOECC 0.07s
M2090 0.19s
......
......@@ -9,7 +9,7 @@ parser = OptionParser(usage='%prog <options>\n Compute time for'
' fast and slow elemwise operations')
parser.add_option('-N', '--N', action='store', dest='N',
default=theano.config.openmp_elemwise_minsize, type="int",
help="Number of vector element")
help="Number of vector elements")
def runScript(N):
......
......@@ -11,7 +11,7 @@ parser = OptionParser(usage='%prog <options>\n Compute time for'
' fast and slow elemwise operations')
parser.add_option('-N', '--N', action='store', dest='N',
default=theano.config.openmp_elemwise_minsize, type="int",
help="Number of vector element")
help="Number of vector elements")
parser.add_option('--script', action='store_true', dest='script',
default=False,
help="Run program as script and print results on stdoutput")
......
......@@ -2,9 +2,11 @@ import os
import subprocess
def call_subprocess_Popen(command, **params):
def subprocess_Popen(command, **params):
"""
Utility function to work around windows behavior that open windows
Utility function to work around the Windows behavior of opening console windows.
:see: call_subprocess_Popen and output_subprocess_Popen
"""
startupinfo = None
if os.name == 'nt':
......@@ -36,3 +38,40 @@ def call_subprocess_Popen(command, **params):
if stdin is not None:
del stdin
return proc
def call_subprocess_Popen(command, **params):
"""
Calls subprocess_Popen and discards the output, returning only the
exit code.
"""
if 'stdout' in params or 'stderr' in params:
raise TypeError("don't use stderr or stdout with call_subprocess_Popen")
null = open(os.devnull, 'wb')
# stdin to devnull is a workaround for a crash in a weird Windows
# environment where sys.stdin was None
params.setdefault('stdin', null)
params['stdout'] = null
params['stderr'] = null
p = subprocess_Popen(command, **params)
p.wait()
return p.returncode
def output_subprocess_Popen(command, **params):
"""
Calls subprocess_Popen, returning the output, error and exit code
in a tuple.
"""
if 'stdout' in params or 'stderr' in params:
raise TypeError("don't use stderr or stdout with output_subprocess_Popen")
# stdin to devnull is a workaround for a crash in a weird Windows
# environment where sys.stdin was None
if 'stdin' not in params:
null = open(os.devnull, 'wb')
params['stdin'] = null
params['stdout'] = subprocess.PIPE
params['stderr'] = subprocess.PIPE
p = subprocess_Popen(command, **params)
# we need to use communicate to make sure we don't deadlock around
# the stdout/stderr pipes.
out = p.communicate()
return out + (p.returncode,)
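A self-contained sketch of this helper using only the stdlib (`output_popen` is a stand-in name, without the Windows startupinfo handling of `subprocess_Popen`):

```python
import subprocess
import sys

def output_popen(command, **params):
    # Capture stdout/stderr and return (out, err, returncode); using
    # communicate() avoids deadlocking on full stdout/stderr pipes.
    params["stdout"] = subprocess.PIPE
    params["stderr"] = subprocess.PIPE
    p = subprocess.Popen(command, **params)
    out, err = p.communicate()
    return out, err, p.returncode

out, err, code = output_popen([sys.executable, "-c", "print('ok')"])
```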
......@@ -296,38 +296,15 @@ class GpuDimShuffle(GpuOp):
def __init__(self, input_broadcastable, new_order):
input_broadcastable = tuple(input_broadcastable)
self.input_broadcastable = input_broadcastable
new_order = tuple(new_order)
self.new_order = new_order
# list of dimensions of the input to drop
self.drop = []
# this maps i before dropping dimensions to j after dropping
# dimensions so self.shuffle can be set properly later on
i2j = {}
j = 0
for i, b in enumerate(input_broadcastable):
if i not in new_order:
# we want to drop this dimension because it's not a
# value in new_order
if b == 1: # 1 aka True
self.drop.append(i)
else:
if not b:
# we cannot drop non-broadcastable dimensions
raise ValueError("You cannot drop a non-broadcastable"
" dimension.",
(input_broadcastable, new_order))
else:
i2j[i] = j
j += 1
# transposition of non-broadcastable dimensions. This is how
# the dimensions will be permuted, without accounting for the
# extra 'x' broadcastable dimensions to insert.
self.shuffle = [i2j[x] for x in new_order if x != 'x']
# list of dimensions of the output that are broadcastable and
# were not in the original input
self.augment = [i for i, x in enumerate(new_order) if x == 'x']
self.view_map = {0: [0]}
......@@ -481,8 +458,6 @@ class GpuDimShuffle(GpuOp):
print self
print "IN BROAD", self.input_broadcastable
print "NEW ORDER", self.new_order
print "SHUFFLE", self.shuffle
print "AUGMENT", self.augment
print '------------'
print ''
print sio.getvalue()
......@@ -1198,7 +1173,11 @@ class GpuCAReduce(GpuOp):
n_threads.z += 1;
else
break;
}""" % locals()
}
//Maximum for Fermi GPUs on that dimension.
n_threads.z = std::min(n_threads.z, (unsigned)64);
""" % locals()
if len(self.reduce_mask) == 2:
threads_y = ''
......@@ -1509,6 +1488,8 @@ class GpuCAReduce(GpuOp):
n_threads.z += 1;
}
n_threads.z -= 1;
//Maximum for Fermi GPUs on that dimension.
n_threads.z = std::min(n_threads.z, (unsigned)64);
dim3 n_blocks(1,1,1);
%(makecall)s
......@@ -1605,7 +1586,7 @@ class GpuCAReduce(GpuOp):
""" % locals()
def c_code_cache_version_apply(self, node):
version = [8] # the version corresponding to the c code in this Op
version = [9] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
......@@ -3192,13 +3173,27 @@ class GpuAlloc(GpuOp):
# If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold.
return False
elif (not isinstance(client[0], basestring)
and isinstance(client[0].op, (
tensor.IncSubtensor,
tensor.AdvancedIncSubtensor1,
GpuIncSubtensor,
GpuAdvancedIncSubtensor1
))):
elif (#The following ops work in place on their input id 0.
client[1] == 0 and
isinstance(client[0].op, (
#Ops that would work in place on the Alloc. So if they
#get constant_folded, they would copy the
#constant, and that is less efficient.
#Not doing the constant folding can also lower
#the peak memory usage, as the "constant" won't
#always exist.
#theano.tensor.subtensor.AdvancedIncSubtensor,
GpuIncSubtensor,
GpuAdvancedIncSubtensor1,
theano.sandbox.cuda.blas.GpuGemm,
theano.sandbox.cuda.blas.GpuGemv,
theano.sandbox.cuda.blas.GpuGer,
))):
return False
#If the client is a transfer, we don't want to fold. We
#let the moving opt finish before deciding what to do.
elif isinstance(client[0].op, HostFromGpu):
return False
return True
......
......@@ -26,6 +26,21 @@
//if you want this to work.
#define PRECHECK_ERROR 0
//If true, we release the GIL around blocking GPU calls, to allow other Python
//threads to run in the meantime. For a single-threaded program, the overhead
//is negligible (about 20ms for 1 million GIL release/reclaim cycles). Can
//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
#ifndef RELEASE_GIL
#define RELEASE_GIL 1
#endif
#if RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
/////////////////////////
// Alloc and Free
/////////////////////////
......@@ -200,7 +215,9 @@ int device_free(void *ptr)
// We need to sync, as Theano's GC could remove intermediate variables
// that are still needed while the GPU kernels are running or queued.
CNDA_BEGIN_ALLOW_THREADS
cudaThreadSynchronize();
CNDA_END_ALLOW_THREADS
cudaError_t err = cudaFree(ptr);
if (cudaSuccess != err)
......@@ -518,10 +535,14 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
assert (PyArray_ITEMSIZE(rval) == sizeof(real));
cublasGetVector(PyArray_SIZE(rval), sizeof(real),
npy_intp rval_size = PyArray_SIZE(rval);
void *rval_data = PyArray_DATA(rval);
CNDA_BEGIN_ALLOW_THREADS
cublasGetVector(rval_size, sizeof(real),
contiguous_self->devdata, 1,
PyArray_DATA(rval), 1);
CNDA_THREAD_SYNC;
rval_data, 1);
//CNDA_THREAD_SYNC; // unneeded because cublasGetVector is blocking anyway
CNDA_END_ALLOW_THREADS
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
......@@ -894,12 +915,14 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
npy_int64 idx = indices[i0];
if (idx<0)
idx += dB0; // To allow negative indexing.
if ((idx < 0) || (idx >= dB0))
if ((idx < 0) || (idx >= dB0)){
// Any value other than 0 would probably work. But to be safer, I want
// to change all bits, to prevent problems with concurrent writes that
// could cross a cache line. This should not happen with the
// current code and driver.
*err = 0xFFFF;
continue;
}
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x){
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y){
int a_idx = i0*sA0 + i1*sA1 + i2*sA2;
......@@ -1217,14 +1240,12 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
//-10 could be any value different then 0.
int cpu_err_var=-10;
// We are not 100% sure that cudaMemcpy wait that the async gpu kernel are
// finished before doing the transfer. So we add this explicit sync as it
// is pretty fast. In a python loop, I ran 1 000 000 call in 1 second.
// It is better to be safe and not significatively slower than unsafe.
cudaThreadSynchronize();
CNDA_BEGIN_ALLOW_THREADS
// As we execute cudaMemcpy on the default stream, it waits for all
// kernels (on all streams) to be finished before starting to copy
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) {
PyErr_Format(
PyExc_RuntimeError,
......@@ -2838,7 +2859,9 @@ GetDeviceMemInfo(PyObject* _unused, PyObject* dummy)
PyObject *
CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
{
CNDA_BEGIN_ALLOW_THREADS
cudaThreadSynchronize();
CNDA_END_ALLOW_THREADS
Py_INCREF(Py_None);
return Py_None;
}
......@@ -3554,11 +3577,15 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
if (!py_src) {
return -1;
}
cublasSetVector(PyArray_SIZE(py_src),
npy_intp py_src_size = PyArray_SIZE(py_src);
void *py_src_data = PyArray_DATA(py_src);
CNDA_BEGIN_ALLOW_THREADS
cublasSetVector(py_src_size,
sizeof(real),
PyArray_DATA(py_src), 1,
py_src_data, 1,
self->devdata, 1);
CNDA_THREAD_SYNC;
//CNDA_THREAD_SYNC; // unneeded because cublasSetVector is blocking anyway
CNDA_END_ALLOW_THREADS
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
PyErr_SetString(PyExc_RuntimeError, "error copying data to device memory");
......@@ -4952,7 +4979,7 @@ cnda_copy_structure_to_device(const CudaNdarray * self)
1,
self->dev_structure,
1);
CNDA_THREAD_SYNC;
//CNDA_THREAD_SYNC; // unneeded because cublasSetVector is blocking anyway
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
......@@ -5093,7 +5120,7 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims)
const int * dims, int fortran)
{
bool allocated = false;
if (*arr == NULL)
......@@ -5105,7 +5132,7 @@ int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
allocated = true;
}
if (CudaNdarray_alloc_contiguous(*arr, nd, dims))
if (CudaNdarray_alloc_contiguous(*arr, nd, dims, fortran))
{
if (allocated)
{
......
......@@ -34,6 +34,11 @@
#include <numpy/arrayobject.h>
#include <stdio.h>
#include <stdint.h>
#ifndef SIZE_MAX
#define SIZE_MAX ((size_t)-1)
#endif
#include <cublas.h>
......@@ -160,6 +165,12 @@ CudaNdarray_CheckExact(const PyObject * ob);
DllExport bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self);
/**
* Return true for an F-contiguous CudaNdarray, else false
*/
DllExport bool
CudaNdarray_is_f_contiguous(const CudaNdarray * self);
/****
* Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions.
*/
......@@ -326,14 +337,17 @@ CudaNdarray_set_nd(CudaNdarray * self, const int nd)
* Allocate storage space for a tensor of rank 'nd' and given dimensions.
* (No-op if self already has a contiguous tensor of the right dimensions)
*
* If fortran is non-zero, a Fortran order is used; otherwise, a C order.
*
* Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions
*/
template<typename inttype>
static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype * dim)
static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
const inttype * dim, int fortran=0)
{
// allocate an empty ndarray with c_contiguous access
// return 0 on success
int size = 1; //set up the strides for contiguous tensor
size_t size = 1; //set up the strides for contiguous tensor
assert (nd >= 0);
// Here we modify the host structure to have the desired shape and
......@@ -342,11 +356,38 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
{
return -1;
}
for (int i = nd-1; i >= 0; --i)
if (fortran)
{
CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
CudaNdarray_set_dim(self, i, dim[i]);
size = size * dim[i];
for (int i = 0; i < nd; i++)
{
CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
CudaNdarray_set_dim(self, i, dim[i]);
//Detect overflow on unsigned integer
if (size > (SIZE_MAX / dim[i])) {
PyErr_Format(PyExc_AssertionError,
"Can't store in size_t the bytes requested",
size);
return -1;
}
size = size * dim[i];
}
}
else
{
for (int i = nd-1; i >= 0; --i)
{
CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
CudaNdarray_set_dim(self, i, dim[i]);
//Detect overflow on unsigned integer
if (size > (SIZE_MAX / dim[i])) {
PyErr_Format(PyExc_AssertionError,
"Can't store in size_t the bytes requested",
size);
return -1;
}
size = size * dim[i];
}
}
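The C-order vs Fortran-order stride computation above, sketched in Python (strides counted in elements, and broadcast dims of size 1 get stride 0, as in the CUDA code; `contiguous_strides` is a hypothetical helper name):

```python
def contiguous_strides(dims, fortran=False):
    # C order fills strides from the last dimension; Fortran order
    # from the first. Size-1 dims get stride 0 (broadcasting).
    strides = [0] * len(dims)
    size = 1
    order = range(len(dims)) if fortran else reversed(range(len(dims)))
    for i in order:
        strides[i] = 0 if dims[i] == 1 else size
        size *= dims[i]
    return strides, size

print(contiguous_strides((2, 3, 4)))                # ([12, 4, 1], 24)
print(contiguous_strides((2, 3, 4), fortran=True))  # ([1, 2, 6], 24)
```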
// If the allocated buffer is already of the right size, we don't need to
......@@ -372,14 +413,6 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
return -1;
}
if (size < 0)
{
PyErr_Format(PyExc_AssertionError,
"size (%i) < 0",
size);
return -1;
}
self->devdata = (float*)device_malloc(size*sizeof(real));
if (size && !self->devdata)
{
......@@ -497,6 +530,27 @@ CudaNdarray_is_c_contiguous(const CudaNdarray * self)
return c_contiguous;
}
/**
* True iff the strides look like [1, dim[0], dim[0]*dim[1], ...]
*/
DllExport inline bool ALWAYS_INLINE
CudaNdarray_is_f_contiguous(const CudaNdarray * self)
{
bool f_contiguous = true;
int size = 1;
for (int i = 0; (i < self->nd) && f_contiguous; i++)
{
if (CudaNdarray_HOST_DIMS(self)[i] == 1)
continue;
if (CudaNdarray_HOST_STRIDES(self)[i] != size)
{
f_contiguous = false;
}
size = size * CudaNdarray_HOST_DIMS(self)[i];
}
return f_contiguous;
}
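The F-contiguity test mirrors the C-contiguity one with the loop direction reversed; a Python sketch of the same check (strides in elements):

```python
def is_f_contiguous(dims, strides):
    # Strides must look like [1, dims[0], dims[0]*dims[1], ...],
    # skipping broadcast dims of size 1, as in the CUDA helper.
    size = 1
    for d, s in zip(dims, strides):
        if d == 1:
            continue
        if s != size:
            return False
        size *= d
    return True

print(is_f_contiguous((2, 3), (1, 2)))  # True  (column-major layout)
print(is_f_contiguous((2, 3), (3, 1)))  # False (row-major layout)
```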
DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
......@@ -525,8 +579,9 @@ DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_othe
// *arr may initially be NULL, a pointer to an ndarray of the wrong size,
// or a pointer to an ndarray of the right size. In the last case it will
// not change.
// If fortran is non-zero, a fortran order is expected/created
DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims);
const int * dims, int fortran = 0);
DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus err){
if(CUBLAS_STATUS_SUCCESS == err)
......
......@@ -16,7 +16,7 @@ from theano.gof.cmodule import (std_libs, std_lib_dirs,
std_include_dirs, dlimport,
get_lib_extension)
from theano.gof.python25 import any
from theano.misc.windows import call_subprocess_Popen
from theano.misc.windows import output_subprocess_Popen
_logger = logging.getLogger("theano.sandbox.cuda.nvcc_compiler")
_logger.setLevel(logging.WARN)
......@@ -98,12 +98,8 @@ nvcc_version = None
def is_nvcc_available():
"""Return True iff the nvcc compiler is found."""
def set_version():
p = call_subprocess_Popen([nvcc_path, '--version'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
p.wait()
ver_line = decode(p.stdout.readlines()[-1])
p_out = output_subprocess_Popen([nvcc_path, '--version'])
ver_line = decode(p_out[0]).strip().split('\n')[-1]
build, version = ver_line.split(',')[1].strip().split()
assert build == 'release'
......
(diff collapsed)
......@@ -109,11 +109,13 @@ def test_careduce():
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
......
import operator
import sys
import numpy
......@@ -213,20 +214,29 @@ def test_huge_elemwise_fusion():
"""
shape = (2, 3, 4, 5, 6)
ttype = tensor.tensor(dtype='float32', broadcastable=(False,) * len(shape))
vars = [tensor.tanh(ttype) for x in range(7)]
f = pfunc(vars, [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] -
vars[5] - vars[6]], mode=mode_with_gpu)
gpu_ptr_size = theano.sandbox.cuda.opt.get_device_type_sizes()['gpu_ptr_size']
if gpu_ptr_size == 8:
nb_in = 7
len_topo = 10
elif gpu_ptr_size == 4:
nb_in = 8
len_topo = 11
else:
raise Exception("Unexpected value for gpu_ptr_size", gpu_ptr_size)
vars = [tensor.tanh(ttype) for x in range(nb_in)]
f = pfunc(vars, [reduce(operator.sub, vars)], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
#theano.printing.debugprint(f)
#for i, node in enumerate(topo):
# print >> sys.stdout, i, node
assert len(topo) == 10
assert len(topo) == len_topo
assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 2
assert isinstance(topo[7].op.scalar_op, theano.scalar.basic.Sub)
assert isinstance(topo[8].op.scalar_op, theano.scalar.basic.Composite)
assert isinstance(topo[-3].op.scalar_op, theano.scalar.basic.Sub)
assert isinstance(topo[-2].op.scalar_op, theano.scalar.basic.Composite)
#let debugmode catch errors
gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
f(gen(), gen(), gen(), gen(), gen(), gen(), gen())
f(*[gen() for i in range(nb_in)])
# Test the case where we can't put the computation on the gpu! There
# are too many dimensions in the input to have 2 inputs to the op!
......
......@@ -36,13 +36,11 @@ def test_unpickle_cudandarray_as_numpy_ndarray_flag0():
with open(os.path.join(testfile_dir, fname), 'rb') as fp:
if cuda_available:
mat = cPickle.load(fp)
assert isinstance(mat, CudaNdarray)
assert numpy.asarray(mat)[0] == -42.0
else:
assert_raises(ImportError, cPickle.load, fp)
if cuda_available:
assert isinstance(mat, CudaNdarray)
assert numpy.asarray(mat)[0] == -42.0
finally:
config.experimental.unpickle_gpu_on_cpu = oldflag
......@@ -53,8 +51,11 @@ def test_unpickle_cudandarray_as_numpy_ndarray_flag1():
try:
testfile_dir = os.path.dirname(os.path.realpath(__file__))
fname = 'CudaNdarray.pkl'
if sys.version_info[0] == 3:
fname = 'CudaNdarray_py3.pkl'
with open(os.path.join(testfile_dir, 'CudaNdarray.pkl')) as fp:
with open(os.path.join(testfile_dir, fname), 'rb') as fp:
mat = cPickle.load(fp)
assert isinstance(mat, numpy.ndarray)
......
......@@ -44,7 +44,7 @@ if pygpu:
init_dev(config.device)
import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'inplace')
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile', 'inplace')
elif config.gpuarray.init_device != '':
init_dev(config.gpuarray.init_device)
except Exception:
......
......@@ -3,12 +3,12 @@ import os
import numpy
import theano
from theano import Op, Type, Apply, Variable, Constant
from theano import Op, Apply
from theano import tensor, scalar, config
from theano.scalar import Scalar
from theano.tensor.basic import Alloc
from theano.gof.python25 import all, any
from theano.gof.python25 import any
from theano.gof.utils import MethodNotDefined
from theano.compat import PY3
......@@ -161,7 +161,7 @@ class HostFromGpu(Op):
raise TypeError(x)
return Apply(self, [x],
[tensor.TensorType(dtype=x.dtype,
broadcastable=x.broadcastable,)()])
broadcastable=x.broadcastable)()])
def perform(self, node, inp, out):
x, = inp
......@@ -257,7 +257,7 @@ class GpuFromHost(Op):
def R_op(self, inputs, eval_points):
ev, = eval_points
if isintance(ev, GpuArrayType):
if isinstance(ev, GpuArrayType):
return [host_from_gpu(ev)]
else:
return ev
......@@ -317,7 +317,7 @@ class GpuFromCuda(Op):
def R_op(self, inputs, eval_points):
ev, = eval_points
if isintance(ev, GpuArrayType):
if isinstance(ev, GpuArrayType):
return [cuda_from_gpu(ev)]
else:
return ev
......@@ -651,6 +651,36 @@ class GpuAlloc(HideC, Alloc):
def c_code_cache_version(self):
return (2,)
def do_constant_folding(self, node):
for client in node.outputs[0].clients:
if client[0] == 'output':
# If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold.
return False
elif (#The following ops work in place on their input id 0.
client[1] == 0 and
isinstance(client[0].op, (
#Ops that would work in place on the Alloc. So if they
#get constant_folded, they would copy the
#constant, and that is less efficient.
#Not doing the constant folding can also lower
#the peak memory usage, as the "constant" won't
#always exist.
#theano.tensor.subtensor.AdvancedIncSubtensor,
theano.sandbox.gpuarray.subtensor.GpuIncSubtensor,
#theano.sandbox.gpuarray.subtensor.GpuAdvancedIncSubtensor1,
theano.sandbox.gpuarray.blas.GpuGemm,
theano.sandbox.gpuarray.blas.GpuGemv,
theano.sandbox.gpuarray.blas.GpuGer,
))):
return False
#If the client is a transfer, we don't want to fold. We
#let the moving opt finish before deciding what to do.
elif isinstance(client[0].op, HostFromGpu):
return False
return True
gpu_alloc = GpuAlloc()
......
from theano import Op, Apply, config
from theano.tensor.blas import Dot22, Gemv, Gemm
from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable)
try:
......@@ -28,7 +28,7 @@ class GpuGemv(BlasOp, Gemv):
A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
assert A.dtype == x.dtype == y.dtype == alpha.dtype == beta.dtype
assert A.dtype == x.dtype == y.dtype
return Apply(self, [y, alpha, A, x, beta], [y.type()])
def perform(self, node, inputs, out_storage):
......@@ -45,8 +45,15 @@ class GpuGemv(BlasOp, Gemv):
if self.inplace:
code = """
Py_XDECREF(%(out)s);
%(out)s = %(y)s;
Py_INCREF(%(out)s);
if (%(y)s->ga.strides[0] <= 0) {
%(out)s = pygpu_copy(%(y)s, GA_ANY_ORDER);
if (%(out)s == NULL) {
%(fail)s
}
} else {
%(out)s = %(y)s;
Py_INCREF(%(out)s);
}
""" % vars
else:
code = """
......@@ -72,7 +79,7 @@ class GpuGemv(BlasOp, Gemv):
return code
def c_code_cache_version(self):
return (1,)
return (2,)
gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True)
......@@ -84,7 +91,7 @@ class GpuGemm(BlasOp, Gemm):
A = as_gpuarray_variable(A)
B = as_gpuarray_variable(B)
C = as_gpuarray_variable(C)
assert A.dtype == B.dtype == C.dtype == alpha.dtype == beta.dtype
assert A.dtype == B.dtype == C.dtype
return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs):
......@@ -101,8 +108,15 @@ class GpuGemm(BlasOp, Gemm):
if self.inplace:
code = """
Py_XDECREF(%(out)s);
%(out)s = %(C)s;
Py_INCREF(%(out)s);
if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
%(out)s = pygpu_copy(%(C)s, GA_ANY_ORDER);
if (%(out)s == NULL) {
%(fail)s
}
} else {
%(out)s = %(C)s;
Py_INCREF(%(out)s);
}
""" % vars
else:
code = """
......@@ -128,13 +142,74 @@ class GpuGemm(BlasOp, Gemm):
return code
def c_code_cache_version(self):
return (1,)
return (2,)
gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True)
class GpuGer(BlasOp, Ger):
def make_node(self, A, alpha, x, y):
res = Ger.make_node(self, A, alpha, x, y)
A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
assert A.dtype == x.dtype == y.dtype
return Apply(self, [A, alpha, x, y], [A.type()])
def perform(self, node, inp, out):
A, alpha, x, y = inp
inplace = self.destructive
if inplace and not A.flags.forc:
inplace = False
out[0][0] = blas.ger(alpha, x, y, A,
overwrite_a=inplace)
def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], A=inp[0], alpha=inp[1], x=inp[2], y=inp[3],
fail=sub['fail'], name=name)
if self.destructive:
code = """
Py_XDECREF(%(out)s);
if (!GpuArray_ISONESEGMENT(&%(A)s->ga)) {
%(out)s = pygpu_copy(%(A)s, GA_ANY_ORDER);
if (%(out)s == NULL) {
%(fail)s
}
} else {
%(out)s = %(A)s;
Py_INCREF(%(out)s);
}
""" % vars
else:
code = """
Py_XDECREF(%(out)s);
%(out)s = pygpu_copy(%(A)s, GA_ANY_ORDER);
if (%(out)s == NULL) {
%(fail)s
}
""" % vars
code += """
if (pygpu_blas_rger(((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
%(x)s, %(y)s, %(out)s, 0) == -1) {
%(fail)s
}
""" % vars
if config.gpuarray.sync:
code += """
GpuArray_sync(&%(out)s->ga);
""" % vars
return code
def c_code_cache_version(self):
return (1,)
gpuger_no_inplace = GpuGer(destructive=False)
gpuger_inplace = GpuGer(destructive=True)
class GpuDot22(BlasOp, Dot22):
def make_node(self, x, y):
res = Dot22.make_node(self, x, y)
......@@ -200,19 +275,24 @@ from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.opt import in2out
@local_optimizer([gpugemv_no_inplace])
@local_optimizer([gpugemv_no_inplace], inplace=True)
def local_inplace_gpuagemv(node):
if node.op == gpugemv_no_inplace:
return [gpugemv_inplace(*node.inputs)]
@local_optimizer([gpugemm_no_inplace])
@local_optimizer([gpugemm_no_inplace], inplace=True)
def local_inplace_gpuagemm(node):
if node.op == gpugemm_no_inplace:
return [gpugemm_inplace(*node.inputs)]
@local_optimizer([gpuger_no_inplace], inplace=True)
def local_inplace_gpuager(node):
if node.op == gpuger_no_inplace:
return [gpuger_inplace(*node.inputs)]
gpuablas_opt_inplace = in2out(LocalOptGroup(
local_inplace_gpuagemv, local_inplace_gpuagemm),
local_inplace_gpuagemv, local_inplace_gpuagemm, local_inplace_gpuager),
name='gpuablas_opt_inplace')
optdb.register('InplaceGpuaBlasOpt',
gpuablas_opt_inplace,
......
......@@ -1281,7 +1281,10 @@ class GpuCAReduceCuda(HideC, CAReduce):
n_threads.z += 1;
else
break;
}""" % locals()
}
//Maximum for Fermi GPUs on that dimension.
n_threads.z = std::min(n_threads.z, (unsigned)64);
""" % locals()
if len(self.reduce_mask) == 2:
threads_y = ''
......@@ -1601,6 +1604,8 @@ class GpuCAReduceCuda(HideC, CAReduce):
n_threads.z += 1;
}
n_threads.z -= 1;
            // Maximum for a Fermi GPU on this dimension.
n_threads.z = std::min(n_threads.z, (unsigned)64);
dim3 n_blocks(1,1,1);
%(makecall)s
......@@ -1697,7 +1702,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
""" % locals()
def c_code_cache_version_apply(self, node):
version = [8] # the version corresponding to the c code in this Op
version = [9] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
......
import copy
import theano
import numpy
from theano import tensor, scalar
from theano import tensor, scalar, gof
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, ProxyDB,
Optimizer, toolbox,
InconsistencyError, EquilibriumOptimizer)
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.gof.python25 import all, any
from theano.tensor.nnet.conv import ConvOp
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
gpu_from_host,
gpu_alloc,
GpuAlloc,
GpuReshape,
GpuEye)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
from theano.sandbox.gpuarray.basic_ops import (
host_from_gpu, gpu_from_host, HostFromGpu,
gpu_alloc, GpuAlloc, GpuReshape, GpuEye
)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
from theano.sandbox.gpuarray.conv import GpuConv
from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias,
GpuSoftmax)
from theano.sandbox.gpuarray.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax
)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda)
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
......@@ -54,6 +55,20 @@ def register_opt(*tags, **kwargs):
register_opt()(theano.tensor.opt.local_track_shape_i)
def safe_to_gpu(x):
if isinstance(x.type, tensor.TensorType):
return gpu_from_host(x)
else:
return x
def safe_to_cpu(x):
if isinstance(x.type, GpuArrayType):
return host_from_gpu(x)
else:
return x
def op_lifter(OP):
"""
OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
......@@ -73,10 +88,10 @@ def op_lifter(OP):
    # This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op:
if isinstance(new_op, theano.Op):
return [host_from_gpu(o) for o in
return [safe_to_cpu(o) for o in
new_op(*node.inputs, return_list=True)]
elif isinstance(new_op, (tuple, list)):
return [host_from_gpu(o) for o in new_op]
return [safe_to_cpu(o) for o in new_op]
else: # suppose it is a variable on the GPU
return [host_from_gpu(new_op)]
return False
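The rewrite described in the `op_lifter` docstring above, `OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))`, can be sketched with plain functions. The `('cpu'|'gpu', data)` pairs below are an illustration of the graph transformation, not Theano's implementation:

```python
# Toy sketch of the op_lifter rewrite:
#   OP(host_from_gpu(x))  ->  host_from_gpu(GpuOP(x))
# Values are modeled as ('cpu', data) / ('gpu', data) pairs.

def host_from_gpu(v):
    tag, data = v
    assert tag == 'gpu'
    return ('cpu', data)

def cpu_square(v):          # the original CPU op
    tag, data = v
    return (tag, data * data)

def gpu_square(v):          # its GPU counterpart
    tag, data = v
    return (tag, data * data)

x_gpu = ('gpu', 3)
before = cpu_square(host_from_gpu(x_gpu))   # transfer, then compute on CPU
after = host_from_gpu(gpu_square(x_gpu))    # lifted: compute on GPU, then transfer
```

Both orders give the same value; the point of the lift is that the computation now happens before the transfer, so later optimizations can remove the transfer entirely.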
......@@ -132,7 +147,17 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
@register_opt()
@op_lifter([tensor.Alloc])
def local_gpualloc(node):
return gpu_alloc
new_out = gpu_alloc(*node.inputs)
    # We need to hide new broadcastable dimensions because
    # ReplaceValidate doesn't like it when they change.
if new_out.broadcastable != node.outputs[0].broadcastable:
# but if a dim is suddenly not broadcastable anymore then that's a bug
for b_old, b_new in zip(node.outputs[0].broadcastable,
new_out.broadcastable):
assert b_new or (not b_old)
new_out = tensor.patternbroadcast(new_out,
node.outputs[0].broadcastable)
return (new_out,)
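The broadcast-pattern handling above can be sketched in isolation: a dimension may gain broadcastability in the GPU output (and is then hidden again behind the old pattern), but must never lose it. A toy version, assuming broadcast patterns are tuples of bools:

```python
# Toy sketch of the broadcastable check and hiding done in
# local_gpualloc above. Tuples of bools stand in for Theano's
# broadcastable patterns.

def hide_new_broadcastable(old_bcast, new_bcast):
    # A dim may become broadcastable (the GPU op inferred more),
    # but a dim that stops being broadcastable is a bug.
    for b_old, b_new in zip(old_bcast, new_bcast):
        assert b_new or (not b_old)
    # "patternbroadcast" back to the old pattern so the replacement
    # validator sees an unchanged type.
    return old_bcast

# New output inferred an extra broadcastable dim; it gets hidden.
out = hide_new_broadcastable((False, False), (True, False))
```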
@register_opt()
......@@ -158,6 +183,13 @@ def local_gpureshape(node):
return res
@register_opt()
@op_lifter([tensor.Rebroadcast])
def local_gpu_rebroadcast(node):
if isinstance(node.inputs[0].owner.op, HostFromGpu):
return node.op(node.inputs[0].owner.inputs[0])
@register_opt()
@op_lifter([tensor.Flatten])
def local_gpuflatten(node):
......@@ -176,8 +208,6 @@ def local_gpuflatten(node):
def local_gpu_elemwise(node):
op = node.op
name = op.name
if node.outputs[0].ndim == 0:
return
if name:
name = 'Gpu'+name
res = GpuElemwise(op.scalar_op, name=name,
......@@ -302,23 +332,23 @@ def local_gpua_careduce(node):
@register_opt()
@op_lifter([tensor.blas.Gemv])
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
def local_gpua_gemv(node):
return GpuGemv(inplace=node.op.inplace)
@register_opt()
@op_lifter([tensor.blas_c.CGemv])
def local_gpua_gemv2(node):
return GpuGemv(inplace=node.op.inplace)
@register_opt()
@op_lifter([tensor.blas.Gemm])
def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace)
@register_opt()
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node):
return GpuGer(destructive=node.op.destructive)
@register_opt()
@op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node):
......@@ -341,17 +371,20 @@ def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
return GpuCrossentropySoftmax1HotWithBiasDx()
@register_opt()
@op_lifter([tensor.nnet.Softmax])
def local_gpua_softmax(node):
return GpuSoftmax()
@register_opt()
@op_lifter([tensor.nnet.SoftmaxWithBias])
def local_gpua_softmaxwithbias(node):
return GpuSoftmaxWithBias()
@register_opt()
@op_lifter([gpu_from_host, ConvOp])
def local_gpu_conv(node):
......@@ -429,3 +462,97 @@ def local_gpu_conv(node):
out = gpu_from_host(out)
out.values_eq_approx = values_eq_approx
return [out]
def tensor_to_gpu(x):
if isinstance(x.type, tensor.TensorType):
y = GpuArrayType(broadcastable=x.type.broadcastable,
dtype=x.type.dtype)()
if x.name:
y.name = x.name + '[Gpua]'
return y
else:
return x
def gpu_safe_new(x, tag=''):
"""
    Internal function that constructs a new variable from x with the same
    type, but with a different name (old name + tag). This function is used
    by the gradient or the R-op to construct new variables for the inputs of
    the inner graph, so that there is no interference between the original
    graph and the newly constructed graph.
"""
if hasattr(x, 'name') and x.name is not None:
nw_name = x.name + tag
else:
nw_name = None
if isinstance(x, theano.Constant):
return x.clone()
nw_x = x.type()
nw_x.name = nw_name
return nw_x
def gpu_reconstruct_graph(inputs, outputs, tag=None):
"""
    A different interface to clone that allows you to pass inputs.
    Compared to clone, this method always replaces the inputs with
    new variables of the same type, and returns those (in the same
    order as the original inputs).
"""
if tag is None:
tag = ''
nw_inputs = [gpu_safe_new(x, tag) for x in inputs]
givens = {}
for nw_x, x in zip(nw_inputs, inputs):
givens[x] = nw_x
nw_outputs = scan_utils.clone(outputs, replace=givens)
return (nw_inputs, nw_outputs)
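The clone-with-fresh-inputs behaviour of `gpu_safe_new`/`gpu_reconstruct_graph` can be sketched on a toy expression form, with nested tuples standing in for Theano apply nodes and strings for variables:

```python
# Toy sketch: rebuild an expression graph over fresh, renamed inputs,
# mirroring gpu_safe_new / gpu_reconstruct_graph above. The tuple-based
# "graph" is illustrative, not Theano's Variable/Apply classes.

def safe_new(name, tag=''):
    # New input named old name + tag, as gpu_safe_new does.
    return name + tag if name is not None else None

def reconstruct_graph(inputs, outputs, tag='_copy'):
    nw_inputs = [safe_new(x, tag) for x in inputs]
    givens = dict(zip(inputs, nw_inputs))

    def clone(expr):
        if isinstance(expr, tuple):            # an apply node: (op, *args)
            return (expr[0],) + tuple(clone(a) for a in expr[1:])
        return givens.get(expr, expr)          # a leaf: substitute inputs

    return nw_inputs, [clone(o) for o in outputs]

ins, outs = reconstruct_graph(['x', 'y'],
                              [('add', 'x', ('mul', 'y', 'y'))])
```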
@register_opt('scan')
@op_lifter([scan_op.Scan])
def local_scan_to_gpua(node):
info = copy.deepcopy(node.op.info)
info['gpua'] = True
nw_ins = [node.inputs[0]]
e = (1 +
node.op.n_seqs +
node.op.n_mit_mot +
node.op.n_mit_sot +
node.op.n_sit_sot +
node.op.n_shared_outs)
nw_ins += [safe_to_gpu(x) for x in node.inputs[1:e]]
b = e
e = e + node.op.n_nit_sot
nw_ins += node.inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in node.inputs[e:]]
scan_ins = [tensor_to_gpu(x) for x in node.op.inputs]
scan_outs = [safe_to_gpu(x) for x in node.op.outputs]
scan_outs = scan_utils.clone(
scan_outs,
replace=zip(node.op.inputs,
[safe_to_cpu(x) for x in scan_ins]))
    # We need to construct the hash here, because scan
    # __init__ does not know about the gpu and cannot
    # handle graphs with inputs on the gpu
tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, scan_outs)
local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False)
_cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
info['gpu_hash'] = hash(_cmodule_key)
nw_op = scan_op.Scan(scan_ins, scan_outs, info,
typeConstructor=GpuArrayType).make_node(*nw_ins)
return nw_op.outputs
optdb.register('gpua_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=GpuArrayType,
gpua_flag=True),
75,
'gpua',
'fast_run',
'inplace',
'scan')
......@@ -7,6 +7,7 @@ import theano
from theano import tensor, gof
from theano.gof.python25 import all, any
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try:
......
......@@ -32,11 +32,13 @@ if not theano.sandbox.gpuarray.pygpu_activated:
from theano.sandbox.gpuarray.type import (GpuArrayType,
gpuarray_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc, gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape,
GpuEye)
from theano.sandbox.gpuarray.basic_ops import (
host_from_gpu, gpu_from_host,
gpu_alloc, GpuAlloc,
gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape,
GpuEye)
from theano.tests import unittest_tools as utt
utt.seed_rng()
......@@ -290,6 +292,13 @@ GpuAllocTester = makeTester(
)
class TestAlloc(theano.tensor.tests.test_basic.TestAlloc):
dtype = "float32"
mode = mode_with_gpu
shared = staticmethod(gpuarray_shared_constructor)
allocs = [GpuAlloc, GpuAlloc, T.Alloc]
def test_shape():
x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])()
v = gpuarray.zeros((3, 4, 5), dtype='float32')
......
from unittest import TestCase
from nose.plugins.skip import SkipTest
import theano
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
from theano import tensor
from theano.tests import unittest_tools
from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
_dot22)
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand
from theano.sandbox.gpuarray import gpuarray_shared_constructor
from theano.sandbox.gpuarray.tests.test_basic_ops import (makeTester, rand,
mode_with_gpu)
from theano.sandbox.gpuarray.blas import (gpugemv_inplace,
gpugemm_inplace, gpu_dot22)
from theano.sandbox.gpuarray.blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_no_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22)
GpuGemvTester = makeTester('GpuGemvTester',
......@@ -21,6 +30,21 @@ GpuGemvTester = makeTester('GpuGemvTester',
)
)
class TestGpuSgemv(TestCase, BaseGemv, unittest_tools.TestOptimizationMixin):
mode = mode_with_gpu
dtype = 'float32'
gemv = gpugemv_no_inplace
gemv_inplace = gpugemv_inplace
@staticmethod
def shared(val):
try:
return gpuarray_shared_constructor(val)
except TypeError:
return theano.shared(val)
GpuGemmTester = makeTester('GpuGemmTester',
op=gemm_inplace, gpu_op=gpugemm_inplace,
cases=dict(
......@@ -37,9 +61,40 @@ GpuGemmTester = makeTester('GpuGemmTester',
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
)
class TestGpuSger(TestGer):
def setUp(self):
self.mode = mode_with_gpu
dtype = self.dtype = 'float32' # optimization isn't dtype-dependent
self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
self.a = tensor.tensor(dtype=dtype, broadcastable=())
self.x = tensor.tensor(dtype=dtype, broadcastable=(False,))
self.y = tensor.tensor(dtype=dtype, broadcastable=(False,))
self.ger_destructive = gpuger_inplace
        # data on the GPU makes the op always inplace
self.ger = gpuger_inplace
self.gemm = gpugemm_inplace
def test_f32_0_0(self):
raise SkipTest('0-sized objects not supported')
def test_f32_1_0(self):
raise SkipTest('0-sized objects not supported')
def test_f32_0_1(self):
raise SkipTest('0-sized objects not supported')
class TestGpuSgerNoTransfer(TestGpuSger):
shared = staticmethod(gpuarray_shared_constructor)
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
def setUp(self):
self.ops = [gpuger_no_inplace, gpuger_inplace]
def clone(self, op):
return GpuGer(destructive=op.destructive)
GpuDot22Tester = makeTester(
    'GpuDot22Tester',
......
import unittest
from theano import scalar, gof
from theano.gof import FunctionGraph
from theano.gof.python25 import all, any
from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
test_CAReduce)
......@@ -126,11 +122,13 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
#test pattern implemented by reshape
......
import unittest
# We let that import do the init of the back-end if needed.
from theano.sandbox.gpuarray.tests.test_basic_ops import (mode_with_gpu,
mode_without_gpu)
import theano.sandbox.test_neighbours
from theano.sandbox.gpuarray.neighbours import GpuImages2Neibs
class T_GpuImages2Neibs(theano.sandbox.test_neighbours.T_Images2Neibs):
mode = mode_with_gpu
op = GpuImages2Neibs
dtypes = ['int64', 'float32', 'float64']
if __name__ == '__main__':
unittest.main()
import numpy
import theano
from theano import tensor
from theano.tests import unittest_tools as utt
import theano.sandbox.gpuarray
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import GpuAlloc, GpuReshape, gpu_alloc
from theano.sandbox.gpuarray.elemwise import GpuCAReduceCuda
import theano.sandbox.gpuarray
from theano.sandbox.gpuarray.tests.test_basic_ops import (
rand_gpuarray, mode_with_gpu, mode_without_gpu
)
from theano.tests.unittest_tools import SkipTest
if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed")
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not cuda_ndarray.use.device_number:
cuda_ndarray.use('gpu')
theano.sandbox.gpuarray.init_dev('cuda')
if not theano.sandbox.gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled")
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def test_flatten():
m = theano.tensor.fmatrix()
f = theano.function([m], m.flatten(), mode=mode_with_gpu)
......@@ -104,3 +89,20 @@ def test_local_gpualloc_memset_0():
assert isinstance(topo[0].op, GpuAlloc)
assert not topo[0].op.memset_0
assert (numpy.asarray(f(2)) == 1).all()
def test_rebroadcast():
d = numpy.random.rand(10, 10).astype('float32')
v = theano.tensor.fmatrix()
up = tensor.unbroadcast(v.sum().dimshuffle('x', 'x'), 0, 1)
f = theano.function([v], [up], mode=mode_with_gpu)
f(d)
topo = f.maker.fgraph.toposort()
rebrs = [node for node in topo if isinstance(node.op, tensor.Rebroadcast)]
assert len(rebrs) == 1
rebr = rebrs[0]
assert isinstance(rebr.inputs[0].type, GpuArrayType)
assert isinstance(rebr.outputs[0].type, GpuArrayType)
from unittest import TestCase
import numpy
import theano
from theano.tests import unittest_tools as utt
import theano.sandbox.rng_mrg
from theano.sandbox.gpuarray.basic_ops import (
gpu_from_host, GpuFromHost, HostFromGpu
)
from theano.sandbox.gpuarray.elemwise import GpuElemwise
from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu
class T_Scan(TestCase):
def setUp(self):
utt.seed_rng()
def test_one_sequence_one_output_weights_gpu1(self):
def f_rnn(u_t, x_tm1, W_in, W):
return u_t * W_in + x_tm1 * W
u = theano.tensor.fvector('u')
x0 = theano.tensor.fscalar('x0')
W_in = theano.tensor.fscalar('win')
W = theano.tensor.fscalar('w')
mode = mode_with_gpu.excluding('InputToGpuOptimizer')
output, updates = theano.scan(f_rnn,
u,
x0,
[W_in, W],
n_steps=None,
truncate_gradient=-1,
go_backwards=False,
mode=mode)
output = gpu_from_host(output)
f2 = theano.function([u, x0, W_in, W],
output,
updates=updates,
allow_input_downcast=True,
mode=mode)
rng = numpy.random.RandomState(utt.fetch_seed())
v_u = rng.uniform(size=(4,), low=-5., high=5.)
v_x0 = rng.uniform()
W = rng.uniform()
W_in = rng.uniform()
v_u = numpy.asarray(v_u, dtype='float32')
v_x0 = numpy.asarray(v_x0, dtype='float32')
W = numpy.asarray(W, dtype='float32')
W_in = numpy.asarray(W_in, dtype='float32')
# compute the output in numpy
v_out = numpy.zeros((4,))
v_out[0] = v_u[0] * W_in + v_x0 * W
for step in xrange(1, 4):
v_out[step] = v_u[step] * W_in + v_out[step - 1] * W
theano_values = f2(v_u, v_x0, W_in, W)
utt.assert_allclose(theano_values, v_out)
# TO DEL
topo = f2.maker.fgraph.toposort()
scan_node = [node for node in topo
if isinstance(node.op, theano.scan_module.scan_op.Scan)]
assert len(scan_node) == 1
scan_node = scan_node[0]
topo = f2.maker.fgraph.toposort()
assert sum([isinstance(node.op, HostFromGpu)
for node in topo]) == 0
assert sum([isinstance(node.op, GpuFromHost)
for node in topo]) == 4
scan_node = [node for node in topo
if isinstance(node.op, theano.scan_module.scan_op.Scan)]
assert len(scan_node) == 1
scan_node = scan_node[0]
scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()
# check that there is no gpu transfer in the inner loop.
assert any([isinstance(node.op, GpuElemwise)
for node in scan_node_topo])
assert not any([isinstance(node.op, HostFromGpu)
for node in scan_node_topo])
assert not any([isinstance(node.op, GpuFromHost)
for node in scan_node_topo])
    # This second version tests the second case in the optimizer for the GPU.
def test_one_sequence_one_output_weights_gpu2(self):
def f_rnn(u_t, x_tm1, W_in, W):
return u_t * W_in + x_tm1 * W
u = theano.tensor.fvector('u')
x0 = theano.tensor.fscalar('x0')
W_in = theano.tensor.fscalar('win')
W = theano.tensor.fscalar('w')
output, updates = theano.scan(f_rnn,
u,
x0,
[W_in, W],
n_steps=None,
truncate_gradient=-1,
go_backwards=False,
mode=mode_with_gpu)
f2 = theano.function([u, x0, W_in, W],
output,
updates=updates,
allow_input_downcast=True,
mode=mode_with_gpu)
# get random initial values
rng = numpy.random.RandomState(utt.fetch_seed())
v_u = rng.uniform(size=(4,), low=-5., high=5.)
v_x0 = rng.uniform()
W = rng.uniform()
W_in = rng.uniform()
# compute the output in numpy
v_out = numpy.zeros((4,))
v_out[0] = v_u[0] * W_in + v_x0 * W
for step in xrange(1, 4):
v_out[step] = v_u[step] * W_in + v_out[step - 1] * W
theano_values = f2(v_u, v_x0, W_in, W)
utt.assert_allclose(theano_values, v_out)
topo = f2.maker.fgraph.toposort()
assert sum([isinstance(node.op, HostFromGpu)
for node in topo]) == 1
assert sum([isinstance(node.op, GpuFromHost)
for node in topo]) == 4
scan_node = [node for node in topo
if isinstance(node.op, theano.scan_module.scan_op.Scan)]
assert len(scan_node) == 1
scan_node = scan_node[0]
scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()
# check that there is no gpu transfer in the inner loop.
assert any([isinstance(node.op, GpuElemwise)
for node in scan_node_topo])
assert not any([isinstance(node.op, HostFromGpu)
for node in scan_node_topo])
assert not any([isinstance(node.op, GpuFromHost)
for node in scan_node_topo])
    # This third test checks that scan can deal with a mixture of dtypes as
    # outputs when it is running on the GPU
def test_gpu3_mixture_dtype_outputs(self):
def f_rnn(u_t, x_tm1, W_in, W):
return (u_t * W_in + x_tm1 * W,
theano.tensor.cast(u_t + x_tm1, 'int64'))
u = theano.tensor.fvector('u')
x0 = theano.tensor.fscalar('x0')
W_in = theano.tensor.fscalar('win')
W = theano.tensor.fscalar('w')
output, updates = theano.scan(f_rnn,
u,
[x0, None],
[W_in, W],
n_steps=None,
truncate_gradient=-1,
go_backwards=False,
mode=mode_with_gpu)
f2 = theano.function([u, x0, W_in, W],
output,
updates=updates,
allow_input_downcast=True,
mode=mode_with_gpu)
# get random initial values
rng = numpy.random.RandomState(utt.fetch_seed())
v_u = rng.uniform(size=(4,), low=-5., high=5.)
v_x0 = rng.uniform()
W = rng.uniform()
W_in = rng.uniform()
# compute the output in numpy
v_out1 = numpy.zeros((4,))
v_out2 = numpy.zeros((4,), dtype='int64')
v_out1[0] = v_u[0] * W_in + v_x0 * W
v_out2[0] = v_u[0] + v_x0
for step in xrange(1, 4):
v_out1[step] = v_u[step] * W_in + v_out1[step - 1] * W
v_out2[step] = numpy.int64(v_u[step] + v_out1[step - 1])
theano_out1, theano_out2 = f2(v_u, v_x0, W_in, W)
utt.assert_allclose(theano_out1, v_out1)
utt.assert_allclose(theano_out2, v_out2)
topo = f2.maker.fgraph.toposort()
scan_node = [node for node in topo
if isinstance(node.op, theano.scan_module.scan_op.Scan)]
assert len(scan_node) == 1
scan_node = scan_node[0]
assert scan_node.op.gpua
scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()
# check that there is no gpu transfer in the inner loop.
assert not any([isinstance(node.op, HostFromGpu)
for node in scan_node_topo])
assert not any([isinstance(node.op, GpuFromHost)
for node in scan_node_topo])
def test_gpu4_gibbs_chain(self):
rng = numpy.random.RandomState(utt.fetch_seed())
v_vsample = numpy.array(rng.binomial(1, .5, size=(3, 20),),
dtype='float32')
vsample = theano.shared(v_vsample)
trng = theano.sandbox.rng_mrg.MRG_RandomStreams(
utt.fetch_seed())
def f(vsample_tm1):
return trng.binomial(vsample_tm1.shape, n=1, p=0.3,
dtype='float32') * vsample_tm1
theano_vsamples, updates = theano.scan(f,
[],
vsample,
[],
n_steps=10,
truncate_gradient=-1,
go_backwards=False,
mode=mode_with_gpu)
my_f = theano.function([],
theano_vsamples[-1],
updates=updates,
allow_input_downcast=True,
mode=mode_with_gpu)
        # I leave this to be tested by DebugMode; this test was anyway
        # more of a "does the graph compile" kind of test
t_result = my_f()
......@@ -26,4 +26,6 @@ class G_subtensor(T_subtensor):
dtype='float32',
ignore_topo=(HostFromGpu, GpuFromHost,
DeepCopyOp))
        # GPU optimizations can't run in fast_compile mode.
self.fast_compile = False
assert self.sub == GpuSubtensor
......@@ -303,6 +303,109 @@ def test_consistency_GPU_parallel():
assert(numpy.allclose(samples, java_samples))
def test_consistency_GPUA_serial():
'''Verify that the random numbers generated by GPUA_mrg_uniform, serially,
are the same as the reference (Java) implementation by L'Ecuyer et al.
'''
from theano.sandbox.gpuarray.tests.test_basic_ops import \
mode_with_gpu as mode
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
seed = 12345
n_samples = 5
n_streams = 12
n_substreams = 7
samples = []
curr_rstate = numpy.array([seed] * 6, dtype='int32')
for i in range(n_streams):
stream_rstate = curr_rstate.copy()
for j in range(n_substreams):
substream_rstate = numpy.array(stream_rstate.copy(), dtype='int32')
# Transfer to device
rstate = gpuarray_shared_constructor(substream_rstate)
new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate,
ndim=None,
dtype='float32',
size=(1,))
rstate.default_update = new_rstate
# Not really necessary, just mimicking
# rng_mrg.MRG_RandomStreams' behavior
sample.rstate = rstate
sample.update = (rstate, new_rstate)
# We need the sample back in the main memory
cpu_sample = tensor.as_tensor_variable(sample)
f = theano.function([], cpu_sample, mode=mode)
for k in range(n_samples):
s = f()
samples.append(s)
# next substream
stream_rstate = rng_mrg.ff_2p72(stream_rstate)
# next stream
curr_rstate = rng_mrg.ff_2p134(curr_rstate)
samples = numpy.array(samples).flatten()
assert(numpy.allclose(samples, java_samples))
def test_consistency_GPUA_parallel():
'''Verify that the random numbers generated by GPUA_mrg_uniform, in
parallel, are the same as the reference (Java) implementation by
L'Ecuyer et al.
'''
from theano.sandbox.gpuarray.tests.test_basic_ops import \
mode_with_gpu as mode
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
seed = 12345
n_samples = 5
n_streams = 12
n_substreams = 7 # 7 samples will be drawn in parallel
samples = []
curr_rstate = numpy.array([seed] * 6, dtype='int32')
for i in range(n_streams):
stream_samples = []
rstate = [curr_rstate.copy()]
for j in range(1, n_substreams):
rstate.append(rng_mrg.ff_2p72(rstate[-1]))
rstate = numpy.asarray(rstate).flatten()
rstate = gpuarray_shared_constructor(rstate)
new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate, ndim=None,
dtype='float32', size=(n_substreams,))
rstate.default_update = new_rstate
# Not really necessary, just mimicking
# rng_mrg.MRG_RandomStreams' behavior
sample.rstate = rstate
sample.update = (rstate, new_rstate)
# We need the sample back in the main memory
cpu_sample = tensor.as_tensor_variable(sample)
f = theano.function([], cpu_sample, mode=mode)
for k in range(n_samples):
s = f()
stream_samples.append(s)
samples.append(numpy.array(stream_samples).T.flatten())
# next stream
curr_rstate = rng_mrg.ff_2p134(curr_rstate)
samples = numpy.array(samples).flatten()
assert(numpy.allclose(samples, java_samples))
def basictest(f, steps, sample_size, prefix="", allow_01=False, inputs=None,
target_avg=0.5, target_std=None, mean_rtol=0.01, std_tol=0.01):
if inputs is None:
......
......@@ -71,9 +71,9 @@ def upcast(dtype, *dtypes):
def get_scalar_type(dtype):
"""
Return an Scalar(dtype) object.
Return a Scalar(dtype) object.
This cache objects to save allocation and run time.
This caches objects to save allocation and run time.
"""
if dtype not in get_scalar_type.cache:
get_scalar_type.cache[dtype] = Scalar(dtype=dtype)
......@@ -909,7 +909,22 @@ class UnaryScalarOp(ScalarOp):
node.inputs[0].type != node.outputs[0].type):
raise theano.gof.utils.MethodNotDefined()
dtype = node.inputs[0].dtype
dtype = node.inputs[0].type.dtype_specs()[1]
fct_call = self.c_code_contiguous_raw(dtype, 'n', 'x', 'z')
return """
{
npy_intp n = PyArray_SIZE(%(z)s);
%(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
%(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
%(fct_call)s;
}
""" % locals()
def c_code_contiguous_raw(self, dtype, n, i, o):
if not config.lib.amdlibm:
raise theano.gof.utils.MethodNotDefined()
if dtype.startswith('npy_'):
dtype = dtype[4:]
if dtype == 'float32' and self.amd_float32 is not None:
dtype = 'float'
fct = self.amd_float32
......@@ -918,12 +933,7 @@ class UnaryScalarOp(ScalarOp):
fct = self.amd_float64
else:
raise theano.gof.utils.MethodNotDefined()
return """
npy_intp n = PyArray_SIZE(%(z)s);
%(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
%(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
%(fct)s(n, x, z);
""" % locals()
return "%(fct)s(%(n)s, %(i)s, %(o)s)" % locals()
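After this refactor, `c_code_contiguous` builds the loop variables and casts, while `c_code_contiguous_raw` emits only the bare call. A sketch of that final substitution step, with `amd_vrsa_expf` used as a hypothetical amdlibm function name:

```python
# Sketch of the last formatting step in c_code_contiguous_raw above.
# 'amd_vrsa_expf' is a hypothetical amdlibm function name used only
# for illustration.

def c_code_contiguous_raw(fct, n, i, o):
    # Emit just the call, so callers can place it inside their own
    # variable-declaration block.
    return "%(fct)s(%(n)s, %(i)s, %(o)s)" % locals()

call = c_code_contiguous_raw('amd_vrsa_expf', 'n', 'x', 'z')
```

Splitting the call out this way lets other code (e.g. Elemwise fusion) reuse the raw call with its own variable names instead of the fixed `n`/`x`/`z` block.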
class BinaryScalarOp(ScalarOp):
......@@ -2964,7 +2974,40 @@ class Composite(ScalarOp):
# We need to clone the graph as sometimes its nodes already
# contain a reference to an fgraph. As we want the Composite
# to be pickable, we can't have reference to fgraph.
inputs, outputs = gof.graph.clone(inputs, outputs)
        # Also, if there are Composites in the inner graph, we want to
        # remove them. In that case, we do a more complicated clone
        # that will flatten the Composites. We don't need to do this
        # recursively: the way the fusion optimizer works, we get
        # only 1 new Composite at the output each time.
if len(outputs) > 1 or not any([isinstance(var.owner.op, Composite)
for var in outputs]):
# No inner Composite
inputs, outputs = gof.graph.clone(inputs, outputs)
else:
# Inner Composite that we need to flatten
assert len(outputs) == 1
# 1. Create a new graph from inputs up to the
# Composite
res = theano.compile.rebuild_collect_shared(
inputs=inputs,
outputs=outputs[0].owner.inputs,
copy_inputs_over=False) # Clone also the inputs
# 2. We continue this partial clone with the graph in
# the inner Composite
res2 = theano.compile.rebuild_collect_shared(
inputs=outputs[0].owner.op.inputs,
outputs=outputs[0].owner.op.outputs,
replace=dict(zip(outputs[0].owner.op.inputs, res[1]))
)
assert len(res2[1]) == len(outputs)
assert len(res[0]) == len(inputs)
assert res[0] != inputs
inputs, outputs = res[0], res2[1]
        # The next assert is commented out only for speed
#assert not any([isinstance(node.op, Composite) for node in
# theano.gof.graph.ops(inputs, outputs)])
self.inputs = copy(inputs)
self.outputs = copy(outputs)
self.inputs_type = tuple([input.type for input in inputs])
......
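The flattening above runs two partial clones: one from the Composite's inputs up to the inner Composite's inputs, and one through the inner Composite's own graph with its inputs replaced by the first clone's outputs. With plain functions standing in for Composite graphs, the idea reduces to composing the two:

```python
# Toy sketch of flattening one level of Composite nesting, mirroring
# the two rebuild_collect_shared passes above. Plain functions stand
# in for Composite graphs; this is not Theano's implementation.

def make_composite(fn):
    fn.is_composite = True
    return fn

inner = make_composite(lambda a, b: a + b)      # C = a + b

def flatten(outer_args_fn, inner_fn):
    # Pass 1: the graph from the inputs up to the inner Composite's
    # inputs. Pass 2: the inner Composite's body, fed pass 1's
    # outputs. Composing them yields one flat callable.
    flat = lambda *args: inner_fn(*outer_args_fn(*args))
    flat.is_composite = False                   # no nesting left
    return flat

# CC = C(x * y, y)  ->  flat(x, y) = x * y + y
flat = flatten(lambda x, y: (x * y, y), inner)
```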
......@@ -68,19 +68,17 @@ class test_composite(unittest.TestCase):
fn = gof.DualLinker().accept(g).make_function()
assert fn(1.0, 2.0) == 1.5
# def test_sin(self):
# x = inputs()
# e = sin(x)
# C = Composite([x], [e])
# c = C.make_node(x)
# # print c.c_code(['x'], ['z'], dict(id = 0))
# g = FunctionGraph([x], [c.out])
# fn = gof.DualLinker().accept(g).make_function()
# assert fn(0) == 0
# assert fn(3.14159265358/2) == 1
# assert fn(3.14159265358) == 0
# WRITEME: Test for sin, pow, and other scalar ops.
def test_flatten(self):
        # Test that we flatten nested Composites.
x, y, z = inputs()
C = Composite([x, y], [x + y])
CC = Composite([x, y], [C(x * y, y)])
assert not isinstance(CC.outputs[0].owner.op, Composite)
# Test with multiple outputs
CC = Composite([x, y, z], [C(x * y, y), C(x * z, y)])
        # We don't flatten that case.
assert isinstance(CC.outputs[0].owner.op, Composite)
def test_with_constants(self):
x, y, z = inputs()
......
......@@ -56,23 +56,24 @@ class Scan(PureOp):
the scan op (like number of different types of
arguments, name, mode, if it should run on GPU or
not, etc.)
:param typeConstructor: function that constructs a Theano TensorType
able to represent a float32 ndarray.
Note: ``typeConstructor`` had been added to refactor how Theano
deals with the GPU. If it runs on the GPU, scan needs to construct
certain outputs (those who reside in the GPU memory) as CudaNdarray.
However we can not import cuda in this file (as it is in sandbox,
and not available on each machine) so the workaround is that the GPU
optimization (which is aware of cuda types) passes to the
constructor of this class a function that is able to construct
CudaNdarray. This way the class Scan does not need to be aware of
CudaNdarray, it just constructs any float32 tensor using this
function (which by default constructs normal tensors). Note that the
second assumption in this code is that any float32 output or input
will be moved on the GPU if the optimization gets applied (following
Theano's philosophy of moving as much as possible on gpu).
        :param typeConstructor: function that constructs an equivalent
            of a Theano TensorType
        Note: ``typeConstructor`` was added to refactor how
        Theano deals with the GPU. If it runs on the GPU, scan needs
        to construct certain outputs (those that reside in the GPU
        memory) as the GPU-specific type. However we cannot import
gpu code in this file (as it is in sandbox, and not available
on each machine) so the workaround is that the GPU
optimization passes to the constructor of this class a
function that is able to construct a GPU type. This way the
class Scan does not need to be aware of the details for the
GPU, it just constructs any tensor using this function (which
by default constructs normal tensors).
"""
if 'gpua' not in info:
info['gpua'] = False
# adding properties into self
self.inputs = inputs
self.outputs = outputs
......@@ -95,23 +96,10 @@ class Scan(PureOp):
            # Note that for mit_mot there are several output slices per
            # output sequence
o = outputs[idx]
# Scan assumes that only variables of dtype float32 might need a
# special constructor (i.e. the CudaNdarray constructor) when the
# code is running on the GPU, as it is so far the only dtype
# Theano supports there. Therefore we use the passed type
# constructor ``typeConstructor`` only for dtype float32; for
# anything else we know that even if we run on the GPU we still
# construct normal Theano tensors.
if o.type.dtype in ['float32']:
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
else:
self.output_types.append(
tensorConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
idx += len(self.mit_mot_out_slices[jdx])
jdx += 1
@@ -120,23 +108,11 @@ class Scan(PureOp):
end = idx + self.n_mit_sot + self.n_sit_sot + self.n_nit_sot
for o in outputs[idx:end]:
# Scan assumes that only variables of dtype float32 might need a
# special constructor (i.e. the CudaNdarray constructor) when the
# code is running on the GPU, as it is so far the only dtype
# Theano supports there. Therefore we use the passed type
# constructor ``typeConstructor`` only for dtype float32; for
# anything else we know that even if we run on the GPU we still
# construct normal Theano tensors.
if o.type.dtype in ['float32']:
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
else:
self.output_types.append(
tensorConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
self.output_types.append(
typeConstructor(
broadcastable=(False,) + o.type.broadcastable,
dtype=o.type.dtype))
# shared outputs + possibly the ending condition
for o in outputs[end:]:
self.output_types.append(o.type)
@@ -182,14 +158,14 @@ class Scan(PureOp):
self.n_shared_outs)
self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
self.n_tap_outs = self.n_mit_mot + self.n_mit_sot
if not self.info['gpu']:
if self.info['gpu'] or self.info['gpua']:
self._hash_inner_graph = self.info['gpu_hash']
else:
tmp_in, tmp_out = scan_utils.reconstruct_graph(self.inputs,
self.outputs)
local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False)
self._cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
self._hash_inner_graph = hash(self._cmodule_key)
else:
self._hash_inner_graph = self.info['gpu_hash']
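The branch above selects where the inner-graph hash comes from: on either GPU backend the optimization has already stored a precomputed hash, while on the CPU one is derived from the inner graph's C-module key. A hedged sketch of that selection, where ``compute_cmodule_hash`` is a hypothetical stand-in for the real ``gof.CLinker().cmodule_key_`` path:

```python
# Illustrative sketch of the inner-graph hash selection above.
# `info` mirrors the Scan `info` dict; `compute_cmodule_hash` is a
# hypothetical stand-in for rebuilding the inner graph and hashing
# its C-module key.

def compute_cmodule_hash(inputs, outputs):
    # Stand-in for gof.CLinker().cmodule_key_(FunctionGraph(...), []).
    return hash((tuple(inputs), tuple(outputs)))

def inner_graph_hash(info, inputs, outputs):
    # On either GPU backend (old 'gpu' or new 'gpua') the optimization
    # has already stored a hash; otherwise derive one on the CPU path.
    if info.get('gpu') or info.get('gpua'):
        return info['gpu_hash']
    return compute_cmodule_hash(inputs, outputs)

h = inner_graph_hash({'gpu': False, 'gpua': False}, ('x',), ('y',))
```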
def make_node(self, *inputs):
"""