Commit d560a2fa authored by Laurent Dinh

Merge branch 'master' of git://github.com/Theano/Theano into scan

.. _install:
...@@ -130,20 +129,11 @@ by typing
You may need to add ``sudo`` before this command to install into your
system's ``site-packages`` directory. If you do not have administrator access
-to your machine, you can install to an alternate prefix using
+to your machine, you can install Theano locally (to ~/.local) using

.. code-block:: bash

-    pip install Theano --install-option='--prefix=~/.local'
+    pip install Theano --user

-e.g. using ``--install-option='--prefix=~/.local'`` on Python 2.4 would
-install Theano into ``.local/lib/python2.4/site-packages`` inside your home
-directory on Mac OS X or Unix/Linux (this ``site-packages`` directory must be
-listed in your ``PYTHONPATH`` environment variable; for Python 2.6 and later,
-``~/.local`` is automatically searched and does *not* need to be explicitly
-included in ``PYTHONPATH``, see :ref:`config_pythonpath` for instructions).
-You can change ``~/.local``, but you need to change your ``PYTHONPATH`` as said above.
Alternatively you can use virtualenv_ to create an isolated ``site-packages``
directory; see the `virtualenv documentation`_ for details.
...@@ -225,7 +215,7 @@ or (if you want to install it for the current user only):

.. code-block:: bash

-    pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git --install-option='--prefix=~/.local'
+    pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git --user
The following are general instructions that will set you up with the
bleeding-edge version of Theano and allow you to hack it. First,
......
...@@ -607,6 +607,27 @@ dimensions, see :meth:`_tensor_py_operators.dimshuffle`.

have shape (2, 60).
+.. function:: tile(x, reps, ndim=None)
+
+   Construct an array by repeating the input `x` according to the `reps`
+   pattern.
+
+   Tiles its input according to `reps`. The length of `reps` is the
+   number of dimensions of `x` and contains the number of times to
+   tile `x` in each dimension.
+
+   :see: `numpy.tile
+         <http://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html>`_
+         documentation for examples.
+
+   :see: :func:`theano.tensor.extra_ops.repeat
+         <theano.tensor.extra_ops.repeat>`
+
+   :note: Currently, `reps` must be a constant, `x.ndim` and
+          `len(reps)` must be equal and, if specified, `ndim` must be
+          equal to both.
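Since the docs above defer to `numpy.tile` for semantics, the `reps` behaviour can be checked directly against NumPy; a minimal sketch (NumPy only, not exercising Theano itself):

```python
import numpy as np

x = np.arange(6).reshape(2, 3)

# reps has one entry per dimension of x: tile twice along rows,
# three times along columns.
tiled = np.tile(x, (2, 3))

# Each dimension of the result is the original size times the
# corresponding reps entry.
assert tiled.shape == (2 * 2, 3 * 3)

# The top-left block of the result is the original array.
assert (tiled[:2, :3] == x).all()
```

The constraint noted above (`len(reps) == x.ndim`) matches this usage: one repetition count per dimension.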
Creating Tensor
===============

...@@ -1542,6 +1563,86 @@ Gradient / Differentiation

   :rtype: variable or list of variables (matching `wrt`)
   :returns: gradients of the cost with respect to each of the `wrt` terms
+.. function:: subgraph_grad(wrt, end, start=None, cost=None, details=False)
+
+   With respect to `wrt`, computes gradients of cost and/or from existing
+   `start` gradients, up to the `end` variables of a symbolic digraph.
+   In other words, computes gradients for a subgraph of the symbolic
+   theano function. Ignores all disconnected inputs.
+
+   This can be useful when one needs to perform gradient descent
+   iteratively (e.g. one layer at a time in an MLP), or when a particular
+   operation is not differentiable in theano (e.g. stochastic sampling
+   from a multinomial). In the latter case, the gradient of the
+   non-differentiable process could be approximated by a user-defined
+   formula, which could be calculated using the gradients of a cost
+   with respect to samples (0s and 1s). These gradients are obtained
+   by performing a subgraph_grad from the `cost` or previously known
+   gradients (`start`) up to the outputs of the stochastic process
+   (`end`). A dictionary mapping gradients obtained from the
+   user-defined differentiation of the process, to variables, could
+   then be fed into another subgraph_grad as `start` with any other
+   `cost` (e.g. weight decay).
+   In an MLP, we could use subgraph_grad to iteratively backpropagate:
+
+   >>> import numpy as np
+   >>> import theano
+   >>> x, t = theano.tensor.fvector('x'), theano.tensor.fvector('t')
+   >>> w1 = theano.shared(np.random.randn(3, 4))
+   >>> w2 = theano.shared(np.random.randn(4, 2))
+   >>> a1 = theano.tensor.tanh(theano.tensor.dot(x, w1))
+   >>> a2 = theano.tensor.tanh(theano.tensor.dot(a1, w2))
+   >>> cost2 = theano.tensor.sqr(a2 - t).sum()
+   >>> cost2 += theano.tensor.sqr(w2.sum())
+   >>> cost1 = theano.tensor.sqr(w1.sum())
+   >>> params = [[w2], [w1]]
+   >>> costs = [cost2, cost1]
+   >>> grad_ends = [[a1], [x]]
+   >>> next_grad = None
+   >>> param_grads = []
+   >>> for i in xrange(2):
+   ...     param_grad, next_grad = theano.subgraph_grad(
+   ...         wrt=params[i], end=grad_ends[i],
+   ...         start=next_grad, cost=costs[i]
+   ...     )
+   ...     next_grad = dict(zip(grad_ends[i], next_grad))
+   ...     param_grads.extend(param_grad)
+   :type wrt: list of Variables
+   :param wrt: Gradients are computed with respect to `wrt`.
+
+   :type end: list of Variables
+   :param end: Theano variables at which to end gradient descent
+       (they are considered constant in theano.grad).
+       For convenience, the gradients with respect to these variables
+       are also returned.
+
+   :type start: dictionary of Variables
+   :param start: If not None, a dictionary mapping variables to
+       their gradients. This is useful when the gradients on some
+       variables are known. These are used to compute the gradients
+       backwards up to the variables in `end`
+       (they are used as known_grads in theano.grad).
+
+   :type cost: scalar (0-dimensional) Variable
+   :param cost:
+       Additional costs for which to compute the gradients.
+       For example, these could be weight decay, an l1 constraint,
+       MSE, NLL, etc. May optionally be None if `start` is provided.
+
+       .. warning:: If the gradients of `cost` with respect to any
+           of the `start` variables are already part of the `start`
+           dictionary, then they may be counted twice with respect
+           to `wrt` and `end`.
+
+   :type details: bool
+   :param details: When True, additionally returns the
+       list of gradients from `start` and of `cost`, respectively,
+       with respect to `wrt` (not `end`).
+
+   :rtype: tuple of 2 or 4 lists of Variables
+   :return: Returns lists of gradients with respect to `wrt` and `end`,
+       respectively.
.. _R_op_list:
......
...@@ -79,7 +79,7 @@ from theano.updates import Updates, OrderedUpdates

#we don't import by default as we don't want to force having scipy installed.
#import sparse

-from theano.gradient import Rop, Lop, grad
+from theano.gradient import Rop, Lop, grad, subgraph_grad

if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
    import theano.sandbox.cuda
......
+import theano
from theano import gof
-from theano import gradient as G
from theano.compile.function_module import orig_function
from theano.compile import SharedVariable, rebuild_collect_shared
from theano.gof import ops_with_inner_function
...@@ -142,7 +142,7 @@ class OpFromGraph(gof.Op):

        if hasattr(self, "grad_ops"):
            grad_ops = self.grad_ops
        else:
-            gs = G.grad(cost=None,
+            gs = theano.gradient.grad(cost=None,
                 known_grads=dict(zip(self.new_outputs, output_grads)),
                 wrt=self.new_inputs,
                 disconnected_inputs='ignore')
......
...@@ -57,7 +57,10 @@ from theano.gof.link import \

from theano.gof.op import \
    Op, OpenMPOp, PureOp, ops_with_inner_function
-from theano.gof.opt import (Optimizer, optimizer, SeqOptimizer,
+from theano.gof.opt import (
+    Optimizer,
+    optimizer, inplace_optimizer,
+    SeqOptimizer,
    MergeOptimizer, MergeOptMerge,
    LocalOptimizer, local_optimizer, LocalOptGroup,
    OpSub, OpRemove, PatternSub,
......
...@@ -165,8 +165,12 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):

    my_pid = os.getpid()
    no_display = (verbosity == 0)

-    # Acquire lock.
    nb_error = 0
+    # The number of times we have slept when there were no errors.
+    # Used to skip the message the first time, so that it is displayed
+    # less frequently (and so generates less email about it).
+    nb_wait = 0
+    # Acquire lock.
    while True:
        try:
            last_owner = 'no_owner'
...@@ -214,7 +218,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):

                    last_owner = read_owner
                    time_start = time.time()
                no_display = (verbosity == 0)
-            if not no_display:
+            if not no_display and nb_wait > 0:
                if read_owner == 'failure':
                    msg = 'unknown process'
                else:
...@@ -225,6 +229,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):

                            tmp_dir)
                if verbosity <= 1:
                    no_display = True
+            nb_wait += 1
            time.sleep(random.uniform(min_wait, max_wait))
        try:
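The `nb_wait` counter added above suppresses the very first "waiting for the lock" message, so short waits stay quiet. A condensed sketch of that loop logic; `try_acquire` and `messages` are hypothetical stand-ins for the real lock probe and console output, and the sleep is elided:

```python
def wait_for_lock(try_acquire, messages=None):
    """Retry until try_acquire() succeeds; skip the message on the
    first wait (nb_wait == 0), mirroring the change in this hunk."""
    if messages is None:
        messages = []
    nb_wait = 0
    while True:
        if try_acquire():
            return nb_wait
        if nb_wait > 0:  # mirrors: if not no_display and nb_wait > 0
            messages.append('waiting for the lock')
        nb_wait += 1
        # real code: time.sleep(random.uniform(min_wait, max_wait))

# A lock that frees up on the third attempt: two waits, one message.
attempts = iter([False, False, True])
msgs = []
waits = wait_for_lock(lambda: next(attempts), messages=msgs)
assert waits == 2 and len(msgs) == 1
```

A process that gets the lock immediately, or after a single short wait, therefore prints nothing.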
......
...@@ -114,13 +114,13 @@ class Optimizer(object):

class FromFunctionOptimizer(Optimizer):
    """WRITEME"""
-    def __init__(self, fn):
+    def __init__(self, fn, requirements=()):
        self.apply = fn
+        self.requirements = requirements

    def add_requirements(self, fgraph):
-        # Added by default
-        #fgraph.attach_feature(toolbox.ReplaceValidate())
-        pass
+        for req in self.requirements:
+            req(fgraph)

    def print_summary(self, stream=sys.stdout, level=0, depth=-1):
        print >> stream, "%s%s id=%i" % (
...@@ -142,6 +142,16 @@ def optimizer(f):

    return rval


+def inplace_optimizer(f):
+    """decorator for FromFunctionOptimizer"""
+    dh_handler = dh.DestroyHandler
+    requirements = (lambda fgraph:
+                    fgraph.attach_feature(dh_handler()),)
+    rval = FromFunctionOptimizer(f, requirements)
+    rval.__name__ = f.__name__
+    return rval


class SeqOptimizer(Optimizer, list):
    #inherit from Optimizer first to get Optimizer.__hash__
    """WRITEME
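The `requirements` mechanism added in this hunk can be illustrated without Theano: a requirement is just a callable that attaches a feature (such as a `DestroyHandler`) to the graph before the optimizer runs. A minimal stand-in sketch; every class below is an illustrative stub, not Theano's implementation:

```python
class FakeFGraph:
    """Stand-in for a Theano FunctionGraph: just records features."""
    def __init__(self):
        self.features = []

    def attach_feature(self, feature):
        self.features.append(feature)


class DestroyHandlerStub:
    """Stand-in for theano.gof.destroyhandler.DestroyHandler."""


class FromFunctionOptimizerSketch:
    # Mirrors the new constructor: requirements is a tuple of callables.
    def __init__(self, fn, requirements=()):
        self.apply = fn
        self.requirements = requirements

    def add_requirements(self, fgraph):
        # Each requirement attaches whatever feature it needs.
        for req in self.requirements:
            req(fgraph)


def inplace_optimizer_sketch(f):
    """Mirrors inplace_optimizer: requires a DestroyHandler on the graph."""
    requirements = (lambda fgraph:
                    fgraph.attach_feature(DestroyHandlerStub()),)
    rval = FromFunctionOptimizerSketch(f, requirements)
    rval.__name__ = f.__name__
    return rval


opt = inplace_optimizer_sketch(lambda fgraph: None)
fg = FakeFGraph()
opt.add_requirements(fg)
assert isinstance(fg.features[0], DestroyHandlerStub)
```

This is why in-place optimizers declare themselves: the DestroyHandler feature must be on the graph to validate destructive replacements.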
...@@ -790,9 +800,14 @@ class LocalOptimizer(object):

class FromFunctionLocalOptimizer(LocalOptimizer):
    """WRITEME"""
-    def __init__(self, fn, tracks=None):
+    def __init__(self, fn, tracks=None, requirements=()):
        self.transform = fn
        self._tracks = tracks
+        self.requirements = requirements
+
+    def add_requirements(self, fgraph):
+        for req in self.requirements:
+            req(fgraph)

    def tracks(self):
        return self._tracks
...@@ -808,7 +823,7 @@ class FromFunctionLocalOptimizer(LocalOptimizer):

            id(self))


-def local_optimizer(tracks):
+def local_optimizer(tracks, inplace=False):
    def decorator(f):
        """WRITEME"""
        if tracks is not None:
...@@ -817,7 +832,12 @@ def local_optimizer(tracks):

            for t in tracks:
                if not (isinstance(t, op.Op) or issubclass(t, op.PureOp)):
                    raise ValueError, ("Tracks are op classes or instances", f.__module__, f.__name__)
-        rval = FromFunctionLocalOptimizer(f, tracks)
+        requirements = ()
+        if inplace:
+            dh_handler = dh.DestroyHandler
+            requirements = (lambda fgraph:
+                            fgraph.attach_feature(dh_handler()),)
+        rval = FromFunctionLocalOptimizer(f, tracks, requirements)
        rval.__name__ = f.__name__
        return rval
    return decorator
...@@ -852,6 +872,10 @@ class LocalOptGroup(LocalOptimizer):

        for lopt in self.opts:
            lopt.print_summary(stream, level=(level + 2), depth=depth)

+    def add_requirements(self, fgraph):
+        for opt in self.opts:
+            opt.add_requirements(fgraph)

class _LocalOpKeyOptGroup(LocalOptGroup):
    """WRITEME"""
......
...@@ -23,6 +23,7 @@ from theano.gof import Variable

from theano.gof.python25 import OrderedDict
from theano.gof.null_type import NullType
from theano.gof.op import get_debug_values
+from theano.compile import ViewOp

# we can't do "import theano.tensor"
# tensor depends on theano.compile

...@@ -543,6 +544,109 @@ def grad(cost, wrt, consider_constant=None,

    rval, = rval
    return rval
+def subgraph_grad(wrt, end, start=None, cost=None, details=False):
+    '''
+    With respect to `wrt`, computes gradients of cost and/or from
+    existing `start` gradients, up to the `end` variables of a
+    symbolic digraph. In other words, computes gradients for a
+    subgraph of the symbolic theano function. Ignores all disconnected
+    inputs.
+
+    This can be useful when one needs to perform gradient descent
+    iteratively (e.g. one layer at a time in an MLP), or when a
+    particular operation is not differentiable in theano
+    (e.g. stochastic sampling from a multinomial). In the latter case,
+    the gradient of the non-differentiable process could be
+    approximated by a user-defined formula, which could be calculated
+    using the gradients of a cost with respect to samples (0s and 1s).
+    These gradients are obtained by performing a subgraph_grad from
+    the `cost` or previously known gradients (`start`) up to the
+    outputs of the stochastic process (`end`). A dictionary mapping
+    gradients obtained from the user-defined differentiation of the
+    process, to variables, could then be fed into another
+    subgraph_grad as `start` with any other `cost` (e.g. weight decay).
+
+    :type wrt: list of Variables
+    :param wrt: Gradients are computed with respect to `wrt`.
+
+    :type end: list of Variables
+    :param end: Theano variables at which to end gradient descent
+        (they are considered constant in theano.grad). For
+        convenience, the gradients with respect to these variables
+        are also returned.
+
+    :type start: dictionary of Variables
+    :param start: If not None, a dictionary mapping variables to their
+        gradients. This is useful when the gradients on some variables
+        are known. These are used to compute the gradients backwards
+        up to the variables in `end` (they are used as known_grads in
+        theano.grad).
+
+    :type cost: scalar (0-dimensional) Variable
+    :param cost:
+        Additional costs for which to compute the gradients.
+        For example, these could be weight decay, an l1 constraint,
+        MSE, NLL, etc. May optionally be None if `start` is provided.
+
+        .. warning:: If the gradients of `cost` with respect to any of
+            the `start` variables are already part of the `start`
+            dictionary, then they may be counted twice with respect to
+            `wrt` and `end`.
+
+    :type details: bool
+    :param details: When True, additionally returns the list of
+        gradients from `start` and of `cost`, respectively, with
+        respect to `wrt` (not `end`).
+
+    :rtype: tuple of 2 or 4 lists of Variables
+    :return: Returns lists of gradients with respect to `wrt` and
+        `end`, respectively.
+    '''
+    assert ((cost is not None) or (start is not None))
+    assert isinstance(end, list)
+    assert isinstance(wrt, list)
+    if start is not None:
+        assert isinstance(start, dict)
+
+    params = list(set(wrt + end))
+
+    start_grads = None
+    cost_grads = None
+    if start is not None:
+        start_grads = list(
+            theano.grad(
+                cost=None, wrt=params, known_grads=start,
+                consider_constant=end,
+                disconnected_inputs='ignore'
+            )
+        )
+
+    if cost is not None:
+        cost_grads = list(
+            theano.grad(
+                cost=cost, wrt=params,
+                consider_constant=end,
+                disconnected_inputs='ignore'
+            )
+        )
+
+    grads = None
+    if start is None:
+        grads = cost_grads
+    else:
+        grads = start_grads
+        if cost_grads is not None:
+            for i in range(len(grads)):
+                grads[i] += cost_grads[i]
+
+    pgrads = OrderedDict(zip(params, grads))
+    # separate wrt from end grads:
+    wrt_grads = list(pgrads[k] for k in wrt)
+    end_grads = list(pgrads[k] for k in end)
+
+    if details:
+        return wrt_grads, end_grads, start_grads, cost_grads
+
+    return wrt_grads, end_grads
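The two-stage computation that `subgraph_grad` enables can be sanity-checked with a scalar chain rule in plain Python (no Theano): differentiating a composed cost first up to an intermediate variable, then propagating that known gradient down to the input, must agree with a one-pass derivative. A numeric sketch:

```python
# cost = h(g(x)) = (2*x + 1) ** 2, with intermediate a = g(x) = 2*x + 1
def g(x):      # first "layer"
    return 2.0 * x + 1.0

def h(a):      # second "layer" / cost
    return a ** 2

x = 3.0
a = g(x)

# Stage 1: gradient of the cost with respect to the intermediate `a`
# (this is what subgraph_grad returns for its `end` variables).
dcost_da = 2.0 * a

# Stage 2: propagate the known gradient through the first stage
# (this is the role of the `start` dictionary on the next call).
dcost_dx_two_stage = dcost_da * 2.0   # da/dx == 2

# One-pass derivative of the full composition: d/dx (2x+1)^2 = 4*(2x+1)
dcost_dx_direct = 4.0 * (2.0 * x + 1.0)

assert dcost_dx_two_stage == dcost_dx_direct == 28.0
```

The `end` variables play the role of `a` here: constant within one stage, but carrying the known gradient into the next stage.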
def _node_to_pattern(node):
    """ given an apply node, obtain its connection pattern

...@@ -1685,3 +1789,29 @@ def _is_zero(x):

        return 'no'
    return 'yes'
+class ConsiderConstant(ViewOp):
+    def grad(self, args, g_outs):
+        return [g_out.zeros_like(g_out) for g_out in g_outs]
+
+consider_constant_ = ConsiderConstant()
+
+# We define a wrapper function only so that the documentation renders well.
+def consider_constant(x):
+    """Consider an expression constant when computing gradients.
+
+    The expression itself is unaffected, but when its gradient is
+    computed, or the gradient of another expression that this
+    expression is a subexpression of, it will not be backpropagated
+    through. In other words, the gradient of the expression is
+    truncated to 0.
+
+    :param x: A Theano expression whose gradient should be truncated.
+
+    :return: The expression is returned unmodified, but its gradient
+        is now truncated to 0.
+
+    .. versionadded:: 0.6.1
+    """
+    return consider_constant_(x)
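The behaviour documented above (forward value unchanged, gradient cut to zero) can be illustrated numerically for f(x) = x * s(x), where `s` is either the identity or a copy of `x` wrapped in `consider_constant`. The function below is a hand-coded sketch of the two derivative rules, not Theano code; `stop` stands in for the wrapping:

```python
def value_and_grad(x, stop):
    """Forward value and d/dx of f(x) = x * s(x), where s(x) is either
    x itself (stop=False) or a gradient-truncated copy of x (stop=True)."""
    value = x * x           # the forward pass is identical in both cases
    if stop:
        grad = x            # s(x) treated as a constant: d/dx = s(x) = x
    else:
        grad = 2.0 * x      # ordinary product rule on x * x
    return value, grad

v1, g1 = value_and_grad(3.0, stop=False)
v2, g2 = value_and_grad(3.0, stop=True)
assert v1 == v2 == 9.0          # the expression itself is unaffected
assert g1 == 6.0 and g2 == 3.0  # but backprop through s(x) is cut
```

This is exactly what the zero gradient returned by `ConsiderConstant.grad` produces: the truncated branch contributes nothing to the total derivative.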
...@@ -1198,7 +1198,11 @@ class GpuCAReduce(GpuOp):

                    n_threads.z += 1;
                else
                    break;
-            }""" % locals()
+            }
+            //Maximum for Fermi GPU on that dimension.
+            n_threads.z = std::min(n_threads.z, (unsigned)64);
+            """ % locals()
        if len(self.reduce_mask) == 2:
            threads_y = ''

...@@ -1509,6 +1513,8 @@ class GpuCAReduce(GpuOp):

                n_threads.z += 1;
            }
            n_threads.z -= 1;
+            //Maximum for Fermi GPU on that dimension.
+            n_threads.z = std::min(n_threads.z, (unsigned)64);

            dim3 n_blocks(1,1,1);
            %(makecall)s
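The clamp added in these hunks reflects the CUDA limit on the z dimension of a thread block: 64 on Fermi-class GPUs (compute capability 2.x), versus 1024 for x and y. A sketch of the block-shape logic in Python; `pick_n_threads_z` is a hypothetical helper, not part of Theano:

```python
def pick_n_threads_z(depth, max_threads_dim_z=64):
    """Grow n_threads.z toward `depth`, but never past the hardware
    limit on blockDim.z (64 on compute capability 2.x)."""
    n_threads_z = 1
    while n_threads_z < depth:
        n_threads_z += 1
    # Mirrors: n_threads.z = std::min(n_threads.z, (unsigned)64);
    return min(n_threads_z, max_threads_dim_z)

assert pick_n_threads_z(3) == 3      # small reductions are unaffected
assert pick_n_threads_z(4100) == 64  # large ones are clamped
```

Without the clamp, a reduction over a large z extent would request an invalid launch configuration, which is why the cache version is bumped below.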
...@@ -1605,7 +1611,7 @@ class GpuCAReduce(GpuOp):

        """ % locals()

    def c_code_cache_version_apply(self, node):
-        version = [8]  # the version corresponding to the c code in this Op
+        version = [9]  # the version corresponding to the c code in this Op
        # now we insert versions for the ops on which we depend...
        scalar_node = Apply(self.scalar_op,
......
...@@ -1214,19 +1214,19 @@ def local_gpujoin_1(node):

# shared = dimshuffle(gemm_inplace(dimshuffle(shared)))
# which causes memory leaks (long term fix is to make the above not leak
# memory)
-@local_optimizer([gpu_gemm_no_inplace])
+@local_optimizer([gpu_gemm_no_inplace], inplace=True)
def local_inplace_gemm(node):
    if node.op == gpu_gemm_no_inplace:
        return [gpu_gemm_inplace(*node.inputs)]

-@local_optimizer([gpu_gemv_no_inplace])
+@local_optimizer([gpu_gemv_no_inplace], inplace=True)
def local_inplace_gemv(node):
    if node.op == gpu_gemv_no_inplace:
        return [gpu_gemv_inplace(*node.inputs)]

-@local_optimizer([gpu_ger_no_inplace])
+@local_optimizer([gpu_ger_no_inplace], inplace=True)
def local_inplace_ger(node):
    if node.op == gpu_ger_no_inplace:
        return [gpu_ger_inplace(*node.inputs)]
......
...@@ -109,11 +109,13 @@ def test_careduce():

    ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
    #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
    ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
+   ((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111
    ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
    ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
    ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
    ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
+   ((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
    ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3),[0,1,2,3]),#1111
......
...@@ -200,13 +200,13 @@ from theano.gof import local_optimizer, LocalOptGroup

from theano.tensor.opt import in2out

-@local_optimizer([gpugemv_no_inplace])
+@local_optimizer([gpugemv_no_inplace], inplace=True)
def local_inplace_gpuagemv(node):
    if node.op == gpugemv_no_inplace:
        return [gpugemv_inplace(*node.inputs)]

-@local_optimizer([gpugemm_no_inplace])
+@local_optimizer([gpugemm_no_inplace], inplace=True)
def local_inplace_gpuagemm(node):
    if node.op == gpugemm_no_inplace:
        return [gpugemm_inplace(*node.inputs)]
......
...@@ -1281,7 +1281,10 @@ class GpuCAReduceCuda(HideC, CAReduce):

                    n_threads.z += 1;
                else
                    break;
-            }""" % locals()
+            }
+            //Maximum for Fermi GPU on that dimension.
+            n_threads.z = std::min(n_threads.z, (unsigned)64);
+            """ % locals()
        if len(self.reduce_mask) == 2:
            threads_y = ''

...@@ -1601,6 +1604,8 @@ class GpuCAReduceCuda(HideC, CAReduce):

                n_threads.z += 1;
            }
            n_threads.z -= 1;
+            //Maximum for Fermi GPU on that dimension.
+            n_threads.z = std::min(n_threads.z, (unsigned)64);

            dim3 n_blocks(1,1,1);
            %(makecall)s
...@@ -1697,7 +1702,7 @@ class GpuCAReduceCuda(HideC, CAReduce):

        """ % locals()

    def c_code_cache_version_apply(self, node):
-        version = [8]  # the version corresponding to the c code in this Op
+        version = [9]  # the version corresponding to the c code in this Op
        # now we insert versions for the ops on which we depend...
        scalar_node = Apply(self.scalar_op,
......
...@@ -341,17 +341,20 @@ def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):

@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
    return GpuCrossentropySoftmax1HotWithBiasDx()

@register_opt()
@op_lifter([tensor.nnet.Softmax])
def local_gpua_softmax(node):
    return GpuSoftmax()

@register_opt()
@op_lifter([tensor.nnet.SoftmaxWithBias])
def local_gpua_softmaxwithbias(node):
    return GpuSoftmaxWithBias()

@register_opt()
@op_lifter([gpu_from_host, ConvOp])
def local_gpu_conv(node):
......
-import unittest
from theano import scalar, gof
-from theano.gof import FunctionGraph
from theano.gof.python25 import all, any
-from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
                                               test_CAReduce)
...@@ -126,11 +122,13 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):

    ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
    #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
    ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
+   ((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111
    ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
    ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
    ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
    ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
+   ((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
    ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3),[0,1,2,3]),#1111
    #test pattern implemented by reshape
......
...@@ -26,4 +26,6 @@ class G_subtensor(T_subtensor):

            dtype='float32',
            ignore_topo=(HostFromGpu, GpuFromHost,
                         DeepCopyOp))
+        # The GPU opt is not applied in fast_compile mode.
+        self.fast_compile = False
        assert self.sub == GpuSubtensor
...@@ -28,16 +28,8 @@ if cuda_available:

def matVecModM(A, s, m):
-    # return (A * s) % m
-    x = numpy.zeros_like(s)
-    for i in xrange(len(x)):
-        for j in xrange(len(s)):
-            r = numpy.int32((numpy.int64(A[i][j]) * s[j] + x[i]) % m)
-            if r >= 0:
-                x[i] = r
-            else:
-                x[i] = r + m
-    return x
+    assert A.dtype == 'int64'
+    return numpy.int32(numpy.sum((A*s) % m, 1) % m)
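The rewritten `matVecModM` replaces the explicit double loop with a vectorized expression. Both compute the matrix-vector product reduced mod `m` with a non-negative result, and reducing each product mod `m` before the row sum keeps every intermediate well inside int64 range. A sketch comparing a loop version (modeled on the removed code) with the vectorized one:

```python
import numpy

def mat_vec_mod_m_loop(A, s, m):
    # Old-style version: accumulate (A[i][j] * s[j]) mod m element by element.
    x = numpy.zeros_like(s)
    for i in range(len(x)):
        for j in range(len(s)):
            r = numpy.int32((numpy.int64(A[i][j]) * s[j] + x[i]) % m)
            x[i] = r if r >= 0 else r + m
    return x

def mat_vec_mod_m_vectorized(A, s, m):
    # New version: reduce each product mod m, sum along rows, reduce again.
    assert A.dtype == 'int64'
    return numpy.int32(numpy.sum((A * s) % m, 1) % m)

m = numpy.int32(2147483647)          # 2**31 - 1, the modulus used by MRG31k3p
A = numpy.asarray([[1516919229, 758510237, 499121365],
                   [1884998244, 1516919229, 335398200],
                   [601897748, 1884998244, 358115744]], dtype='int64')
s = numpy.asarray([123456789, 987654321, 192837465], dtype='int32')

assert (mat_vec_mod_m_loop(A, s, m) == mat_vec_mod_m_vectorized(A, s, m)).all()
```

This is also why the matrices below gain an explicit `dtype='int64'`: the products `A[i][j] * s[j]` can reach roughly `2**62` and must not be computed at int32 precision.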
def multMatVect(v, A, m1, B, m2):

...@@ -63,24 +55,30 @@ MASK2 = numpy.int32(65535) #2^16 - 1

MULT2 = numpy.int32(21069)
NORM = 4.656612873077392578125e-10; #1./2^31

-A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]])
-A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]])
+#A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]],
+#                     dtype='int64')
+#A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]],
+#                     dtype='int64')

A1p72 = numpy.asarray([[1516919229, 758510237, 499121365],
                       [1884998244, 1516919229, 335398200],
-                      [601897748, 1884998244, 358115744]])
+                      [601897748, 1884998244, 358115744]],
+                      dtype='int64')
A2p72 = numpy.asarray([[1228857673, 1496414766, 954677935],
                       [1133297478, 1407477216, 1496414766],
-                      [2002613992, 1639496704, 1407477216]])
+                      [2002613992, 1639496704, 1407477216]],
+                      dtype='int64')
A1p134 = numpy.asarray(
    [[1702500920, 1849582496, 1656874625],
     [828554832, 1702500920, 1512419905],
-     [1143731069, 828554832, 102237247]])
+     [1143731069, 828554832, 102237247]],
+    dtype='int64')
A2p134 = numpy.asarray(
    [[796789021, 1464208080, 607337906],
     [1241679051, 1431130166, 1464208080],
-     [1401213391, 1178684362, 1431130166]])
+     [1401213391, 1178684362, 1431130166]],
+    dtype='int64')

np_int32_vals = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
......
...@@ -58,7 +58,7 @@ def shared(*args, **kw): ...@@ -58,7 +58,7 @@ def shared(*args, **kw):
from theano.tensor import nnet # used for softmax, sigmoid, etc. from theano.tensor import nnet # used for softmax, sigmoid, etc.
from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \
    jacobian, hessian, consider_constant
from theano.tensor.sort import sort, argsort
from theano.tensor.extra_ops import (DiffOp, bincount, squeeze,
@@ -1715,20 +1715,19 @@ def local_dot_to_dot22(node):
        _logger.info('Not optimizing dot with inputs %s %s %s %s',
                     x, y, x.type, y.type)
@local_optimizer([gemm_no_inplace], inplace=True)
def local_inplace_gemm(node):
    if node.op == gemm_no_inplace:
        return [gemm_inplace(*node.inputs)]
@local_optimizer([gemv_no_inplace], inplace=True)
def local_inplace_gemv(node):
    if node.op == gemv_no_inplace:
        return [gemv_inplace(*node.inputs)]
@local_optimizer([ger], inplace=True)
def local_inplace_ger(node):
    if node.op == ger:
        return [ger_destructive(*node.inputs)]
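The three rewrites above share one pattern: match a node applying the non-destructive op and propose the destructive variant on the same inputs. A hypothetical, Theano-free sketch of that pattern (the `Node` class and string op tags here are illustrative stand-ins, not Theano's API):

```python
# Minimal stand-in for a graph node: an op tag plus its inputs.
class Node(object):
    def __init__(self, op, inputs):
        self.op = op
        self.inputs = inputs

def make_inplace_rewrite(safe_op, destructive_op):
    # Build a local rewrite: when a node applies safe_op, propose the
    # destructive variant on the same inputs; otherwise decline (None).
    def rewrite(node):
        if node.op == safe_op:
            return Node(destructive_op, node.inputs)
        return None
    return rewrite

local_inplace_gemm = make_inplace_rewrite('gemm_no_inplace', 'gemm_inplace')
n = Node('gemm_no_inplace', ['z', 'a', 'x', 'y', 'b'])
assert local_inplace_gemm(n).op == 'gemm_inplace'
assert local_inplace_gemm(Node('ger', [])) is None
```

The `inplace=True` flag being added in these decorators declares that the replacement op destroys one of its inputs, which is why such rewrites are registered among the late, destructive passes (the "priority 50" comments further down in this diff).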
@@ -571,6 +571,8 @@ def repeat(x, repeats, axis=None):
    :param axis: int, optional.

    :see: :func:`tensor.tile <tensor.tile>`

    .. versionadded:: 0.6
    """
    return RepeatOp(axis=axis)(x, repeats)
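The cross-reference being added to `tensor.tile` is worth having because the two ops are easy to confuse: `repeat` duplicates each element in place, while `tile` duplicates the whole array. `theano.tensor.repeat` follows `numpy.repeat` semantics, so plain NumPy illustrates the difference:

```python
import numpy as np

a = np.array([1, 2, 3])
# repeat duplicates each element in place...
assert np.repeat(a, 2).tolist() == [1, 1, 2, 2, 3, 3]
# ...while tile duplicates the array as a whole.
assert np.tile(a, 2).tolist() == [1, 2, 3, 1, 2, 3]
# repeats may also be per-element, along an axis: row 0 once, row 1 twice.
b = np.array([[1, 2], [3, 4]])
assert np.repeat(b, [1, 2], axis=0).tolist() == [[1, 2], [3, 4], [3, 4]]
```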
@@ -174,7 +174,7 @@ def inplace_elemwise_optimizer_op(OP):
""" """
We parametrise it to make it work for Elemwise and GpuElemwise op. We parametrise it to make it work for Elemwise and GpuElemwise op.
""" """
@gof.optimizer @gof.inplace_optimizer
def inplace_elemwise_optimizer(fgraph): def inplace_elemwise_optimizer(fgraph):
""" """
Usage: inplace_elemwise_optimizer.optimize(fgraph) Usage: inplace_elemwise_optimizer.optimize(fgraph)
@@ -2110,7 +2110,7 @@ compile.optdb.register('pre_local_IncSubtensor_serialize',
#after priority 50 Destructive inplace operations
#gemm is the first one now, at priority 70

@gof.local_optimizer([IncSubtensor], inplace=True)
def local_inplace_setsubtensor(node):
    """
    Also work for GpuIncSubtensor
@@ -2129,7 +2129,7 @@ compile.optdb.register('local_inplace_setsubtensor',
                       'fast_run', 'inplace')  # DEBUG

@gof.local_optimizer([AdvancedIncSubtensor1], inplace=True)
def local_inplace_incsubtensor1(node):
    """ also work for GpuAdvancedIncSubtensor1 """
    if isinstance(node.op, AdvancedIncSubtensor1) and not node.op.inplace:
@@ -4866,3 +4866,13 @@ else:
        FusionOptimizer(local_elemwise_fusion), 49,
        'fusion', 'local_elemwise_fusion',
        'FusionOptimizer')
# ############################
# # Remove consider_constant #
# ############################
# Although the op just returns its input, it should be removed from
# the graph to make sure all possible optimizations can be applied.
register_canonicalize(gof.OpRemove(theano.gradient.consider_constant_),
                      'fast_compile', name='remove_consider_constant')
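The comment above is the whole idea: `consider_constant_` behaves as an identity at run time, so once it has served its purpose for `grad`, canonicalization can delete it. A hypothetical sketch of such an identity-removal pass over a toy graph representation (none of the names or structures below are Theano's actual API):

```python
def remove_op(nodes, op_to_remove):
    # nodes: list of (output_name, op, input_names) in topological order.
    # Applications of op_to_remove are dropped, and their consumers are
    # rewired to read the removed op's single input directly.
    alias = {}
    kept = []
    for name, op, inputs in nodes:
        inputs = [alias.get(i, i) for i in inputs]
        if op == op_to_remove:
            alias[name] = inputs[0]  # the op just returns its input
        else:
            kept.append((name, op, inputs))
    return kept

graph = [('c', 'consider_constant', ['x']),
         ('y', 'mul', ['x', 'c'])]
assert remove_op(graph, 'consider_constant') == [('y', 'mul', ['x', 'x'])]
```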
import numpy as np
import numpy
import unittest
import theano
from theano.tests import unittest_tools as utt
@@ -164,7 +164,8 @@ class TensorType(Type):
" Theano C code does not support that.", " Theano C code does not support that.",
msg, msg,
"object shape", data.shape, "object shape", data.shape,
"object strides", data.strides) "object strides", data.strides,
"object dtype", data.dtype)
i = 0 i = 0
for b in self.broadcastable: for b in self.broadcastable:
@@ -5,6 +5,7 @@
import unittest
import theano
from theano import gof
from theano.tests import unittest_tools as utt
from theano import gradient
from theano.tensor.nnet.Conv3D import conv3D
@@ -553,6 +554,92 @@ def test_disconnected_cost_grad():
    except theano.gradient.DisconnectedInputError:
        return
    raise AssertionError("A disconnected gradient has been ignored.")
def test_subgraph_grad():
    # Tests that the grad method with no known_grads
    # matches what happens if you use successive subgraph_grads

    x = theano.tensor.fvector('x')
    t = theano.tensor.fvector('t')
    w1 = theano.shared(np.random.randn(3, 4))
    w2 = theano.shared(np.random.randn(4, 2))
    a1 = theano.tensor.tanh(theano.tensor.dot(x, w1))
    a2 = theano.tensor.tanh(theano.tensor.dot(a1, w2))
    cost2 = theano.tensor.sqr(a2 - t).sum()
    cost2 += theano.tensor.sqr(w2.sum())
    cost1 = theano.tensor.sqr(w1.sum())

    params = [[w2], [w1]]
    costs = [cost2, cost1]
    grad_ends = [[a1], [x]]

    inputs = [t, x]
    rng = np.random.RandomState([2012, 11, 15])
    values = [rng.randn(2), rng.randn(3)]
    values = [np.cast[ipt.dtype](value)
              for ipt, value in zip(inputs, values)]

    wrt = [w2, w1]
    cost = cost2 + cost1
    true_grads = theano.grad(cost, wrt)
    true_grads = theano.function(inputs, true_grads)
    true_grads = true_grads(*values)

    from theano.gof.python25 import OrderedDict
    next_grad = None
    param_grads = []
    for i in xrange(2):
        param_grad, next_grad = theano.subgraph_grad(
            wrt=params[i], end=grad_ends[i],
            start=next_grad, cost=costs[i]
        )
        next_grad = OrderedDict(zip(grad_ends[i], next_grad))
        param_grads.extend(param_grad)

    pgrads = theano.function(inputs, param_grads)
    pgrads = pgrads(*values)

    for true_grad, pgrad in zip(true_grads, pgrads):
        assert(np.sum(np.abs(true_grad - pgrad)) < 0.00001)
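The test above relies on the chain rule factoring cleanly at `a1` and `x`: computing the cost gradient with respect to the intermediate `a1` first, then pushing it through the earlier layer, must give the same `w1` gradient as a single end-to-end `grad` call. The same decomposition can be checked in plain NumPy against finite differences (a standalone sketch of the math, not of `subgraph_grad` itself):

```python
import numpy as np

rng = np.random.RandomState(0)
x, t = rng.randn(3), rng.randn(2)
w1, w2 = rng.randn(3, 4), rng.randn(4, 2)

def cost_fn(w1):
    a1 = np.tanh(x.dot(w1))
    a2 = np.tanh(a1.dot(w2))
    return np.sum((a2 - t) ** 2)

# Stage 1: gradient of the cost w.r.t. the intermediate a1.
a1 = np.tanh(x.dot(w1))
a2 = np.tanh(a1.dot(w2))
g_z2 = 2.0 * (a2 - t) * (1.0 - a2 ** 2)   # through squared error and tanh
g_a1 = g_z2.dot(w2.T)
# Stage 2: push g_a1 through the first layer to get d cost / d w1.
g_w1 = np.outer(x, g_a1 * (1.0 - a1 ** 2))

# The staged gradient matches central finite differences.
eps = 1e-6
for i in range(3):
    for j in range(4):
        wp, wm = w1.copy(), w1.copy()
        wp[i, j] += eps
        wm[i, j] -= eps
        num = (cost_fn(wp) - cost_fn(wm)) / (2 * eps)
        assert abs(g_w1[i, j] - num) < 1e-4
```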
class TestConsiderConstant(unittest.TestCase):
    def setUp(self):
        utt.seed_rng()
        self.rng = np.random.RandomState(seed=utt.fetch_seed())

    def test_op_removed(self):
        x = theano.tensor.matrix('x')
        y = x * gradient.consider_constant(x)
        f = theano.function([x], y)
        # need to refer to theano.gradient.consider_constant_ here,
        # theano.gradient.consider_constant is a wrapper function!
        assert gradient.consider_constant_ not in \
            [node.op for node in f.maker.fgraph.toposort()]

    def test_grad(self):
        T = theano.tensor
        a = np.asarray(self.rng.randn(5, 5),
                       dtype=config.floatX)

        x = T.matrix('x')

        expressions_gradients = [
            (x * gradient.consider_constant(x), x),
            (x * gradient.consider_constant(T.exp(x)), T.exp(x)),
            (gradient.consider_constant(x), T.constant(0.)),
            (x ** 2 * gradient.consider_constant(x), 2 * x ** 2),
        ]

        for expr, expr_grad in expressions_gradients:
            g = gradient.grad(expr.sum(), x)
            # gradient according to theano
            f = theano.function([x], g, on_unused_input='ignore')
            # desired gradient
            f2 = theano.function([x], expr_grad, on_unused_input='ignore')

            assert np.allclose(f(a), f2(a))
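The `expressions_gradients` table above encodes the stop-gradient rule: whatever sits inside `consider_constant` is treated as a fixed value during differentiation. A finite-difference illustration of the first row, `y = x * consider_constant(x)`, at a single scalar point (plain Python, with the frozen copy as a hypothetical stand-in for the op):

```python
# Freeze a copy of x at the evaluation point: that is all
# consider_constant means to the gradient.
x0 = 1.7
c = x0            # the "constant" copy, never perturbed below
eps = 1e-6
# d/dx [x * c] by central differences: the derivative is c == x0, not 2*x0.
dydx = ((x0 + eps) * c - (x0 - eps) * c) / (2 * eps)
assert abs(dydx - x0) < 1e-6
```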
if __name__ == '__main__':
    unittest.main()