提交 f92215df authored 作者: Nicholas Leonard's avatar Nicholas Leonard

Merge branch 'master' of https://github.com/Theano/Theano

.. _install: .. _install:
...@@ -130,20 +129,11 @@ by typing ...@@ -130,20 +129,11 @@ by typing
You may need to add ``sudo`` before this command to install into your You may need to add ``sudo`` before this command to install into your
system's ``site-packages`` directory. If you do not have administrator access system's ``site-packages`` directory. If you do not have administrator access
to your machine, you can install to an alternate prefix using to your machine, you can install Theano locally (to ~/.local) using
.. code-block:: bash .. code-block:: bash
pip install Theano --install-option='--prefix=~/.local' pip install Theano --user
e.g. using ``--install-option='--prefix=~/.local'`` on Python 2.4 would
install Theano into ``.local/lib/python2.4/site-packages`` inside your home
directory on Mac OS X or Unix/Linux (this ``site-packages`` directory must be
listed in your ``PYTHONPATH`` environment variable; for Python 2.6 and later,
``~/.local`` is
automatically searched and does *not* need to be explicitly included in
``PYTHONPATH``, see :ref:`config_pythonpath` for instructions).
You can change ``~/.local``, but you need to change your ``PYTHONPATH`` as said above.
Alternatively you can use virtualenv_ to create an isolated ``site-packages`` Alternatively you can use virtualenv_ to create an isolated ``site-packages``
directory; see the `virtualenv documentation`_ for details. directory; see the `virtualenv documentation`_ for details.
...@@ -225,7 +215,7 @@ or (if you want to install it for the current user only): ...@@ -225,7 +215,7 @@ or (if you want to install it for the current user only):
.. code-block:: bash .. code-block:: bash
pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git --install-option='--prefix=~/.local' pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git --user
The following are general instructions that will set you up with the The following are general instructions that will set you up with the
bleeding-edge version of Theano and allow you to hack it. First, bleeding-edge version of Theano and allow you to hack it. First,
......
...@@ -18,7 +18,7 @@ those operations will run in parallel in Theano. ...@@ -18,7 +18,7 @@ those operations will run in parallel in Theano.
The most frequent way to control the number of threads used is via the The most frequent way to control the number of threads used is via the
``OMP_NUM_THREADS`` environment variable. Set it to the number of ``OMP_NUM_THREADS`` environment variable. Set it to the number of
threads you want to use before starting the python process. Some BLAS threads you want to use before starting the python process. Some BLAS
implementation support other environment variable. implementations support other environment variables.
Parallel element wise ops with OpenMP Parallel element wise ops with OpenMP
...@@ -27,8 +27,8 @@ Parallel element wise ops with OpenMP ...@@ -27,8 +27,8 @@ Parallel element wise ops with OpenMP
Because element wise ops work on every tensor entry independently they Because element wise ops work on every tensor entry independently they
can be easily parallelized using OpenMP. can be easily parallelized using OpenMP.
To use OpenMP you must set the ``openmp`` flag to ``True`` in Theano To use OpenMP you must set the ``openmp`` :ref:`flag <libdoc_config>`
configuration. to ``True``.
You can use the flag ``openmp_elemwise_minsize`` to set the minimum You can use the flag ``openmp_elemwise_minsize`` to set the minimum
tensor size for which the operation is parallelized because for short tensor size for which the operation is parallelized because for short
......
import theano
from theano import gof from theano import gof
from theano import gradient as G
from theano.compile.function_module import orig_function from theano.compile.function_module import orig_function
from theano.compile import SharedVariable, rebuild_collect_shared from theano.compile import SharedVariable, rebuild_collect_shared
from theano.gof import ops_with_inner_function from theano.gof import ops_with_inner_function
...@@ -142,7 +142,7 @@ class OpFromGraph(gof.Op): ...@@ -142,7 +142,7 @@ class OpFromGraph(gof.Op):
if hasattr(self, "grad_ops"): if hasattr(self, "grad_ops"):
grad_ops = self.grad_ops grad_ops = self.grad_ops
else: else:
gs = G.grad(cost=None, gs = theano.gradient.grad(cost=None,
known_grads=dict(zip(self.new_outputs, output_grads)), known_grads=dict(zip(self.new_outputs, output_grads)),
wrt=self.new_inputs, wrt=self.new_inputs,
disconnected_inputs='ignore') disconnected_inputs='ignore')
......
...@@ -62,7 +62,6 @@ from theano.gof.opt import (Optimizer, optimizer, SeqOptimizer, ...@@ -62,7 +62,6 @@ from theano.gof.opt import (Optimizer, optimizer, SeqOptimizer,
LocalOptimizer, local_optimizer, LocalOptGroup, LocalOptimizer, local_optimizer, LocalOptGroup,
OpSub, OpRemove, PatternSub, OpSub, OpRemove, PatternSub,
NavigatorOptimizer, TopoOptimizer, EquilibriumOptimizer, NavigatorOptimizer, TopoOptimizer, EquilibriumOptimizer,
InplaceOptimizer, PureThenInplaceOptimizer,
OpKeyOptimizer) OpKeyOptimizer)
from theano.gof.optdb import \ from theano.gof.optdb import \
......
...@@ -165,8 +165,12 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1): ...@@ -165,8 +165,12 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
my_pid = os.getpid() my_pid = os.getpid()
no_display = (verbosity == 0) no_display = (verbosity == 0)
# Acquire lock.
nb_error = 0 nb_error = 0
# The number of times we have waited when there were no errors.
# Used to skip the message on the first wait, so it is displayed
# less frequently (and generates fewer emails about it).
nb_wait = 0
# Acquire lock.
while True: while True:
try: try:
last_owner = 'no_owner' last_owner = 'no_owner'
...@@ -214,7 +218,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1): ...@@ -214,7 +218,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
last_owner = read_owner last_owner = read_owner
time_start = time.time() time_start = time.time()
no_display = (verbosity == 0) no_display = (verbosity == 0)
if not no_display: if not no_display and nb_wait > 0:
if read_owner == 'failure': if read_owner == 'failure':
msg = 'unknown process' msg = 'unknown process'
else: else:
...@@ -225,6 +229,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1): ...@@ -225,6 +229,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
tmp_dir) tmp_dir)
if verbosity <= 1: if verbosity <= 1:
no_display = True no_display = True
nb_wait += 1
time.sleep(random.uniform(min_wait, max_wait)) time.sleep(random.uniform(min_wait, max_wait))
try: try:
......
...@@ -131,6 +131,9 @@ class FromFunctionOptimizer(Optimizer): ...@@ -131,6 +131,9 @@ class FromFunctionOptimizer(Optimizer):
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
return self.fn(*args, **kwargs) return self.fn(*args, **kwargs)
def __str__(self):
return self.__name__
def optimizer(f): def optimizer(f):
"""decorator for FromFunctionOptimizer""" """decorator for FromFunctionOptimizer"""
...@@ -626,7 +629,10 @@ class MergeOptimizer(Optimizer): ...@@ -626,7 +629,10 @@ class MergeOptimizer(Optimizer):
print >> stream, blanc, " replace_time", replace_time print >> stream, blanc, " replace_time", replace_time
print >> stream, blanc, " validate_time", validate_time print >> stream, blanc, " validate_time", validate_time
print >> stream, blanc, " callback_time", callback_time print >> stream, blanc, " callback_time", callback_time
print >> stream, blanc, " callback_times", callbacks_time print >> stream, blanc, " callbacks_time"
for i in sorted(callbacks_time.iteritems(), key=lambda a: a[1]):
if i[1] > 0:
print i
print >> stream, blanc, " nb_merged", nb_merged print >> stream, blanc, " nb_merged", nb_merged
print >> stream, blanc, " nb_constant", nb_constant print >> stream, blanc, " nb_constant", nb_constant
...@@ -1490,7 +1496,6 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1490,7 +1496,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
def __init__(self, def __init__(self,
optimizers, optimizers,
failure_callback=None, failure_callback=None,
max_depth=None,
max_use_ratio=None): max_use_ratio=None):
""" """
:param optimizers: list or set of local or global optimizations to :param optimizers: list or set of local or global optimizations to
...@@ -1499,8 +1504,6 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1499,8 +1504,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
:param max_use_ratio: each optimizer can be applied at most :param max_use_ratio: each optimizer can be applied at most
(size of graph * this number) times (size of graph * this number) times
:param max_depth: TODO what does this do? (EquilibriumDB sets it to 5)
""" """
super(EquilibriumOptimizer, self).__init__( super(EquilibriumOptimizer, self).__init__(
...@@ -1520,7 +1523,6 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1520,7 +1523,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.local_optimizers_map.setdefault(c, []).append(opt) self.local_optimizers_map.setdefault(c, []).append(opt)
else: else:
self.global_optimizers.append(opt) self.global_optimizers.append(opt)
self.max_depth = max_depth
self.max_use_ratio = max_use_ratio self.max_use_ratio = max_use_ratio
assert self.max_use_ratio is not None, ( assert self.max_use_ratio is not None, (
'max_use_ratio has to be a number') 'max_use_ratio has to be a number')
...@@ -1723,10 +1725,12 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1723,10 +1725,12 @@ class EquilibriumOptimizer(NavigatorOptimizer):
for (t, count, opt) in count_opt[::-1]: for (t, count, opt) in count_opt[::-1]:
print >> stream, blanc, ' %.3fs - %d - %s' % ( print >> stream, blanc, ' %.3fs - %d - %s' % (
t, count, opt) t, count, opt)
print >> stream, blanc, ' %.3fs - in %d optimization that where not used' % ( print >> stream, blanc, ' %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
not_used_time, len(not_used)) not_used_time, len(not_used))
not_used.sort() not_used.sort()
for (t, opt) in not_used[::-1]: for (t, opt) in not_used[::-1]:
if t > 0:
# Skip optimizations with zero recorded time; they probably weren't even tried.
print >> stream, blanc + " ", ' %.3fs - %s' % (t, opt) print >> stream, blanc + " ", ' %.3fs - %s' % (t, opt)
print >> stream print >> stream
...@@ -1899,31 +1903,3 @@ def pre_greedy_local_optimizer(list_optimizations, out): ...@@ -1899,31 +1903,3 @@ def pre_greedy_local_optimizer(list_optimizations, out):
final_outs, optimized_nodes = local_recursive_function( final_outs, optimized_nodes = local_recursive_function(
list_optimizations, out, {}, 0) list_optimizations, out, {}, 0)
return final_outs[out_index] return final_outs[out_index]
############
### Misc ###
############
class InplaceOptimizer(Optimizer):
def __init__(self, inplace):
self.inplace = inplace
def apply(self, fgraph):
self.inplace(fgraph)
def add_requirements(self, fgraph):
fgraph.attach_feature(dh.DestroyHandler())
class PureThenInplaceOptimizer(Optimizer):
def __init__(self, pure, inplace):
self.pure = pure
self.inplace = inplace
def apply(self, fgraph):
self.pure(fgraph)
fgraph.attach_feature(dh.DestroyHandler())
self.inplace(fgraph)
...@@ -194,7 +194,6 @@ class EquilibriumDB(DB): ...@@ -194,7 +194,6 @@ class EquilibriumDB(DB):
def query(self, *tags, **kwtags): def query(self, *tags, **kwtags):
opts = super(EquilibriumDB, self).query(*tags, **kwtags) opts = super(EquilibriumDB, self).query(*tags, **kwtags)
return opt.EquilibriumOptimizer(opts, return opt.EquilibriumOptimizer(opts,
max_depth=5,
max_use_ratio=config.optdb.max_use_ratio, max_use_ratio=config.optdb.max_use_ratio,
failure_callback=opt.NavigatorOptimizer.warn_inplace) failure_callback=opt.NavigatorOptimizer.warn_inplace)
......
...@@ -23,6 +23,7 @@ from theano.gof import Variable ...@@ -23,6 +23,7 @@ from theano.gof import Variable
from theano.gof.python25 import OrderedDict from theano.gof.python25 import OrderedDict
from theano.gof.null_type import NullType from theano.gof.null_type import NullType
from theano.gof.op import get_debug_values from theano.gof.op import get_debug_values
from theano.compile import ViewOp
# we can't do "import theano.tensor" # we can't do "import theano.tensor"
# tensor depends on theano.compile # tensor depends on theano.compile
...@@ -1788,3 +1789,29 @@ def _is_zero(x): ...@@ -1788,3 +1789,29 @@ def _is_zero(x):
return 'no' return 'no'
return 'yes' return 'yes'
class ConsiderConstant(ViewOp):
def grad(self, args, g_outs):
return [g_out.zeros_like(g_out) for g_out in g_outs]
consider_constant_ = ConsiderConstant()
# A wrapper function is defined only so the documentation renders well.
def consider_constant(x):
""" Consider an expression constant when computing gradients.
The expression itself is unaffected, but when its gradient is
computed, or the gradient of another expression that this
expression is a subexpression of, it will not be backpropagated
through. In other words, the gradient of the expression is
truncated to 0.
:param x: A Theano expression whose gradient should be truncated.
:return: The expression is returned unmodified, but its gradient
is now truncated to 0.
.. versionadded:: 0.6.1
"""
return consider_constant_(x)
...@@ -1198,7 +1198,11 @@ class GpuCAReduce(GpuOp): ...@@ -1198,7 +1198,11 @@ class GpuCAReduce(GpuOp):
n_threads.z += 1; n_threads.z += 1;
else else
break; break;
}""" % locals() }
//Maximum for Fermi GPUs on that dimension.
n_threads.z = std::min(n_threads.z, (unsigned)64);
""" % locals()
if len(self.reduce_mask) == 2: if len(self.reduce_mask) == 2:
threads_y = '' threads_y = ''
...@@ -1509,6 +1513,8 @@ class GpuCAReduce(GpuOp): ...@@ -1509,6 +1513,8 @@ class GpuCAReduce(GpuOp):
n_threads.z += 1; n_threads.z += 1;
} }
n_threads.z -= 1; n_threads.z -= 1;
//Maximum for Fermi GPUs on that dimension.
n_threads.z = std::min(n_threads.z, (unsigned)64);
dim3 n_blocks(1,1,1); dim3 n_blocks(1,1,1);
%(makecall)s %(makecall)s
......
...@@ -671,7 +671,7 @@ class GpuConv(GpuOp): ...@@ -671,7 +671,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 20) return (0, 21)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
......
...@@ -1018,6 +1018,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1018,6 +1018,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
(version==3||version==4||version==5||version==-1) && (version==3||version==4||version==5||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //there is only 16k of shared memory (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //there is only 16k of shared memory
(kern_len > 1 || (img_size_padded_byte+kern_size_byte)<=shared_avail) &&
!work_complete) //conv_full_patch_stack_padded !work_complete) //conv_full_patch_stack_padded
{ {
//version 3 without split //version 3 without split
......
...@@ -14,7 +14,7 @@ import theano.ifelse ...@@ -14,7 +14,7 @@ import theano.ifelse
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler) Optimizer, toolbox)
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
device_properties, gpu_eye, device_properties, gpu_eye,
...@@ -62,7 +62,7 @@ optdb.register('gpu_opt', ...@@ -62,7 +62,7 @@ optdb.register('gpu_opt',
# inside the elemwise. When there is no float64 op, this is working. # inside the elemwise. When there is no float64 op, this is working.
optdb.register('gpu_after_fusion', optdb.register('gpu_after_fusion',
ProxyDB(gpu_seqopt), ProxyDB(gpu_seqopt),
optdb.__position__.get('elemwise_fusion', 71) + .1, optdb.__position__.get('elemwise_fusion', 49) + .1,
'gpu') 'gpu')
...@@ -88,7 +88,6 @@ class InputToGpuOptimizer(Optimizer): ...@@ -88,7 +88,6 @@ class InputToGpuOptimizer(Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
for input in fgraph.inputs: for input in fgraph.inputs:
...@@ -1339,9 +1338,10 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op( ...@@ -1339,9 +1338,10 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
max_inputs_to_GpuElemwise) max_inputs_to_GpuElemwise)
if config.gpu.local_elemwise_fusion: if config.gpu.local_elemwise_fusion:
_logger.debug("enabling optimization fusion of gpu elemwise in fast_run") _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
#Must be after cpu fusion at 40, gpu at 48.5 and before AddDestroyHandler at 49.5
optdb.register('gpu_elemwise_fusion', optdb.register('gpu_elemwise_fusion',
tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
71.00, 'fast_run', 'fusion', 49, 'fast_run', 'fusion',
'local_elemwise_fusion', 'gpu') 'local_elemwise_fusion', 'gpu')
else: else:
_logger.debug(("not enabling optimization fusion of gpu elemwise in " _logger.debug(("not enabling optimization fusion of gpu elemwise in "
......
...@@ -109,11 +109,13 @@ def test_careduce(): ...@@ -109,11 +109,13 @@ def test_careduce():
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011 ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111 ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011 ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101 ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011 ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111 ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111 ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
......
...@@ -679,6 +679,7 @@ def test_full(): ...@@ -679,6 +679,7 @@ def test_full():
#Test more than maxThreadsDim0 #Test more than maxThreadsDim0
, ((2,4,13,1050), (3,4,10, 11), (1, 1), (1, 1), (1, 1)) , ((2,4,13,1050), (3,4,10, 11), (1, 1), (1, 1), (1, 1))
, ((2,4,1050,13), (3,4,10, 11), (1, 1), (1, 1), (1, 1)) , ((2,4,1050,13), (3,4,10, 11), (1, 1), (1, 1), (1, 1))
, ((1,1,44800,1), (6,1,1,1), (1, 1), (1, 1), (1, 1))#This caused crash
] ]
# shapes=shapes[:277] # shapes=shapes[:277]
......
...@@ -61,7 +61,7 @@ class GpuGemv(BlasOp, Gemv): ...@@ -61,7 +61,7 @@ class GpuGemv(BlasOp, Gemv):
((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0], ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
%(A)s, %(x)s, %(A)s, %(x)s,
((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0], ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
%(out)s) == NULL) { %(out)s, 0) == -1) {
%(fail)s %(fail)s
} }
""" % vars """ % vars
...@@ -72,7 +72,7 @@ class GpuGemv(BlasOp, Gemv): ...@@ -72,7 +72,7 @@ class GpuGemv(BlasOp, Gemv):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (1,)
gpugemv_no_inplace = GpuGemv(inplace=False) gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True) gpugemv_inplace = GpuGemv(inplace=True)
...@@ -117,7 +117,7 @@ class GpuGemm(BlasOp, Gemm): ...@@ -117,7 +117,7 @@ class GpuGemm(BlasOp, Gemm):
((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0], ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
%(A)s, %(B)s, %(A)s, %(B)s,
((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0], ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
%(out)s) == NULL) { %(out)s, 0) == -1) {
%(fail)s %(fail)s
} }
""" % vars """ % vars
...@@ -128,7 +128,7 @@ class GpuGemm(BlasOp, Gemm): ...@@ -128,7 +128,7 @@ class GpuGemm(BlasOp, Gemm):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (1,)
gpugemm_no_inplace = GpuGemm(inplace=False) gpugemm_no_inplace = GpuGemm(inplace=False)
...@@ -176,7 +176,7 @@ class GpuDot22(BlasOp, Dot22): ...@@ -176,7 +176,7 @@ class GpuDot22(BlasOp, Dot22):
one, one,
%(A)s, %(B)s, %(A)s, %(B)s,
zero, zero,
%(out)s) == NULL) { %(out)s, 0) == -1) {
%(fail)s %(fail)s
} }
""" % vars """ % vars
...@@ -187,7 +187,7 @@ class GpuDot22(BlasOp, Dot22): ...@@ -187,7 +187,7 @@ class GpuDot22(BlasOp, Dot22):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (1,)
def c_headers(self): def c_headers(self):
ret = super(GpuDot22, self).c_headers() ret = super(GpuDot22, self).c_headers()
......
...@@ -1281,7 +1281,10 @@ class GpuCAReduceCuda(HideC, CAReduce): ...@@ -1281,7 +1281,10 @@ class GpuCAReduceCuda(HideC, CAReduce):
n_threads.z += 1; n_threads.z += 1;
else else
break; break;
}""" % locals() }
//Maximum for Fermi GPUs on that dimension.
n_threads.z = std::min(n_threads.z, (unsigned)64);
""" % locals()
if len(self.reduce_mask) == 2: if len(self.reduce_mask) == 2:
threads_y = '' threads_y = ''
...@@ -1601,6 +1604,8 @@ class GpuCAReduceCuda(HideC, CAReduce): ...@@ -1601,6 +1604,8 @@ class GpuCAReduceCuda(HideC, CAReduce):
n_threads.z += 1; n_threads.z += 1;
} }
n_threads.z -= 1; n_threads.z -= 1;
//Maximum for Fermi GPUs on that dimension.
n_threads.z = std::min(n_threads.z, (unsigned)64);
dim3 n_blocks(1,1,1); dim3 n_blocks(1,1,1);
%(makecall)s %(makecall)s
......
...@@ -5,7 +5,7 @@ from theano import tensor, scalar ...@@ -5,7 +5,7 @@ from theano import tensor, scalar
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, ProxyDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler, Optimizer, toolbox,
InconsistencyError, EquilibriumOptimizer) InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
...@@ -90,7 +90,6 @@ class InputToGpuOptimizer(Optimizer): ...@@ -90,7 +90,6 @@ class InputToGpuOptimizer(Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
for input in fgraph.inputs: for input in fgraph.inputs:
......
import unittest
from theano import scalar, gof from theano import scalar, gof
from theano.gof import FunctionGraph
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle, from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
test_CAReduce) test_CAReduce)
...@@ -126,11 +122,13 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -126,11 +122,13 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011 ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111 ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011 ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101 ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011 ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111 ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111 ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
#test pattern implemented by reshape #test pattern implemented by reshape
......
...@@ -28,16 +28,8 @@ if cuda_available: ...@@ -28,16 +28,8 @@ if cuda_available:
def matVecModM(A, s, m): def matVecModM(A, s, m):
# return (A * s) % m assert A.dtype == 'int64'
x = numpy.zeros_like(s) return numpy.int32(numpy.sum((A*s) % m, 1) % m)
for i in xrange(len(x)):
for j in xrange(len(s)):
r = numpy.int32((numpy.int64(A[i][j]) * s[j] + x[i]) % m)
if r >= 0:
x[i] = r
else:
x[i] = r + m
return x
def multMatVect(v, A, m1, B, m2): def multMatVect(v, A, m1, B, m2):
...@@ -63,24 +55,30 @@ MASK2 = numpy.int32(65535) #2^16 - 1 ...@@ -63,24 +55,30 @@ MASK2 = numpy.int32(65535) #2^16 - 1
MULT2 = numpy.int32(21069) MULT2 = numpy.int32(21069)
NORM = 4.656612873077392578125e-10; #1./2^31 NORM = 4.656612873077392578125e-10; #1./2^31
A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]]) #A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]],
A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]]) # dtype='int64')
#A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]],
# dtype='int64')
A1p72 = numpy.asarray([[1516919229, 758510237, 499121365], A1p72 = numpy.asarray([[1516919229, 758510237, 499121365],
[1884998244, 1516919229, 335398200], [1884998244, 1516919229, 335398200],
[601897748, 1884998244, 358115744]]) [601897748, 1884998244, 358115744]],
dtype='int64')
A2p72 = numpy.asarray([[1228857673, 1496414766, 954677935], A2p72 = numpy.asarray([[1228857673, 1496414766, 954677935],
[1133297478, 1407477216, 1496414766], [1133297478, 1407477216, 1496414766],
[2002613992, 1639496704, 1407477216]]) [2002613992, 1639496704, 1407477216]],
dtype='int64')
A1p134 = numpy.asarray( A1p134 = numpy.asarray(
[[1702500920, 1849582496, 1656874625], [[1702500920, 1849582496, 1656874625],
[828554832, 1702500920, 1512419905], [828554832, 1702500920, 1512419905],
[1143731069, 828554832, 102237247]]) [1143731069, 828554832, 102237247]],
dtype='int64')
A2p134 = numpy.asarray( A2p134 = numpy.asarray(
[[796789021, 1464208080, 607337906], [[796789021, 1464208080, 607337906],
[1241679051, 1431130166, 1464208080], [1241679051, 1431130166, 1464208080],
[1401213391, 1178684362, 1431130166]]) [1401213391, 1178684362, 1431130166]],
dtype='int64')
np_int32_vals = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)] np_int32_vals = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
......
...@@ -1509,7 +1509,6 @@ class PushOutDot1(gof.Optimizer): ...@@ -1509,7 +1509,6 @@ class PushOutDot1(gof.Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
......
...@@ -58,7 +58,7 @@ def shared(*args, **kw): ...@@ -58,7 +58,7 @@ def shared(*args, **kw):
from theano.tensor import nnet # used for softmax, sigmoid, etc. from theano.tensor import nnet # used for softmax, sigmoid, etc.
from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \ from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \
jacobian, hessian jacobian, hessian, consider_constant
from theano.tensor.sort import sort, argsort from theano.tensor.sort import sort, argsort
from theano.tensor.extra_ops import (DiffOp, bincount, squeeze, from theano.tensor.extra_ops import (DiffOp, bincount, squeeze,
......
...@@ -139,7 +139,7 @@ except ImportError: ...@@ -139,7 +139,7 @@ except ImportError:
pass pass
from theano.configparser import config, AddConfigVar, StrParam from theano.configparser import config, AddConfigVar, StrParam
from theano.gof import (utils, Op, view_roots, DestroyHandler, from theano.gof import (utils, Op, view_roots,
local_optimizer, Optimizer, local_optimizer, Optimizer,
InconsistencyError, toolbox, SequenceDB, InconsistencyError, toolbox, SequenceDB,
EquilibriumOptimizer, Apply, EquilibriumOptimizer, Apply,
...@@ -1488,7 +1488,6 @@ class GemmOptimizer(Optimizer): ...@@ -1488,7 +1488,6 @@ class GemmOptimizer(Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
did_something = True did_something = True
...@@ -1501,9 +1500,21 @@ class GemmOptimizer(Optimizer): ...@@ -1501,9 +1500,21 @@ class GemmOptimizer(Optimizer):
time_factor_can = 0 time_factor_can = 0
time_factor_list = 0 time_factor_list = 0
time_toposort = 0 time_toposort = 0
if fgraph.profile:
validate_before = fgraph.profile.validate_time
callbacks_before = fgraph.execute_callbacks_times.copy()
callback_before = fgraph.execute_callbacks_time
class Updater:
def on_import(self, fgraph, new_node, reason):
if new_node is not node:
nodelist.append(new_node)
u = Updater()
fgraph.attach_feature(u)
while did_something: while did_something:
nb_iter += 1
t0 = time.time() t0 = time.time()
nodelist = list(fgraph.toposort()) nodelist = theano.gof.graph.io_toposort(fgraph.inputs, fgraph.outputs)
time_toposort += time.time() - t0 time_toposort += time.time() - t0
did_something = False did_something = False
nodelist.reverse() nodelist.reverse()
...@@ -1546,16 +1557,30 @@ class GemmOptimizer(Optimizer): ...@@ -1546,16 +1557,30 @@ class GemmOptimizer(Optimizer):
except ReplacementDidntRemovedError, e: except ReplacementDidntRemovedError, e:
nb_replacement_didn_t_remove += 1 nb_replacement_didn_t_remove += 1
self.warned = True self.warned = True
nb_iter += 1 fgraph.remove_feature(u)
if fgraph.profile:
validate_time = fgraph.profile.validate_time - validate_before
callback_time = fgraph.execute_callbacks_time - callback_before
callbacks_time = {}
for k, v in fgraph.execute_callbacks_times.iteritems():
if k in callbacks_before:
callbacks_time[k] = v - callbacks_before[k]
else:
callbacks_time[k] = v
else:
validate_time = None
callback_time = None
callbacks_time = {}
return (self, nb_iter, nb_replacement, nb_replacement_didn_t_remove, return (self, nb_iter, nb_replacement, nb_replacement_didn_t_remove,
nb_inconsistency_make, nb_inconsistency_replace, nb_inconsistency_make, nb_inconsistency_replace,
time_canonicalize, time_factor_can, time_canonicalize, time_factor_can,
time_factor_list, time_toposort) time_factor_list, time_toposort,
validate_time, callback_time, callbacks_time,)
@staticmethod @staticmethod
def print_profile(stream, prof, level=0): def print_profile(stream, prof, level=0):
blanc = (' ' * level) blanc = (' ' * level)
#1946.912556s - ('gemm_optimizer', 'GemmOptimizer', 1)
print >> stream, blanc, "GemmOptimizer" print >> stream, blanc, "GemmOptimizer"
print >> stream, blanc, " nb_iter", prof[1] print >> stream, blanc, " nb_iter", prof[1]
print >> stream, blanc, " nb_replacement", prof[2] print >> stream, blanc, " nb_replacement", prof[2]
...@@ -1566,6 +1591,12 @@ class GemmOptimizer(Optimizer): ...@@ -1566,6 +1591,12 @@ class GemmOptimizer(Optimizer):
print >> stream, blanc, " time_factor_can", prof[7] print >> stream, blanc, " time_factor_can", prof[7]
print >> stream, blanc, " time_factor_list", prof[8] print >> stream, blanc, " time_factor_list", prof[8]
print >> stream, blanc, " time_toposort", prof[9] print >> stream, blanc, " time_toposort", prof[9]
print >> stream, blanc, " validate_time", prof[10]
print >> stream, blanc, " callback_time", prof[11]
print >> stream, blanc, " callbacks_time"
for i in sorted(prof[12].iteritems(), key=lambda a: a[1]):
if i[1] > 0:
print i
class Dot22(GemmRelated): class Dot22(GemmRelated):
...@@ -1816,17 +1847,15 @@ blas_optdb.register('local_gemm_to_gemv', ...@@ -1816,17 +1847,15 @@ blas_optdb.register('local_gemm_to_gemv',
15, 'fast_run') 15, 'fast_run')
# After destroyhandler is in but before we try to make elemwise things inplace # After destroyhandler(49.5) but before we try to make elemwise things
# Try to make gemm inplace # inplace (75)
# Also, need to make the gemm optimisation(step 70) happen before the
# fusion of elemwise(step 71)
blas_opt_inplace = in2out(local_inplace_gemm, blas_opt_inplace = in2out(local_inplace_gemm,
local_inplace_gemv, local_inplace_gemv,
local_inplace_ger, local_inplace_ger,
name="blas_opt_inplace") name="blas_opt_inplace")
optdb.register('InplaceBlasOpt', optdb.register('InplaceBlasOpt',
blas_opt_inplace, blas_opt_inplace,
70.0, 'fast_run', 'inplace') 70.0, 'fast_run', 'inplace', 'blas_opt_inplace')
class Dot22Scalar(GemmRelated): class Dot22Scalar(GemmRelated):
......
...@@ -7,11 +7,12 @@ Tensor optimizations addressing the ops in basic.py ...@@ -7,11 +7,12 @@ Tensor optimizations addressing the ops in basic.py
import logging import logging
_logger = logging.getLogger('theano.tensor.opt') _logger = logging.getLogger('theano.tensor.opt')
import operator
import itertools import itertools
from itertools import izip
import operator
import sys import sys
import time
import traceback import traceback
from itertools import izip
import numpy import numpy
import numpy as N # guys... please don't do this in the library :( import numpy as N # guys... please don't do this in the library :(
...@@ -25,7 +26,8 @@ from theano.gof.utils import MethodNotDefined ...@@ -25,7 +26,8 @@ from theano.gof.utils import MethodNotDefined
from theano.configparser import config from theano.configparser import config
from theano.tensor.elemwise import Elemwise, DimShuffle from theano.tensor.elemwise import Elemwise, DimShuffle
from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice, from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice,
Subtensor, IncSubtensor, AdvancedIncSubtensor1) Subtensor, IncSubtensor, make_constant,
AdvancedIncSubtensor1)
from theano import scalar from theano import scalar
from theano.tensor import basic as T from theano.tensor import basic as T
from theano import compile # to register the optimizer built by this file from theano import compile # to register the optimizer built by this file
...@@ -35,7 +37,7 @@ from theano.gof.python25 import any, all ...@@ -35,7 +37,7 @@ from theano.gof.python25 import any, all
from theano.gof.opt import (Optimizer, pre_constant_merge, from theano.gof.opt import (Optimizer, pre_constant_merge,
pre_greedy_local_optimizer) pre_greedy_local_optimizer)
from theano.gof.opt import merge_optimizer from theano.gof.opt import merge_optimizer
from theano.gof import toolbox, DestroyHandler from theano.gof import toolbox
from theano.tensor.basic import get_scalar_constant_value, ShapeError, NotScalarConstantError from theano.tensor.basic import get_scalar_constant_value, ShapeError, NotScalarConstantError
from theano.compat.six import StringIO from theano.compat.six import StringIO
...@@ -1955,6 +1957,7 @@ def local_subtensor_merge(node): ...@@ -1955,6 +1957,7 @@ def local_subtensor_merge(node):
else: else:
merged_slices += slices1[pos_1:] merged_slices += slices1[pos_1:]
merged_slices = make_constant(merged_slices)
subtens = Subtensor(merged_slices) subtens = Subtensor(merged_slices)
sl_ins = Subtensor.collapse( sl_ins = Subtensor.collapse(
merged_slices, merged_slices,
...@@ -4072,20 +4075,22 @@ local_one_plus_erf = gof.PatternSub((T.add, ...@@ -4072,20 +4075,22 @@ local_one_plus_erf = gof.PatternSub((T.add,
dict(pattern='y', constraint=_is_1), dict(pattern='y', constraint=_is_1),
(T.erf, 'x')), (T.erf, 'x')),
(T.erfc, (T.neg, 'x')), (T.erfc, (T.neg, 'x')),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_one_plus_erf, name='local_one_plus_erf') name='local_one_plus_erf')
register_stabilize(local_one_plus_erf, name='local_one_plus_erf') register_canonicalize(local_one_plus_erf)
register_specialize(local_one_plus_erf, name='local_one_plus_erf') register_stabilize(local_one_plus_erf)
register_specialize(local_one_plus_erf)
#1-erf(x)=>erfc(x) #1-erf(x)=>erfc(x)
local_one_minus_erf = gof.PatternSub((T.sub, local_one_minus_erf = gof.PatternSub((T.sub,
dict(pattern='y', constraint=_is_1), dict(pattern='y', constraint=_is_1),
(T.erf, 'x')), (T.erf, 'x')),
(T.erfc, 'x'), (T.erfc, 'x'),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_one_minus_erf, name='local_one_minus_erf') name='local_one_minus_erf',)
register_stabilize(local_one_minus_erf, name='local_one_minus_erf') register_canonicalize(local_one_minus_erf)
register_specialize(local_one_minus_erf, name='local_one_minus_erf') register_stabilize(local_one_minus_erf)
register_specialize(local_one_minus_erf)
local_one_minus_erf2 = gof.PatternSub((T.add, local_one_minus_erf2 = gof.PatternSub((T.add,
1, 1,
...@@ -4103,10 +4108,11 @@ local_one_plus_neg_erf = gof.PatternSub((T.add, ...@@ -4103,10 +4108,11 @@ local_one_plus_neg_erf = gof.PatternSub((T.add,
dict(pattern='y', constraint=_is_1), dict(pattern='y', constraint=_is_1),
(T.neg, (T.erf, 'x'))), (T.neg, (T.erf, 'x'))),
(T.erfc, 'x'), (T.erfc, 'x'),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_one_plus_neg_erf, name='local_one_plus_neg_erf') name='local_one_plus_neg_erf')
register_stabilize(local_one_plus_neg_erf, name='local_one_plus_neg_erf') register_canonicalize(local_one_plus_neg_erf)
register_specialize(local_one_plus_neg_erf, name='local_one_plus_neg_erf') register_stabilize(local_one_plus_neg_erf)
register_specialize(local_one_plus_neg_erf)
#(-1)+erf(x) => -erfc(x) don't need erf(x)+(-1) as the canonicalize #(-1)+erf(x) => -erfc(x) don't need erf(x)+(-1) as the canonicalize
#will put the -1 as the first argument. #will put the -1 as the first argument.
...@@ -4114,20 +4120,22 @@ local_erf_minus_one = gof.PatternSub((T.add, ...@@ -4114,20 +4120,22 @@ local_erf_minus_one = gof.PatternSub((T.add,
dict(pattern='y', constraint=_is_minus1), dict(pattern='y', constraint=_is_minus1),
(T.erf, 'x')), (T.erf, 'x')),
(T.neg, (T.erfc, 'x')), (T.neg, (T.erfc, 'x')),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_erf_minus_one, name='local_erf_minus_one') name='local_erf_minus_one')
register_stabilize(local_erf_minus_one, name='local_erf_minus_one') register_canonicalize(local_erf_minus_one)
register_specialize(local_erf_minus_one, name='local_erf_minus_one') register_stabilize(local_erf_minus_one)
register_specialize(local_erf_minus_one)
#1-erfc(x) => erf(x) #1-erfc(x) => erf(x)
local_one_minus_erfc = gof.PatternSub((T.sub, local_one_minus_erfc = gof.PatternSub((T.sub,
dict(pattern='y', constraint=_is_1), dict(pattern='y', constraint=_is_1),
(T.erfc, 'x')), (T.erfc, 'x')),
(T.erf, 'x'), (T.erf, 'x'),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_one_minus_erfc, name='local_one_minus_erfc') name='local_one_minus_erfc')
register_stabilize(local_one_minus_erfc, name='local_one_minus_erfc') register_canonicalize(local_one_minus_erfc)
register_specialize(local_one_minus_erfc, name='local_one_minus_erfc') register_stabilize(local_one_minus_erfc)
register_specialize(local_one_minus_erfc)
local_one_minus_erfc2 = gof.PatternSub((T.add, local_one_minus_erfc2 = gof.PatternSub((T.add,
1, 1,
...@@ -4155,20 +4163,22 @@ local_one_add_neg_erfc = gof.PatternSub((T.add, ...@@ -4155,20 +4163,22 @@ local_one_add_neg_erfc = gof.PatternSub((T.add,
dict(pattern='y', constraint=_is_1), dict(pattern='y', constraint=_is_1),
(T.neg, (T.erfc, 'x'))), (T.neg, (T.erfc, 'x'))),
(T.erf, 'x'), (T.erf, 'x'),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_one_add_neg_erfc, name='local_one_add_neg_erfc') name='local_one_add_neg_erfc')
register_stabilize(local_one_add_neg_erfc, name='local_one_add_neg_erfc') register_canonicalize(local_one_add_neg_erfc)
register_specialize(local_one_add_neg_erfc, name='local_one_add_neg_erfc') register_stabilize(local_one_add_neg_erfc)
register_specialize(local_one_add_neg_erfc)
#(-1)+erfc(-x)=>erf(x) #(-1)+erfc(-x)=>erf(x)
local_erf_neg_minus_one = gof.PatternSub((T.add, local_erf_neg_minus_one = gof.PatternSub((T.add,
dict(pattern='y', constraint=_is_minus1), dict(pattern='y', constraint=_is_minus1),
(T.erfc, (T.neg, 'x'))), (T.erfc, (T.neg, 'x'))),
(T.erf, 'x'), (T.erf, 'x'),
allow_multiple_clients=True,) allow_multiple_clients=True,
register_canonicalize(local_erf_neg_minus_one, name='local_erf_neg_minus_one') name='local_erf_neg_minus_one')
register_stabilize(local_erf_neg_minus_one, name='local_erf_neg_minus_one') register_canonicalize(local_erf_neg_minus_one)
register_specialize(local_erf_neg_minus_one, name='local_erf_neg_minus_one') register_stabilize(local_erf_neg_minus_one)
register_specialize(local_erf_neg_minus_one)
#(-1)+erfc(-1*x)=>erf(x) #(-1)+erfc(-1*x)=>erf(x)
local_erf_neg_minus_one2 = gof.PatternSub((T.add, local_erf_neg_minus_one2 = gof.PatternSub((T.add,
...@@ -4775,12 +4785,21 @@ class FusionOptimizer(Optimizer): ...@@ -4775,12 +4785,21 @@ class FusionOptimizer(Optimizer):
def add_requirements(self, fgraph): def add_requirements(self, fgraph):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
fgraph.attach_feature(DestroyHandler())
def apply(self, fgraph): def apply(self, fgraph):
did_something = True did_something = True
nb_iter = 0
nb_replacement = 0
nb_inconsistency_replace = 0
time_toposort = 0
if fgraph.profile:
validate_before = fgraph.profile.validate_time
callbacks_before = fgraph.execute_callbacks_times.copy()
callback_before = fgraph.execute_callbacks_time
while did_something: while did_something:
t0 = time.time()
nodelist = list(fgraph.toposort()) nodelist = list(fgraph.toposort())
time_toposort += time.time() - t0
nodelist.reverse() nodelist.reverse()
did_something = False did_something = False
for node in nodelist: for node in nodelist:
...@@ -4794,18 +4813,66 @@ class FusionOptimizer(Optimizer): ...@@ -4794,18 +4813,66 @@ class FusionOptimizer(Optimizer):
zip(node.outputs, new_outputs), zip(node.outputs, new_outputs),
reason=self.__class__.__name__) reason=self.__class__.__name__)
did_something = True did_something = True
nb_replacement += 1
except InconsistencyError: except InconsistencyError:
nb_inconsistency_replace += 1
pass pass
nb_iter += 1
if fgraph.profile:
validate_time = fgraph.profile.validate_time - validate_before
callback_time = fgraph.execute_callbacks_time - callback_before
callbacks_time = {}
for k, v in fgraph.execute_callbacks_times.iteritems():
if k in callbacks_before:
callbacks_time[k] = v - callbacks_before[k]
else:
callbacks_time[k] = v
else:
validate_time = None
callback_time = None
callbacks_time = {}
return (self, nb_iter, nb_replacement,
nb_inconsistency_replace,
validate_time, callback_time, callbacks_time,
time_toposort)
@staticmethod
def print_profile(stream, prof, level=0):
blanc = (' ' * level)
print >> stream, blanc, "FusionOptimizer"
print >> stream, blanc, " nb_iter", prof[1]
print >> stream, blanc, " nb_replacement", prof[2]
print >> stream, blanc, " nb_inconsistency_replace", prof[3]
print >> stream, blanc, " validate_time", prof[4]
print >> stream, blanc, " callback_time", prof[5]
print >> stream, blanc, " callbacks_time"
for i in sorted(prof[6].iteritems(), key=lambda a: a[1]):
if i[1] > 0:
print i
print >> stream, blanc, " time_toposort", prof[7]
if config.tensor.local_elemwise_fusion: if config.tensor.local_elemwise_fusion:
_logger.debug("enabling optimization fusion elemwise in fast_run") _logger.debug("enabling optimization fusion elemwise in fast_run")
#Must be after gpu(48.5) and before AddDestroyHandler(49.5)
compile.optdb.register('elemwise_fusion', compile.optdb.register('elemwise_fusion',
FusionOptimizer(local_elemwise_fusion), 71.00, FusionOptimizer(local_elemwise_fusion), 49,
'fast_run', 'fusion', 'local_elemwise_fusion', 'fast_run', 'fusion', 'local_elemwise_fusion',
'FusionOptimizer') 'FusionOptimizer')
else: else:
_logger.debug("not enabling optimization fusion elemwise in fast_run") _logger.debug("not enabling optimization fusion elemwise in fast_run")
compile.optdb.register('elemwise_fusion', compile.optdb.register('elemwise_fusion',
FusionOptimizer(local_elemwise_fusion), 71.00, FusionOptimizer(local_elemwise_fusion), 49,
'fusion', 'local_elemwise_fusion', 'fusion', 'local_elemwise_fusion',
'FusionOptimizer') 'FusionOptimizer')
# ############################
# # Remove consider_constant #
# ############################
# Although the op just returns its input, it should be removed from
# the graph to make sure all possible optimizations can be applied.
register_canonicalize(gof.OpRemove(theano.gradient.consider_constant_),
'fast_compile', name='remove_consider_constant')
...@@ -47,6 +47,23 @@ class AdvancedIndexingError(TypeError): ...@@ -47,6 +47,23 @@ class AdvancedIndexingError(TypeError):
# Helpful functions to deal with Subtensor and IncSubtensor # Helpful functions to deal with Subtensor and IncSubtensor
########## ##########
def make_constant(args):
"""
Convert python litterals to theano constants in subtensor arguments.
"""
def conv(a):
if a is None:
return a
elif isinstance(a, slice):
return slice(conv(a.start),
conv(a.stop),
conv(a.step))
elif isinstance(a, (int, long, numpy.integer)):
return scal.ScalarConstant(scal.int64, a)
else:
return a
return tuple(map(conv, args))
def get_idx_list(inputs, idx_list): def get_idx_list(inputs, idx_list):
''' '''
Given a list of inputs to the subtensor and its idx_list reorders Given a list of inputs to the subtensor and its idx_list reorders
......
import numpy as np import numpy as np
import numpy import numpy
import unittest
import theano import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
......
...@@ -164,7 +164,8 @@ class TensorType(Type): ...@@ -164,7 +164,8 @@ class TensorType(Type):
" Theano C code does not support that.", " Theano C code does not support that.",
msg, msg,
"object shape", data.shape, "object shape", data.shape,
"object strides", data.strides) "object strides", data.strides,
"object dtype", data.dtype)
i = 0 i = 0
for b in self.broadcastable: for b in self.broadcastable:
......
...@@ -4,8 +4,7 @@ import numpy ...@@ -4,8 +4,7 @@ import numpy
import theano import theano
from theano.compat import all, PY3 from theano.compat import all, PY3
from theano.scalar import (ComplexError, IntegerDivisionError, from theano.scalar import ComplexError, IntegerDivisionError
ScalarConstant, int64)
from theano.gof import Constant, Variable from theano.gof import Constant, Variable
from theano.gof.utils import hashtype from theano.gof.utils import hashtype
from theano.tensor.utils import hash_from_ndarray from theano.tensor.utils import hash_from_ndarray
...@@ -350,18 +349,7 @@ class _tensor_py_operators: ...@@ -350,18 +349,7 @@ class _tensor_py_operators:
if not isinstance(args, tuple): if not isinstance(args, tuple):
args = args, args = args,
# Convert python literals to theano constants # Convert python literals to theano constants
def conv(a): args = theano.tensor.subtensor.make_constant(args)
if a is None:
return a
elif isinstance(a, slice):
return slice(conv(a.start),
conv(a.stop),
conv(a.step))
elif isinstance(a, (int, long, numpy.integer)):
return ScalarConstant(int64, a)
else:
return a
args = tuple(map(conv, args))
# Determine if advanced indexing is needed or not # Determine if advanced indexing is needed or not
# The logic is already in Subtensor.convert: if it succeeds, # The logic is already in Subtensor.convert: if it succeeds,
# standard indexing is used; if it fails with # standard indexing is used; if it fails with
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
import unittest import unittest
import theano import theano
from theano import gof from theano import gof
from theano.tests import unittest_tools as utt
from theano import gradient from theano import gradient
from theano.tensor.nnet.Conv3D import conv3D from theano.tensor.nnet.Conv3D import conv3D
...@@ -601,5 +602,45 @@ def test_subgrad(): ...@@ -601,5 +602,45 @@ def test_subgrad():
print(true_grad, pgrad) print(true_grad, pgrad)
assert(np.sum(np.abs(true_grad - pgrad)) < 0.00001) assert(np.sum(np.abs(true_grad - pgrad)) < 0.00001)
class TestConsiderConstant(unittest.TestCase):
def setUp(self):
utt.seed_rng()
self.rng = np.random.RandomState(seed=utt.fetch_seed())
def test_op_removed(self):
x = theano.tensor.matrix('x')
y = x * gradient.consider_constant(x)
f = theano.function([x], y)
# need to refer to theano.gradient.consider_constant_ here,
# theano.gradient.consider_constant is a wrapper function!
assert gradient.consider_constant_ not in \
[node.op for node in f.maker.fgraph.toposort()]
def test_grad(self):
T = theano.tensor
a = np.asarray(self.rng.randn(5, 5),
dtype=config.floatX)
x = T.matrix('x')
expressions_gradients = [
(x * gradient.consider_constant(x), x),
(x * gradient.consider_constant(T.exp(x)), T.exp(x)),
(gradient.consider_constant(x), T.constant(0.)),
(x**2 * gradient.consider_constant(x), 2 * x**2),
]
for expr, expr_grad in expressions_gradients:
g = gradient.grad(expr.sum(), x)
# gradient according to theano
f = theano.function([x], g, on_unused_input='ignore')
# desired gradient
f2 = theano.function([x], expr_grad, on_unused_input='ignore')
assert np.allclose(f(a), f2(a))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论