Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
c022347b
提交
c022347b
authored
9月 16, 2014
作者:
abergeron
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1967 from nouiz/fast_compile_gpu
[WIP] Fast compile gpu
上级
1563ea38
c4fed2b2
隐藏空白字符变更
内嵌
并排
正在显示
9 个修改的文件
包含
156 行增加
和
94 行删除
+156
-94
opt.py
theano/gof/opt.py
+1
-2
__init__.py
theano/sandbox/cuda/__init__.py
+1
-0
opt.py
theano/sandbox/cuda/opt.py
+24
-16
test_opt.py
theano/sandbox/cuda/tests/test_opt.py
+26
-0
opt.py
theano/sandbox/gpuarray/opt.py
+48
-41
test_opt.py
theano/sandbox/gpuarray/tests/test_opt.py
+2
-1
blas.py
theano/tensor/blas.py
+3
-2
nnet.py
theano/tensor/nnet/nnet.py
+18
-11
opt.py
theano/tensor/opt.py
+33
-21
没有找到文件。
theano/gof/opt.py
浏览文件 @
c022347b
...
@@ -1837,8 +1837,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
...
@@ -1837,8 +1837,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
process_count
[
process
]
+=
count
process_count
[
process
]
+=
count
else
:
else
:
process_count
[
process
]
=
count
process_count
[
process
]
=
count
for
i
in
range
(
len
(
loop_process_count
),
len
(
prof2
[
2
])):
loop_process_count
.
extend
(
prof2
[
2
][
len
(
loop_process_count
):])
loop_process_count
.
append
(
list
(
prof2
[
2
]))
max_nb_nodes
=
max
(
prof1
[
3
],
prof2
[
3
])
max_nb_nodes
=
max
(
prof1
[
3
],
prof2
[
3
])
...
...
theano/sandbox/cuda/__init__.py
浏览文件 @
c022347b
...
@@ -415,6 +415,7 @@ def use(device,
...
@@ -415,6 +415,7 @@ def use(device,
if
default_to_move_computation_to_gpu
:
if
default_to_move_computation_to_gpu
:
optdb
.
add_tags
(
'gpu_opt'
,
optdb
.
add_tags
(
'gpu_opt'
,
'fast_compile'
,
'fast_run'
,
'fast_run'
,
'inplace'
)
'inplace'
)
optdb
.
add_tags
(
'gpu_after_fusion'
,
optdb
.
add_tags
(
'gpu_after_fusion'
,
...
...
theano/sandbox/cuda/opt.py
浏览文件 @
c022347b
...
@@ -55,10 +55,10 @@ gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
...
@@ -55,10 +55,10 @@ gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
gpu_cut_copies
=
EquilibriumDB
()
gpu_cut_copies
=
EquilibriumDB
()
gpu_seqopt
=
SequenceDB
()
gpu_seqopt
=
SequenceDB
()
gpu_seqopt
.
register
(
'gpu_local_optimizations'
,
gpu_optimizer
,
1
,
gpu_seqopt
.
register
(
'gpu_local_optimizations'
,
gpu_optimizer
,
1
,
'fast_run'
,
'inplace'
,
'gpu'
)
'fast_run'
,
'
fast_compile'
,
'
inplace'
,
'gpu'
)
gpu_seqopt
.
register
(
'gpu_cut_transfers'
,
gpu_cut_copies
,
2
,
gpu_seqopt
.
register
(
'gpu_cut_transfers'
,
gpu_cut_copies
,
2
,
'fast_run'
,
'gpu'
)
'fast_run'
,
'
fast_compile'
,
'
gpu'
)
# DO NOT PUT fast_run in gpu_opt! This will ALWAYS enable the GPU!
# DO NOT PUT fast_run
or fast_compile
in gpu_opt! This will ALWAYS enable the GPU!
optdb
.
register
(
'gpu_opt'
,
optdb
.
register
(
'gpu_opt'
,
gpu_seqopt
,
gpu_seqopt
,
optdb
.
__position__
.
get
(
'add_destroy_handler'
,
49.5
)
-
1
,
optdb
.
__position__
.
get
(
'add_destroy_handler'
,
49.5
)
-
1
,
...
@@ -72,13 +72,15 @@ optdb.register('gpu_after_fusion',
...
@@ -72,13 +72,15 @@ optdb.register('gpu_after_fusion',
'gpu'
)
'gpu'
)
## Register merge_optimizer as a global opt
## Register merge_optimizer as a global opt
gpu_optimizer
.
register
(
'gpu_merge'
,
theano
.
gof
.
opt
.
merge_optimizer
,
'fast_run'
)
gpu_optimizer
.
register
(
'gpu_merge'
,
theano
.
gof
.
opt
.
merge_optimizer
,
'fast_run'
,
'fast_compile'
)
def
register_opt
(
*
tags
,
**
kwargs
):
def
register_opt
(
*
tags
,
**
kwargs
):
def
f
(
local_opt
):
def
f
(
local_opt
):
name
=
(
kwargs
and
kwargs
.
pop
(
'name'
))
or
local_opt
.
__name__
name
=
(
kwargs
and
kwargs
.
pop
(
'name'
))
or
local_opt
.
__name__
gpu_optimizer
.
register
(
name
,
local_opt
,
'fast_run'
,
'gpu'
,
*
tags
)
gpu_optimizer
.
register
(
name
,
local_opt
,
'fast_run'
,
'fast_compile'
,
'gpu'
,
*
tags
)
return
local_opt
return
local_opt
return
f
return
f
...
@@ -163,14 +165,15 @@ def local_cut_gpu_host_gpu(node):
...
@@ -163,14 +165,15 @@ def local_cut_gpu_host_gpu(node):
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
return
False
return
False
gpu_cut_copies
.
register
(
'cut_gpu_host_transfers'
,
local_cut_gpu_host_gpu
,
gpu_cut_copies
.
register
(
'cut_gpu_host_transfers'
,
local_cut_gpu_host_gpu
,
'fast_run
'
,
'gpu'
)
'fast_run'
,
'fast_compile
'
,
'gpu'
)
gpu_cut_copies
.
register
(
'cut_gpu_constant_transfers'
,
gpu_cut_copies
.
register
(
'cut_gpu_constant_transfers'
,
tensor
.
opt
.
constant_folding
,
tensor
.
opt
.
constant_folding
,
'fast_run'
,
'gpu'
)
'fast_run'
,
'
fast_compile'
,
'
gpu'
)
#register it into canonicalize to allow other optimization to work without
#register it into canonicalize to allow other optimization to work without
#botering with this useless pattern.
#botering with this useless pattern.
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpu_host_gpu'
,
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpu_host_gpu'
,
local_cut_gpu_host_gpu
,
'fast_run'
,
'gpu'
)
local_cut_gpu_host_gpu
,
'fast_run'
,
'fast_compile'
,
'gpu'
)
# 'float64', 'complex128' and 'complex64' are not supported in elemwise
# 'float64', 'complex128' and 'complex64' are not supported in elemwise
# on the gpu.
# on the gpu.
...
@@ -347,7 +350,7 @@ def local_gpu_specifyShape_0(node):
...
@@ -347,7 +350,7 @@ def local_gpu_specifyShape_0(node):
@register_opt
()
@register_opt
()
@local_optimizer
([
gpu_from_host
])
# XXX: broken: tensor.basic.dot is not an op
@local_optimizer
([
gpu_from_host
,
tensor
.
basic
.
Dot
])
def
local_gpu_dot_to_dot22
(
node
):
def
local_gpu_dot_to_dot22
(
node
):
"""
"""
gpu_from_host(dot) -> gpudot(gpu_from_host)
gpu_from_host(dot) -> gpudot(gpu_from_host)
...
@@ -358,6 +361,8 @@ def local_gpu_dot_to_dot22(node):
...
@@ -358,6 +361,8 @@ def local_gpu_dot_to_dot22(node):
the output.
the output.
A more suitable solution would be to use the right cublas call
A more suitable solution would be to use the right cublas call
This is needed in fast_compile
"""
"""
# In case the got do input upcast, we much check that we can
# In case the got do input upcast, we much check that we can
...
@@ -366,17 +371,18 @@ def local_gpu_dot_to_dot22(node):
...
@@ -366,17 +371,18 @@ def local_gpu_dot_to_dot22(node):
if
node
.
outputs
[
0
]
.
type
.
dtype
!=
'float32'
:
if
node
.
outputs
[
0
]
.
type
.
dtype
!=
'float32'
:
return
False
return
False
host_input
=
node
.
inputs
[
0
]
host_input
=
node
.
inputs
[
0
]
if
host_input
.
owner
and
host_input
.
owner
.
op
==
tensor
.
basic
.
dot
:
if
host_input
.
owner
and
isinstance
(
host_input
.
owner
.
op
,
tensor
.
basic
.
Dot
):
x
,
y
=
host_input
.
owner
.
inputs
x
,
y
=
host_input
.
owner
.
inputs
# case one: vector X matrix
# case one: vector X matrix
if
_is_real_vector
(
x
)
and
_is_real_matrix
(
y
):
if
_is_real_vector
(
x
)
and
_is_real_matrix
(
y
):
new_op
=
GpuDimShuffle
((
False
,),
[
'x'
,
0
]
)
new_op
=
GpuDimShuffle
((
False
,),
(
'x'
,
0
)
)
shape_out
=
y
.
shape
[
1
]
.
dimshuffle
([
'x'
])
shape_out
=
y
.
shape
[
1
]
.
dimshuffle
([
'x'
])
gpu_x
=
new_op
(
gpu_from_host
(
x
))
gpu_x
=
new_op
(
gpu_from_host
(
x
))
gpu_y
=
gpu_from_host
(
y
)
gpu_y
=
gpu_from_host
(
y
)
# case two: matrix X vector
# case two: matrix X vector
elif
_is_real_matrix
(
x
)
and
_is_real_vector
(
y
):
elif
_is_real_matrix
(
x
)
and
_is_real_vector
(
y
):
new_op
=
GpuDimShuffle
((
False
,),
[
0
,
'x'
]
)
new_op
=
GpuDimShuffle
((
False
,),
(
0
,
'x'
)
)
shape_out
=
x
.
shape
[
0
]
.
dimshuffle
([
'x'
])
shape_out
=
x
.
shape
[
0
]
.
dimshuffle
([
'x'
])
gpu_x
=
gpu_from_host
(
x
)
gpu_x
=
gpu_from_host
(
x
)
gpu_y
=
new_op
(
gpu_from_host
(
y
))
gpu_y
=
new_op
(
gpu_from_host
(
y
))
...
@@ -384,20 +390,20 @@ def local_gpu_dot_to_dot22(node):
...
@@ -384,20 +390,20 @@ def local_gpu_dot_to_dot22(node):
return
False
return
False
return
[
GpuReshape
(
1
)(
gpu_dot22
(
gpu_x
,
gpu_y
),
shape_out
)]
return
[
GpuReshape
(
1
)(
gpu_dot22
(
gpu_x
,
gpu_y
),
shape_out
)]
if
node
.
op
==
tensor
.
basic
.
dot
:
if
isinstance
(
node
.
op
,
tensor
.
basic
.
Dot
)
:
if
node
.
outputs
[
0
]
.
type
.
dtype
!=
'float32'
:
if
node
.
outputs
[
0
]
.
type
.
dtype
!=
'float32'
:
return
False
return
False
if
any
([
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
)
if
any
([
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
)
for
i
in
node
.
inputs
]):
for
i
in
node
.
inputs
]):
x
,
y
=
node
.
inputs
x
,
y
=
node
.
inputs
if
_is_real_vector
(
x
)
and
_is_real_matrix
(
y
):
if
_is_real_vector
(
x
)
and
_is_real_matrix
(
y
):
new_op
=
GpuDimShuffle
((
False
,),
[
'x'
,
0
]
)
new_op
=
GpuDimShuffle
((
False
,),
(
'x'
,
0
)
)
shape_out
=
y
.
shape
[
1
]
.
dimshuffle
([
'x'
])
shape_out
=
y
.
shape
[
1
]
.
dimshuffle
([
'x'
])
gpu_x
=
new_op
(
gpu_from_host
(
x
))
gpu_x
=
new_op
(
gpu_from_host
(
x
))
gpu_y
=
gpu_from_host
(
y
)
gpu_y
=
gpu_from_host
(
y
)
elif
_is_real_matrix
(
x
)
and
_is_real_vector
(
y
):
elif
_is_real_matrix
(
x
)
and
_is_real_vector
(
y
):
new_op
=
GpuDimShuffle
((
False
,),
[
0
,
'x'
]
)
new_op
=
GpuDimShuffle
((
False
,),
(
0
,
'x'
)
)
shape_out
=
x
.
shape
[
0
]
.
dimshuffle
([
'x'
])
shape_out
=
x
.
shape
[
0
]
.
dimshuffle
([
'x'
])
gpu_x
=
gpu_from_host
(
x
)
gpu_x
=
gpu_from_host
(
x
)
gpu_y
=
new_op
(
gpu_from_host
(
y
))
gpu_y
=
new_op
(
gpu_from_host
(
y
))
...
@@ -1629,8 +1635,10 @@ else:
...
@@ -1629,8 +1635,10 @@ else:
#GpuElemwise inplace
#GpuElemwise inplace
gpu_inplace_elemwise_optimizer
=
tensor
.
opt
.
inplace_elemwise_optimizer_op
(
gpu_inplace_elemwise_optimizer
=
tensor
.
opt
.
inplace_elemwise_optimizer_op
(
GpuElemwise
)
GpuElemwise
)
# DO NOT PLACE add a 'gpu' tag here! This would enable it in fast_compile.
# It still will be run in fast_run with device=gpu with the current tag.
optdb
.
register
(
'gpu_inplace_elemwise_opt'
,
gpu_inplace_elemwise_optimizer
,
75
,
optdb
.
register
(
'gpu_inplace_elemwise_opt'
,
gpu_inplace_elemwise_optimizer
,
75
,
'fast_run'
,
'inplace'
,
'gpu_inplace'
,
'gpu'
)
'fast_run'
,
'inplace'
,
'gpu_inplace'
)
@register_opt
()
@register_opt
()
...
...
theano/sandbox/cuda/tests/test_opt.py
浏览文件 @
c022347b
...
@@ -404,6 +404,32 @@ def test_erfinvgpu():
...
@@ -404,6 +404,32 @@ def test_erfinvgpu():
assert
numpy
.
allclose
(
f
(
xv
),
f2
(
xv
))
assert
numpy
.
allclose
(
f
(
xv
),
f2
(
xv
))
def
test_local_gpu_dot_to_dot22dot
():
def
cmp
(
a_shp
,
b_shp
):
a0
=
numpy
.
random
.
rand
(
*
a_shp
)
.
astype
(
'float32'
)
a
=
cuda
.
shared_constructor
(
a0
,
'a'
)
b0
=
numpy
.
random
.
rand
(
*
b_shp
)
.
astype
(
'float32'
)
b
=
cuda
.
shared_constructor
(
b0
,
'a'
)
f
=
pfunc
([],
tensor
.
dot
(
a
,
b
),
mode
=
mode_with_gpu
)
assert
cuda
.
opt
.
local_gpu_dot_to_dot22
.
transform
(
tensor
.
dot
(
a
,
b
)
.
owner
)
out
=
f
()
assert
numpy
.
allclose
(
numpy
.
dot
(
a0
,
b0
),
out
)
# Try with a matrix equal to a0, but with strides in both dims
a
.
set_value
(
a0
)
a
.
set_value
(
a
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)[::
-
1
],
borrow
=
True
)
f
()
cmp
((
4
,),
(
4
,
5
))
cmp
((
3
,
4
),
(
4
,))
class
test_diag
(
theano
.
tensor
.
tests
.
test_nlinalg
.
test_diag
):
class
test_diag
(
theano
.
tensor
.
tests
.
test_nlinalg
.
test_diag
):
mode
=
mode_with_gpu
mode
=
mode_with_gpu
shared
=
staticmethod
(
cuda
.
shared_constructor
)
shared
=
staticmethod
(
cuda
.
shared_constructor
)
...
...
theano/sandbox/gpuarray/opt.py
浏览文件 @
c022347b
...
@@ -44,9 +44,9 @@ gpu_cut_copies = EquilibriumDB()
...
@@ -44,9 +44,9 @@ gpu_cut_copies = EquilibriumDB()
gpu_seqopt
=
SequenceDB
()
gpu_seqopt
=
SequenceDB
()
gpu_seqopt
.
register
(
'gpuarray_local_optimiziations'
,
gpu_optimizer
,
1
,
gpu_seqopt
.
register
(
'gpuarray_local_optimiziations'
,
gpu_optimizer
,
1
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
'fast_
compile'
,
'fast_
run'
,
'inplace'
,
'gpuarray'
)
gpu_seqopt
.
register
(
'gpuarray_cut_transfers'
,
gpu_cut_copies
,
2
,
gpu_seqopt
.
register
(
'gpuarray_cut_transfers'
,
gpu_cut_copies
,
2
,
'fast_run'
,
'gpuarray'
)
'fast_
compile'
,
'fast_
run'
,
'gpuarray'
)
# do not add 'fast_run' to these two as this would always enable gpuarray mode
# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb
.
register
(
'gpuarray_opt'
,
gpu_seqopt
,
optdb
.
register
(
'gpuarray_opt'
,
gpu_seqopt
,
...
@@ -61,7 +61,7 @@ def register_opt(*tags, **kwargs):
...
@@ -61,7 +61,7 @@ def register_opt(*tags, **kwargs):
return
local_opt
return
local_opt
return
f
return
f
register_opt
()(
theano
.
tensor
.
opt
.
local_track_shape_i
)
register_opt
(
'fast_compile'
)(
theano
.
tensor
.
opt
.
local_track_shape_i
)
def
safe_to_gpu
(
x
):
def
safe_to_gpu
(
x
):
...
@@ -145,19 +145,20 @@ def local_cut_gpu_host_gpu(node):
...
@@ -145,19 +145,20 @@ def local_cut_gpu_host_gpu(node):
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
return
False
return
False
gpu_cut_copies
.
register
(
'cut_gpua_host_transfers'
,
local_cut_gpu_host_gpu
,
gpu_cut_copies
.
register
(
'cut_gpua_host_transfers'
,
local_cut_gpu_host_gpu
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
'fast_
compile'
,
'fast_
run'
,
'inplace'
,
'gpuarray'
)
gpu_cut_copies
.
register
(
'cut_gpua_constant_transfers'
,
gpu_cut_copies
.
register
(
'cut_gpua_constant_transfers'
,
tensor
.
opt
.
constant_folding
,
tensor
.
opt
.
constant_folding
,
'fast_run'
,
'gpuarray'
)
'fast_
compile'
,
'fast_
run'
,
'gpuarray'
)
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpua_host_gpua'
,
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpua_host_gpua'
,
local_cut_gpu_host_gpu
,
'fast_run'
,
'gpuarray'
)
local_cut_gpu_host_gpu
,
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@local_optimizer
([
tensor
.
Alloc
])
@local_optimizer
([
tensor
.
Alloc
])
def
local_gpuaalloc2
(
node
):
def
local_gpuaalloc2
(
node
):
"""
"""
Join(axis,
Alloc, Alloc
, ...) -> Join(axis, GpuAlloc, Alloc, ...)
Join(axis,
{Alloc or HostFromGPU}
, ...) -> Join(axis, GpuAlloc, Alloc, ...)
Moves an alloc that is an input to join to the gpu.
Moves an alloc that is an input to join to the gpu.
"""
"""
...
@@ -171,7 +172,7 @@ def local_gpuaalloc2(node):
...
@@ -171,7 +172,7 @@ def local_gpuaalloc2(node):
return
[
host_from_gpu
(
gpu_alloc
(
*
node
.
inputs
))]
return
[
host_from_gpu
(
gpu_alloc
(
*
node
.
inputs
))]
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Alloc
])
@op_lifter
([
tensor
.
Alloc
])
def
local_gpuaalloc
(
node
):
def
local_gpuaalloc
(
node
):
new_out
=
gpu_alloc
(
*
node
.
inputs
)
new_out
=
gpu_alloc
(
*
node
.
inputs
)
...
@@ -199,7 +200,7 @@ def local_gpualloc_memset_0(node):
...
@@ -199,7 +200,7 @@ def local_gpualloc_memset_0(node):
return
[
new_out
]
return
[
new_out
]
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Reshape
])
@op_lifter
([
tensor
.
Reshape
])
def
local_gpureshape
(
node
):
def
local_gpureshape
(
node
):
op
=
node
.
op
op
=
node
.
op
...
@@ -210,14 +211,14 @@ def local_gpureshape(node):
...
@@ -210,14 +211,14 @@ def local_gpureshape(node):
return
res
return
res
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Rebroadcast
])
@op_lifter
([
tensor
.
Rebroadcast
])
def
local_gpu_rebroadcast
(
node
):
def
local_gpu_rebroadcast
(
node
):
if
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
):
if
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
):
return
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
])
return
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
])
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Flatten
])
@op_lifter
([
tensor
.
Flatten
])
def
local_gpuflatten
(
node
):
def
local_gpuflatten
(
node
):
op
=
node
.
op
op
=
node
.
op
...
@@ -230,7 +231,7 @@ def local_gpuflatten(node):
...
@@ -230,7 +231,7 @@ def local_gpuflatten(node):
return
o
return
o
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Elemwise
])
@op_lifter
([
tensor
.
Elemwise
])
def
local_gpu_elemwise
(
node
):
def
local_gpu_elemwise
(
node
):
op
=
node
.
op
op
=
node
.
op
...
@@ -273,14 +274,14 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
...
@@ -273,14 +274,14 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
'inplace_elemwise_optimizer'
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
'inplace_elemwise_optimizer'
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
DimShuffle
])
@op_lifter
([
tensor
.
DimShuffle
])
def
local_gpua_dimshuffle
(
node
):
def
local_gpua_dimshuffle
(
node
):
return
GpuDimShuffle
(
node
.
op
.
input_broadcastable
,
return
GpuDimShuffle
(
node
.
op
.
input_broadcastable
,
node
.
op
.
new_order
)
node
.
op
.
new_order
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
SpecifyShape
])
@op_lifter
([
tensor
.
SpecifyShape
])
def
local_gpua_specifyShape
(
node
):
def
local_gpua_specifyShape
(
node
):
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
...
@@ -289,11 +290,21 @@ def local_gpua_specifyShape(node):
...
@@ -289,11 +290,21 @@ def local_gpua_specifyShape(node):
return
tensor
.
specify_shape
(
*
inp
)
return
tensor
.
specify_shape
(
*
inp
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
theano
.
compile
.
ops
.
Shape
])
def
local_gpua_shape
(
node
):
# op_lifter will call this opt too frequently as the output is
# always on the CPU.
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
return
return
[
gpu_from_host
(
node
.
inputs
[
0
])
.
shape
]
def
gpu_print_wrapper
(
op
,
cnda
):
def
gpu_print_wrapper
(
op
,
cnda
):
op
.
old_op
.
global_fn
(
op
.
old_op
,
numpy
.
asarray
(
cnda
))
op
.
old_op
.
global_fn
(
op
.
old_op
,
numpy
.
asarray
(
cnda
))
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
printing
.
Print
])
@op_lifter
([
tensor
.
printing
.
Print
])
def
local_gpu_print_op
(
node
):
def
local_gpu_print_op
(
node
):
x
,
=
node
.
inputs
x
,
=
node
.
inputs
...
@@ -303,13 +314,13 @@ def local_gpu_print_op(node):
...
@@ -303,13 +314,13 @@ def local_gpu_print_op(node):
return
new_op
(
gpu_x
)
return
new_op
(
gpu_x
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Join
])
@op_lifter
([
tensor
.
Join
])
def
local_gpua_join
(
node
):
def
local_gpua_join
(
node
):
return
gpu_join
return
gpu_join
@register_opt
()
@register_opt
(
'fast_compile'
)
@local_optimizer
([
GpuJoin
])
@local_optimizer
([
GpuJoin
])
def
local_gpuajoin_1
(
node
):
def
local_gpuajoin_1
(
node
):
# join of a single element
# join of a single element
...
@@ -318,19 +329,19 @@ def local_gpuajoin_1(node):
...
@@ -318,19 +329,19 @@ def local_gpuajoin_1(node):
return
[
node
.
inputs
[
1
]]
return
[
node
.
inputs
[
1
]]
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Split
])
@op_lifter
([
tensor
.
Split
])
def
local_gpua_split
(
node
):
def
local_gpua_split
(
node
):
return
GpuSplit
(
node
.
op
.
len_splits
)
return
GpuSplit
(
node
.
op
.
len_splits
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Subtensor
])
@op_lifter
([
tensor
.
Subtensor
])
def
local_gpua_subtensor
(
node
):
def
local_gpua_subtensor
(
node
):
return
GpuSubtensor
(
node
.
op
.
idx_list
)
return
GpuSubtensor
(
node
.
op
.
idx_list
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
IncSubtensor
])
@op_lifter
([
tensor
.
IncSubtensor
])
def
local_gpua_incsubtensor
(
node
):
def
local_gpua_incsubtensor
(
node
):
return
GpuIncSubtensor
(
node
.
op
.
idx_list
,
node
.
op
.
inplace
,
return
GpuIncSubtensor
(
node
.
op
.
idx_list
,
node
.
op
.
inplace
,
...
@@ -338,7 +349,7 @@ def local_gpua_incsubtensor(node):
...
@@ -338,7 +349,7 @@ def local_gpua_incsubtensor(node):
node
.
op
.
destroyhandler_tolerate_aliased
)
node
.
op
.
destroyhandler_tolerate_aliased
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
AdvancedIncSubtensor1
])
@op_lifter
([
tensor
.
AdvancedIncSubtensor1
])
def
local_gpua_advanced_incsubtensor
(
node
):
def
local_gpua_advanced_incsubtensor
(
node
):
...
@@ -362,7 +373,7 @@ def local_gpua_advanced_incsubtensor(node):
...
@@ -362,7 +373,7 @@ def local_gpua_advanced_incsubtensor(node):
set_instead_of_inc
=
set_instead_of_inc
)
set_instead_of_inc
=
set_instead_of_inc
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
])
@op_lifter
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
])
def
local_gpua_careduce
(
node
):
def
local_gpua_careduce
(
node
):
if
isinstance
(
node
.
op
.
scalar_op
,
(
scalar
.
Add
,
scalar
.
Mul
,
if
isinstance
(
node
.
op
.
scalar_op
,
(
scalar
.
Add
,
scalar
.
Mul
,
...
@@ -442,71 +453,67 @@ def local_gpua_careduce(node):
...
@@ -442,71 +453,67 @@ def local_gpua_careduce(node):
return
[
unreshaped_reduce
]
return
[
unreshaped_reduce
]
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Gemv
,
tensor
.
blas_c
.
CGemv
])
@op_lifter
([
tensor
.
blas
.
Gemv
,
tensor
.
blas_c
.
CGemv
])
def
local_gpua_gemv
(
node
):
def
local_gpua_gemv
(
node
):
return
GpuGemv
(
inplace
=
node
.
op
.
inplace
)
return
GpuGemv
(
inplace
=
node
.
op
.
inplace
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Gemm
])
@op_lifter
([
tensor
.
blas
.
Gemm
])
def
local_gpua_gemm
(
node
):
def
local_gpua_gemm
(
node
):
return
GpuGemm
(
inplace
=
node
.
op
.
inplace
)
return
GpuGemm
(
inplace
=
node
.
op
.
inplace
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Ger
,
tensor
.
blas_c
.
CGer
,
tensor
.
blas_scipy
.
ScipyGer
])
@op_lifter
([
tensor
.
blas
.
Ger
,
tensor
.
blas_c
.
CGer
,
tensor
.
blas_scipy
.
ScipyGer
])
def
local_gpua_ger
(
node
):
def
local_gpua_ger
(
node
):
return
GpuGer
(
destructive
=
node
.
op
.
destructive
)
return
GpuGer
(
destructive
=
node
.
op
.
destructive
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Dot22
])
@op_lifter
([
tensor
.
blas
.
Dot22
])
def
local_gpua_dot22
(
node
):
def
local_gpua_dot22
(
node
):
return
gpu_dot22
return
gpu_dot22
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
basic
.
Eye
])
@op_lifter
([
tensor
.
basic
.
Eye
])
def
local_gpua_eye
(
node
):
def
local_gpua_eye
(
node
):
return
GpuEye
(
dtype
=
node
.
op
.
dtype
)
return
GpuEye
(
dtype
=
node
.
op
.
dtype
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
])
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
])
def
local_gpua_crossentropysoftmaxargmax1hotwithbias
(
node
):
def
local_gpua_crossentropysoftmaxargmax1hotwithbias
(
node
):
return
GpuCrossentropySoftmaxArgmax1HotWithBias
()
return
GpuCrossentropySoftmaxArgmax1HotWithBias
()
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
])
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
])
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
):
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
):
return
GpuCrossentropySoftmax1HotWithBiasDx
()
return
GpuCrossentropySoftmax1HotWithBiasDx
()
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
Softmax
])
@op_lifter
([
tensor
.
nnet
.
Softmax
])
def
local_gpua_softmax
(
node
):
def
local_gpua_softmax
(
node
):
return
GpuSoftmax
()
return
GpuSoftmax
()
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
])
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
])
def
local_gpua_softmaxwithbias
(
node
):
def
local_gpua_softmaxwithbias
(
node
):
return
GpuSoftmaxWithBias
()
return
GpuSoftmaxWithBias
()
@register_opt
()
@register_opt
(
'fast_compile'
)
@
local_optimiz
er
([
theano
.
tensor
.
opt
.
Assert
])
@
op_lift
er
([
theano
.
tensor
.
opt
.
Assert
])
def
local_assert
(
node
):
def
local_assert
(
node
):
if
(
isinstance
(
node
.
op
,
theano
.
tensor
.
opt
.
Assert
)
and
return
[
host_from_gpu
(
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]))]
node
.
inputs
[
0
]
.
owner
and
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
)):
return
[
host_from_gpu
(
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]))]
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
gpu_from_host
,
ConvOp
])
@op_lifter
([
gpu_from_host
,
ConvOp
])
def
local_gpu_conv
(
node
):
def
local_gpu_conv
(
node
):
"""
"""
...
@@ -654,7 +661,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
...
@@ -654,7 +661,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
return
(
nw_inputs
,
nw_outputs
)
return
(
nw_inputs
,
nw_outputs
)
@register_opt
(
'scan'
)
@register_opt
(
'scan'
,
'fast_compile'
)
@op_lifter
([
scan_op
.
Scan
])
@op_lifter
([
scan_op
.
Scan
])
def
local_scan_to_gpua
(
node
):
def
local_scan_to_gpua
(
node
):
info
=
copy
.
deepcopy
(
node
.
op
.
info
)
info
=
copy
.
deepcopy
(
node
.
op
.
info
)
...
...
theano/sandbox/gpuarray/tests/test_opt.py
浏览文件 @
c022347b
...
@@ -4,7 +4,8 @@ import theano
...
@@ -4,7 +4,8 @@ import theano
from
theano
import
tensor
from
theano
import
tensor
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
import
theano.sandbox.gpuarray
import
theano.sandbox.gpuarray
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.type
import
(
GpuArrayType
,
gpuarray_shared_constructor
)
from
theano.sandbox.gpuarray.basic_ops
import
(
from
theano.sandbox.gpuarray.basic_ops
import
(
GpuAlloc
,
GpuReshape
,
gpu_alloc
,
gpu_from_host
,
host_from_gpu
)
GpuAlloc
,
GpuReshape
,
gpu_alloc
,
gpu_from_host
,
host_from_gpu
)
from
theano.sandbox.gpuarray.elemwise
import
(
from
theano.sandbox.gpuarray.elemwise
import
(
...
...
theano/tensor/blas.py
浏览文件 @
c022347b
...
@@ -1815,13 +1815,14 @@ def local_dot22_to_ger_or_gemv(node):
...
@@ -1815,13 +1815,14 @@ def local_dot22_to_ger_or_gemv(node):
blas_optdb
=
SequenceDB
()
blas_optdb
=
SequenceDB
()
# run after numerical stability optimizations (1.5)
# run after numerical stability optimizations (1.5)
optdb
.
register
(
'BlasOpt'
,
blas_optdb
,
1.7
,
'fast_run'
)
optdb
.
register
(
'BlasOpt'
,
blas_optdb
,
1.7
,
'fast_run'
,
'fast_compile'
)
# run before specialize (2.0) because specialize is basically a
# run before specialize (2.0) because specialize is basically a
# free-for-all that makes the graph crazy.
# free-for-all that makes the graph crazy.
#fast_compile is needed to have GpuDot22 created.
blas_optdb
.
register
(
'local_dot_to_dot22'
,
blas_optdb
.
register
(
'local_dot_to_dot22'
,
in2out
(
local_dot_to_dot22
),
in2out
(
local_dot_to_dot22
),
0
,
'fast_run'
)
0
,
'fast_run'
,
'fast_compile'
)
blas_optdb
.
register
(
'gemm_optimizer'
,
blas_optdb
.
register
(
'gemm_optimizer'
,
GemmOptimizer
(),
GemmOptimizer
(),
10
,
'fast_run'
)
10
,
'fast_run'
)
...
...
theano/tensor/nnet/nnet.py
浏览文件 @
c022347b
"""Provides neural-network specific Ops.
"""Provides neural-network specific Ops.
:note: TODO: factor this out into a neural-network toolbox.
:note: TODO: factor this out into a neural-network toolbox.
:note: We register all optimization with the gpu tag as we don't
implement all the intermediate case on the GPU (in particular
AdvancedSubtensor). So to make sure it run well on the gpu with
fast_compile, we register them as needed for the GPU. This can be
revisited later when all the intermediate part are on the GPU.
"""
"""
import
logging
import
logging
import
numpy
import
numpy
...
@@ -570,7 +577,7 @@ class Softmax(gof.Op):
...
@@ -570,7 +577,7 @@ class Softmax(gof.Op):
softmax
=
Softmax
()
softmax
=
Softmax
()
@opt.register_specialize
@opt.register_specialize
(
'gpu'
)
@gof.local_optimizer
([
softmax
])
@gof.local_optimizer
([
softmax
])
def
local_softmax_with_bias
(
node
):
def
local_softmax_with_bias
(
node
):
"""Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias)
"""Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias)
...
@@ -1323,8 +1330,8 @@ class CrossentropyCategorical1Hot(gof.Op):
...
@@ -1323,8 +1330,8 @@ class CrossentropyCategorical1Hot(gof.Op):
crossentropy_categorical_1hot
=
CrossentropyCategorical1Hot
()
crossentropy_categorical_1hot
=
CrossentropyCategorical1Hot
()
@opt.register_stabilize
@opt.register_stabilize
(
'gpu'
)
@opt.register_specialize
@opt.register_specialize
(
'gpu'
)
@gof.optimizer
@gof.optimizer
def
crossentropy_to_crossentropy_with_softmax_with_bias
(
fgraph
):
def
crossentropy_to_crossentropy_with_softmax_with_bias
(
fgraph
):
"""This is a stabilization optimization
"""This is a stabilization optimization
...
@@ -1397,9 +1404,10 @@ def crossentropy_to_crossentropy_with_softmax(fgraph):
...
@@ -1397,9 +1404,10 @@ def crossentropy_to_crossentropy_with_softmax(fgraph):
optdb
.
register
(
'crossentropy_to_crossentropy_with_softmax'
,
optdb
.
register
(
'crossentropy_to_crossentropy_with_softmax'
,
crossentropy_to_crossentropy_with_softmax
,
2.01
,
crossentropy_to_crossentropy_with_softmax
,
2.01
,
'fast_run'
,
'xent'
)
'fast_run'
,
'xent'
,
'gpu'
)
@opt.register_specialize
(
'gpu'
)
@gof.local_optimizer
([
softmax_grad
])
@gof.local_optimizer
([
softmax_grad
])
def
local_crossentropy_to_crossentropy_with_softmax_grad
(
node
):
def
local_crossentropy_to_crossentropy_with_softmax_grad
(
node
):
if
node
.
op
==
softmax_grad
:
if
node
.
op
==
softmax_grad
:
...
@@ -1410,10 +1418,9 @@ def local_crossentropy_to_crossentropy_with_softmax_grad(node):
...
@@ -1410,10 +1418,9 @@ def local_crossentropy_to_crossentropy_with_softmax_grad(node):
dx
=
crossentropy_softmax_1hot_with_bias_dx
(
g_nll
,
dx
=
crossentropy_softmax_1hot_with_bias_dx
(
g_nll
,
coding_dist
,
true_one_of_n
)
coding_dist
,
true_one_of_n
)
return
[
dx
]
return
[
dx
]
opt
.
register_specialize
(
local_crossentropy_to_crossentropy_with_softmax_grad
)
@opt.register_specialize
@opt.register_specialize
(
'gpu'
)
@gof.local_optimizer
([
tensor
.
_max_and_argmax
])
@gof.local_optimizer
([
tensor
.
_max_and_argmax
])
def
local_argmax_pushdown
(
node
):
def
local_argmax_pushdown
(
node
):
if
node
.
op
==
tensor
.
_max_and_argmax
and
node
.
inputs
[
0
]
.
owner
and
\
if
node
.
op
==
tensor
.
_max_and_argmax
and
node
.
inputs
[
0
]
.
owner
and
\
...
@@ -1499,7 +1506,7 @@ def _is_const(z, val, approx=False):
...
@@ -1499,7 +1506,7 @@ def _is_const(z, val, approx=False):
return
numpy
.
all
(
maybe
==
val
)
return
numpy
.
all
(
maybe
==
val
)
@opt.register_specialize
@opt.register_specialize
(
'gpu'
)
@gof.local_optimizer
([
subtensor
.
AdvancedSubtensor
,
tensor
.
log
])
@gof.local_optimizer
([
subtensor
.
AdvancedSubtensor
,
tensor
.
log
])
def
local_advanced_indexing_crossentropy_onehot
(
node
):
def
local_advanced_indexing_crossentropy_onehot
(
node
):
log
=
None
log
=
None
...
@@ -1540,7 +1547,7 @@ def local_advanced_indexing_crossentropy_onehot(node):
...
@@ -1540,7 +1547,7 @@ def local_advanced_indexing_crossentropy_onehot(node):
labels
)[
0
]]
labels
)[
0
]]
@opt.register_specialize
@opt.register_specialize
(
'gpu'
)
@gof.local_optimizer
([
softmax_grad
])
@gof.local_optimizer
([
softmax_grad
])
def
local_advanced_indexing_crossentropy_onehot_grad
(
node
):
def
local_advanced_indexing_crossentropy_onehot_grad
(
node
):
if
not
(
node
.
op
==
softmax_grad
):
if
not
(
node
.
op
==
softmax_grad
):
...
@@ -1763,7 +1770,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
...
@@ -1763,7 +1770,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
return
return
@opt.register_specialize
@opt.register_specialize
(
'gpu'
)
@gof.local_optimizer
([
softmax_with_bias
])
@gof.local_optimizer
([
softmax_with_bias
])
def
graph_merge_softmax_with_crossentropy_softmax
(
node
):
def
graph_merge_softmax_with_crossentropy_softmax
(
node
):
if
node
.
op
==
softmax_with_bias
:
if
node
.
op
==
softmax_with_bias
:
...
@@ -1963,10 +1970,10 @@ def make_out_pattern(X):
...
@@ -1963,10 +1970,10 @@ def make_out_pattern(X):
# Pattern rewrite: log(softmax(x)) -> make_out_pattern(x).
# make_out_pattern is defined earlier in this file; presumably it builds a
# numerically stable log-softmax graph -- TODO confirm against its definition.
local_log_softmax = gof.PatternSub(in_pattern=(tensor.log, (softmax, 'x')),
                                   out_pattern=(make_out_pattern, 'x'),
                                   allow_multiple_clients=True)

#don't do register_stabilize, this is to make local_log_softmax run
#only after another more specific optimization that stabilizes cross entropy
#opt.register_stabilize(local_log_softmax, name = 'local_log_softmax')
# Also tagged 'gpu' so this specialization is applied when the gpu tags
# are enabled by the optimizer database.
opt.register_specialize(local_log_softmax, 'gpu', name='local_log_softmax')
theano/tensor/opt.py
浏览文件 @
c022347b
...
@@ -310,21 +310,36 @@ compile.optdb.register('inplace_elemwise_opt', inplace_elemwise_optimizer, 75,
...
@@ -310,21 +310,36 @@ compile.optdb.register('inplace_elemwise_opt', inplace_elemwise_optimizer, 75,
def register_canonicalize(lopt, *tags, **kwargs):
    """Register `lopt` in the 'canonicalize' stage of the optimization DB.

    Can be called directly::

        register_canonicalize(my_opt, 'extra_tag')

    or used as a decorator that takes only tag strings::

        @register_canonicalize('fast_compile')
        def my_opt(node): ...

    :param lopt: a local optimizer, or a tag string (decorator form).
    :param tags: additional tags to register the optimizer under.
    :param kwargs: may contain ``name`` to override ``lopt.__name__``.
    :returns: `lopt` (so this works as a plain decorator), or the inner
        decorator in the string form.
    """
    if isinstance(lopt, str):
        # Decorator-with-tags form: `lopt` is really the first tag string,
        # so it must be forwarded to the recursive call -- otherwise
        # @register_canonicalize('fast_compile') would silently drop the tag.
        def register(inner_lopt):
            return register_canonicalize(inner_lopt, lopt, *tags, **kwargs)
        return register
    else:
        # pop with a default so a non-empty kwargs without 'name' cannot
        # raise KeyError.
        name = kwargs.pop('name', None) or lopt.__name__
        compile.optdb['canonicalize'].register(name, lopt, 'fast_run', *tags)
        return lopt
def register_stabilize(lopt, *tags, **kwargs):
    """Register `lopt` in the 'stabilize' stage of the optimization DB.

    Can be called directly::

        register_stabilize(my_opt, 'extra_tag')

    or used as a decorator that takes only tag strings::

        @register_stabilize('fast_compile')
        def my_opt(node): ...

    :param lopt: a local optimizer, or a tag string (decorator form).
    :param tags: additional tags to register the optimizer under.
    :param kwargs: may contain ``name`` to override ``lopt.__name__``.
    :returns: `lopt` (so this works as a plain decorator), or the inner
        decorator in the string form.
    """
    if isinstance(lopt, str):
        # Decorator-with-tags form: `lopt` is really the first tag string,
        # so it must be forwarded to the recursive call -- otherwise
        # @register_stabilize('fast_compile') would silently drop the tag.
        def register(inner_lopt):
            return register_stabilize(inner_lopt, lopt, *tags, **kwargs)
        return register
    else:
        # pop with a default so a non-empty kwargs without 'name' cannot
        # raise KeyError.
        name = kwargs.pop('name', None) or lopt.__name__
        compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
        return lopt
def register_specialize(lopt, *tags, **kwargs):
    """Register `lopt` in the 'specialize' stage of the optimization DB.

    Can be called directly::

        register_specialize(my_opt, 'extra_tag')

    or used as a decorator that takes only tag strings::

        @register_specialize('gpu')
        def my_opt(node): ...

    :param lopt: a local optimizer, or a tag string (decorator form).
    :param tags: additional tags to register the optimizer under.
    :param kwargs: may contain ``name`` to override ``lopt.__name__``.
    :returns: `lopt` (so this works as a plain decorator), or the inner
        decorator in the string form.
    """
    if isinstance(lopt, str):
        # Decorator-with-tags form: `lopt` is really the first tag string,
        # so it must be forwarded to the recursive call -- otherwise
        # @register_specialize('gpu') would silently drop the tag.
        def register(inner_lopt):
            return register_specialize(inner_lopt, lopt, *tags, **kwargs)
        return register
    else:
        # pop with a default so a non-empty kwargs without 'name' cannot
        # raise KeyError.
        name = kwargs.pop('name', None) or lopt.__name__
        compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags)
        return lopt
def
register_uncanonicalize
(
lopt
,
*
tags
,
**
kwargs
):
def
register_uncanonicalize
(
lopt
,
*
tags
,
**
kwargs
):
...
@@ -1304,7 +1319,7 @@ def local_track_shape_i(node):
...
@@ -1304,7 +1319,7 @@ def local_track_shape_i(node):
@register_specialize
@register_specialize
@register_canonicalize
@register_canonicalize
(
'gpu'
)
@gof.local_optimizer
([
Subtensor
])
@gof.local_optimizer
([
Subtensor
])
def
local_subtensor_make_vector
(
node
):
def
local_subtensor_make_vector
(
node
):
# replace all subtensor(make_vector) like:
# replace all subtensor(make_vector) like:
...
@@ -1354,8 +1369,7 @@ def local_subtensor_make_vector(node):
...
@@ -1354,8 +1369,7 @@ def local_subtensor_make_vector(node):
#TODO: the other optimization for and, or, xor, le and ge see ticket #496.
#TODO: the other optimization for and, or, xor, le and ge see ticket #496.
@register_canonicalize
(
'fast_compile'
)
@register_canonicalize
@register_specialize
@register_specialize
@gof.local_optimizer
([
T
.
Elemwise
])
@gof.local_optimizer
([
T
.
Elemwise
])
def
local_useless_elemwise
(
node
):
def
local_useless_elemwise
(
node
):
...
@@ -3508,7 +3522,7 @@ def local_reduce_join(node):
...
@@ -3508,7 +3522,7 @@ def local_reduce_join(node):
#else the reduction do something about the dtype.
#else the reduction do something about the dtype.
@register_canonicalize
@register_canonicalize
(
'fast_compile'
)
@gof.local_optimizer
(
ALL_REDUCE
)
@gof.local_optimizer
(
ALL_REDUCE
)
def
local_cut_useless_reduce
(
node
):
def
local_cut_useless_reduce
(
node
):
"""Sum(a, axis=[]) -> a """
"""Sum(a, axis=[]) -> a """
...
@@ -4152,6 +4166,8 @@ def attempt_distribution(factor, num, denum, out_type):
...
@@ -4152,6 +4166,8 @@ def attempt_distribution(factor, num, denum, out_type):
neg_pairs
))),
num
,
denum
neg_pairs
))),
num
,
denum
@register_canonicalize
@register_stabilize
@gof.local_optimizer
([
T
.
mul
,
T
.
true_div
,
T
.
inv
])
@gof.local_optimizer
([
T
.
mul
,
T
.
true_div
,
T
.
inv
])
def
local_greedy_distributor
(
node
):
def
local_greedy_distributor
(
node
):
"""
"""
...
@@ -4216,10 +4232,10 @@ def local_greedy_distributor(node):
...
@@ -4216,10 +4232,10 @@ def local_greedy_distributor(node):
return
[
rval
]
return
[
rval
]
register_canonicalize
(
local_greedy_distributor
)
register_stabilize
(
local_greedy_distributor
)
@register_canonicalize
(
'fast_compile'
)
@register_stabilize
(
'fast_compile'
)
@register_specialize
(
'fast_compile'
)
@gof.local_optimizer
(
None
)
@gof.local_optimizer
(
None
)
def
constant_folding
(
node
):
def
constant_folding
(
node
):
for
input
in
node
.
inputs
:
for
input
in
node
.
inputs
:
...
@@ -4253,10 +4269,6 @@ def constant_folding(node):
...
@@ -4253,10 +4269,6 @@ def constant_folding(node):
rval
.
append
(
constant
(
output
.
type
,
storage_map
[
output
][
0
]))
rval
.
append
(
constant
(
output
.
type
,
storage_map
[
output
][
0
]))
return
rval
return
rval
register_canonicalize
(
constant_folding
,
'fast_compile'
)
register_stabilize
(
constant_folding
,
'fast_compile'
)
register_specialize
(
constant_folding
,
'fast_compile'
)
def
_is_1
(
expr
):
def
_is_1
(
expr
):
"""rtype bool. True iff expr is a constant close to 1
"""rtype bool. True iff expr is a constant close to 1
...
@@ -5145,4 +5157,4 @@ else:
...
@@ -5145,4 +5157,4 @@ else:
# Although the op just returns its input, it should be removed from
# the graph to make sure all possible optimizations can be applied.
# Tagged 'fast_compile' and 'fast_run' so the removal happens in both
# compilation modes.
register_canonicalize(gof.OpRemove(theano.gradient.consider_constant_),
                      'fast_compile', 'fast_run',
                      name='remove_consider_constant')
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论