提交 a24fd9bb authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #4570 from Sentient07/new_graph2gpu

New graph2gpu
...@@ -402,6 +402,14 @@ class Shape_i(gof.Op): ...@@ -402,6 +402,14 @@ class Shape_i(gof.Op):
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
return [()] return [()]
def connection_pattern(self, node):
# the grad returns the gradient with respect to the
# elements of a tensor variable
# the elements of the tensor variable do not participate
# in the computation of the shape, so they are not really
# part of the graph
return [[False]]
def grad(self, inp, grads): def grad(self, inp, grads):
return [theano.gradient.grad_not_implemented( return [theano.gradient.grad_not_implemented(
op=self, x_pos=0, x=inp[0], op=self, x_pos=0, x=inp[0],
...@@ -455,6 +463,14 @@ def shape_i(var, i, fgraph=None): ...@@ -455,6 +463,14 @@ def shape_i(var, i, fgraph=None):
return var.shape[i] return var.shape[i]
def shape_i_op(i):
key = i
if key not in shape_i_op.cache:
shape_i_op.cache[key] = Shape_i(i)
return shape_i_op.cache[key]
shape_i_op.cache = {}
def register_shape_i_c_code(typ, code, check_input, version=()): def register_shape_i_c_code(typ, code, check_input, version=()):
""" """
Tell Shape_i how to generate C code for a Theano Type. Tell Shape_i how to generate C code for a Theano Type.
......
...@@ -54,7 +54,7 @@ def _atexit_print_fn(): ...@@ -54,7 +54,7 @@ def _atexit_print_fn():
destination_file = open(config.profiling.destination, 'w') destination_file = open(config.profiling.destination, 'w')
for ps in _atexit_print_list: for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0: if ps.fct_callcount >= 1 or ps.compile_time > 1:
ps.summary(file=destination_file, ps.summary(file=destination_file,
n_ops_to_print=config.profiling.n_ops, n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply) n_apply_to_print=config.profiling.n_apply)
......
...@@ -2413,7 +2413,7 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -2413,7 +2413,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
for (t, count, n_created, o) in count_opt[::-1]: for (t, count, n_created, o) in count_opt[::-1]:
print(blanc, ' %.3fs - %d - %d - %s' % ( print(blanc, ' %.3fs - %d - %d - %s' % (
t, count, n_created, o), file=stream) t, count, n_created, o), file=stream)
print(blanc, ' %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % ( print(blanc, ' %.3fs - in %d optimization that were not used (display only those with a runtime > 0)' % (
not_used_time, len(not_used)), file=stream) not_used_time, len(not_used)), file=stream)
not_used.sort(key=lambda nu: (nu[0], str(nu[1]))) not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
for (t, o) in not_used[::-1]: for (t, o) in not_used[::-1]:
......
...@@ -70,7 +70,7 @@ def as_gpuarray_variable(x, context_name): ...@@ -70,7 +70,7 @@ def as_gpuarray_variable(x, context_name):
# If we couldn't deal with transfers, then maybe it's a tensor # If we couldn't deal with transfers, then maybe it's a tensor
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
return GpuFromHost(context_name)(x) return gpu_from_host(context_name)(x)
# Try _as_GpuArrayVariable if possible # Try _as_GpuArrayVariable if possible
if hasattr(x, '_as_GpuArrayVariable'): if hasattr(x, '_as_GpuArrayVariable'):
...@@ -544,7 +544,7 @@ class HostFromGpu(Op): ...@@ -544,7 +544,7 @@ class HostFromGpu(Op):
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [GpuFromHost(inputs[0].type.context_name)(gz)] return [gpu_from_host(inputs[0].type.context_name)(gz)]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
...@@ -647,6 +647,14 @@ class GpuFromHost(Op): ...@@ -647,6 +647,14 @@ class GpuFromHost(Op):
return (9,) return (9,)
# Caching GPUAlloc
def gpu_from_host(ctx):
if ctx not in gpu_alloc.cache:
gpu_from_host.cache[ctx] = GpuFromHost(ctx)
return gpu_from_host.cache[ctx]
gpu_from_host.cache = {}
class GpuToGpu(Op): class GpuToGpu(Op):
""" """
Transfer data between GPUs. Transfer data between GPUs.
...@@ -870,6 +878,15 @@ class GpuAlloc(HideC, Alloc): ...@@ -870,6 +878,15 @@ class GpuAlloc(HideC, Alloc):
return True return True
# Caching GPUAlloc
def gpu_alloc(ctx, memset_0=False):
key = (ctx, memset_0)
if key not in gpu_alloc.cache:
gpu_alloc.cache[key] = GpuAlloc(ctx, memset_0)
return gpu_alloc.cache[key]
gpu_alloc.cache = {}
class GpuAllocEmpty(HideC, Alloc): class GpuAllocEmpty(HideC, Alloc):
""" """
Allocate uninitialized memory on the GPU. Allocate uninitialized memory on the GPU.
...@@ -956,6 +973,14 @@ def empty_like(var): ...@@ -956,6 +973,14 @@ def empty_like(var):
return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape) return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)
def gpu_alloc_empty(ctx, dtype):
key = (dtype, ctx)
if key not in gpu_alloc_empty.cache:
gpu_alloc_empty.cache[key] = GpuAllocEmpty(dtype, ctx)
return gpu_alloc_empty.cache[key]
gpu_alloc_empty.cache = {}
class GpuContiguous(Op): class GpuContiguous(Op):
""" """
Return a C contiguous version of the input. Return a C contiguous version of the input.
...@@ -1031,6 +1056,7 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -1031,6 +1056,7 @@ class GpuReshape(HideC, tensor.Reshape):
def make_node(self, x, shp): def make_node(self, x, shp):
ctx_name = infer_context_name(x) ctx_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name=ctx_name) x = as_gpuarray_variable(x, context_name=ctx_name)
shp = tensor.as_tensor_variable(shp)
res = host_from_gpu(x).reshape(shp, ndim=self.ndim) res = host_from_gpu(x).reshape(shp, ndim=self.ndim)
otype = GpuArrayType(dtype=res.dtype, otype = GpuArrayType(dtype=res.dtype,
broadcastable=res.broadcastable, broadcastable=res.broadcastable,
......
...@@ -14,7 +14,7 @@ from theano.gof import Optimizer, local_optimizer, COp ...@@ -14,7 +14,7 @@ from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i, shape_i_op
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
from theano.tensor.nnet.abstract_conv import (AbstractConv2d, from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
...@@ -23,15 +23,18 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d, ...@@ -23,15 +23,18 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
from theano.tensor.signal.pool import ( from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu from . import pygpu
from .type import get_context, gpu_context_type, list_contexts, GpuArrayType from .type import get_context, gpu_context_type, list_contexts
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, GpuAllocEmpty, empty_like) gpu_contiguous, gpu_alloc_empty,
empty_like, GpuArrayType)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
# These don't exist in gpuarray # These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter from .opt import (gpu_seqopt, register_opt, conv_groupopt,
op_lifter, register_opt2)
from .opt_util import alpha_merge, output_merge, inplace_allocempty from .opt_util import alpha_merge, output_merge, inplace_allocempty
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
...@@ -94,6 +97,7 @@ def dnn_present(): ...@@ -94,6 +97,7 @@ def dnn_present():
if config.dnn.enabled == "False": if config.dnn.enabled == "False":
dnn_present.msg = "Disabled by dnn.enabled flag" dnn_present.msg = "Disabled by dnn.enabled flag"
dnn_present.avail = False dnn_present.avail = False
return False
if pygpu is None: if pygpu is None:
dnn_present.msg = "PyGPU not available" dnn_present.msg = "PyGPU not available"
...@@ -370,6 +374,19 @@ class GpuDnnConvDesc(COp): ...@@ -370,6 +374,19 @@ class GpuDnnConvDesc(COp):
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(GpuDnnConvDesc, self).c_code_cache_version(), version()) return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())
def gpu_dnn_conv_desc(border_mode, subsample=(1, 1), conv_mode='conv',
precision="float32"):
key = (border_mode, subsample, conv_mode, precision)
if key not in gpu_dnn_conv_desc.cache:
gpu_dnn_conv_desc.cache[key] = GpuDnnConvDesc(border_mode,
subsample,
conv_mode,
precision)
return gpu_dnn_conv_desc.cache[key]
gpu_dnn_conv_desc.cache = {}
# scalar constants # scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float64')) _zero = constant(numpy.asarray(0.0, dtype='float64'))
_one = constant(numpy.asarray(1.0, dtype='float64')) _one = constant(numpy.asarray(1.0, dtype='float64'))
...@@ -526,8 +543,8 @@ class GpuDnnConv(DnnBase): ...@@ -526,8 +543,8 @@ class GpuDnnConv(DnnBase):
top = gpu_contiguous(top) top = gpu_contiguous(top)
d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc) d_img = gpu_dnn_conv_gradI()(kerns, top, empty_like(img), desc)
d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc) d_kerns = gpu_dnn_conv_gradW()(img, top, empty_like(kerns), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -564,6 +581,14 @@ class GpuDnnConv(DnnBase): ...@@ -564,6 +581,14 @@ class GpuDnnConv(DnnBase):
return [shape[2]] return [shape[2]]
def gpu_dnn_conv(algo=None, inplace=False):
key = (algo, inplace)
if key not in gpu_dnn_conv.cache:
gpu_dnn_conv.cache[key] = GpuDnnConv(algo, inplace)
return gpu_dnn_conv.cache[key]
gpu_dnn_conv.cache = {}
class GpuDnnConvGradW(DnnBase): class GpuDnnConvGradW(DnnBase):
""" """
...@@ -608,8 +633,8 @@ class GpuDnnConvGradW(DnnBase): ...@@ -608,8 +633,8 @@ class GpuDnnConvGradW(DnnBase):
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc) d_img = gpu_dnn_conv_gradI()(kerns, top, empty_like(img), desc)
d_top = GpuDnnConv()(img, kerns, empty_like(top), desc) d_top = gpu_dnn_conv()(img, kerns, empty_like(top), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -686,6 +711,14 @@ class GpuDnnConvGradW(DnnBase): ...@@ -686,6 +711,14 @@ class GpuDnnConvGradW(DnnBase):
return [shape[2]] return [shape[2]]
def gpu_dnn_conv_gradW(algo=None, inplace=False):
key = (algo, inplace)
if key not in gpu_dnn_conv_gradW.cache:
gpu_dnn_conv_gradW.cache[key] = GpuDnnConvGradW(inplace, algo)
return gpu_dnn_conv_gradW.cache[key]
gpu_dnn_conv_gradW.cache = {}
class GpuDnnConvGradI(DnnBase): class GpuDnnConvGradI(DnnBase):
""" """
...@@ -741,8 +774,8 @@ class GpuDnnConvGradI(DnnBase): ...@@ -741,8 +774,8 @@ class GpuDnnConvGradI(DnnBase):
img = gpu_contiguous(img) img = gpu_contiguous(img)
d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc) d_kerns = gpu_dnn_conv_gradW()(img, top, empty_like(kerns), desc)
d_top = GpuDnnConv()(img, kerns, empty_like(top), desc) d_top = gpu_dnn_conv()(img, kerns, empty_like(top), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -823,6 +856,14 @@ class GpuDnnConvGradI(DnnBase): ...@@ -823,6 +856,14 @@ class GpuDnnConvGradI(DnnBase):
return [shape[2]] return [shape[2]]
def gpu_dnn_conv_gradI(algo=None, inplace=False):
key = (algo, inplace)
if key not in gpu_dnn_conv_gradI.cache:
gpu_dnn_conv_gradI.cache[key] = GpuDnnConvGradI(inplace, algo)
return gpu_dnn_conv_gradI.cache[key]
gpu_dnn_conv_gradI.cache = {}
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None, conv_mode='conv', direction_hint=None, workmem=None,
algo=None, precision=None): algo=None, precision=None):
...@@ -896,12 +937,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -896,12 +937,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1 shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1 shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = GpuAllocEmpty(img.dtype, ctx_name)( out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3) shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross', precision=precision)(out.shape) conv_mode='cross', precision=precision)(out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc) conv = gpu_dnn_conv_gradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name) return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
elif (border_mode == 'full' and subsample == (1, 1) and elif (border_mode == 'full' and subsample == (1, 1) and
...@@ -914,26 +955,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -914,26 +955,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1 shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = GpuAllocEmpty(img.dtype, ctx_name)(shape_i(img, 0, fgraph), out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape2, shape3) shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision)(kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc) return gpu_dnn_conv_gradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding. # Standard case: We use GpuDnnConv with suitable padding.
# contig_version will return a gpu_contiguous copy # contig_version will return a gpu_contiguous copy
# if the img contains negative strides # if the img contains negative strides
img = gpu_contiguous(img) img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision)(kerns.shape)
desc_op = desc.owner.op desc_op = desc.owner.op
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape, # We can use Shape_i and bypass the infer_shape here as this is on
desc_op.border_mode, # the input of node and it will always be present.
desc_op.subsample) ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
out = GpuAllocEmpty(img.dtype, ctx_name)(*out_shp) kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
return GpuDnnConv(algo=algo)(img, kerns, out, desc) out_shp = get_conv_output_shape(ishape, kshape,
desc_op.border_mode,
desc_op.subsample)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
...@@ -944,10 +989,10 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', ...@@ -944,10 +989,10 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
img = gpu_contiguous(img) img = gpu_contiguous(img)
topgrad = gpu_contiguous(topgrad) topgrad = gpu_contiguous(topgrad)
kerns_shp = as_tensor_variable(kerns_shp) kerns_shp = as_tensor_variable(kerns_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns_shp) conv_mode=conv_mode)(kerns_shp)
out = GpuAllocEmpty(img.dtype, ctx_name)(*kerns_shp) out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*kerns_shp)
return GpuDnnConvGradW()(img, topgrad, out, desc) return gpu_dnn_conv_gradW()(img, topgrad, out, desc)
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
...@@ -958,10 +1003,10 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', ...@@ -958,10 +1003,10 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
topgrad = gpu_contiguous(topgrad) topgrad = gpu_contiguous(topgrad)
img_shp = as_tensor_variable(img_shp) img_shp = as_tensor_variable(img_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns.shape) conv_mode=conv_mode)(kerns.shape)
out = GpuAllocEmpty(kerns.dtype, ctx_name)(*img_shp) out = gpu_alloc_empty(ctx_name, kerns.dtype)(*img_shp)
return GpuDnnConvGradI()(kerns, topgrad, out, desc) return gpu_dnn_conv_gradI()(kerns, topgrad, out, desc)
class GpuDnnPoolDesc(Op): class GpuDnnPoolDesc(Op):
...@@ -1382,53 +1427,59 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1382,53 +1427,59 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
return Apply(self, [dy, sm], [sm.type()]) return Apply(self, [dy, sm], [sm.type()])
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, @register_opt2([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs]) AbstractConv2d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_abstractconv_cudnn(node): def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(node.op, (AbstractConv2d, if (not isinstance(op, (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))): AbstractConv2d_gradInputs))):
return None return
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (node.op.filter_dilation != (1, 1)): if (op.filter_dilation != (1, 1)):
return None return None
if not isinstance(inp1.type, GpuArrayType): inp1 = inputs[0]
return None inp2 = inputs[1]
if not dnn_available(inp1.type.context_name): if not dnn_available(inp1.type.context_name):
raise_no_cudnn() raise_no_cudnn()
if node.op.filter_flip: if op.filter_flip:
conv_mode = 'conv' conv_mode = 'conv'
else: else:
conv_mode = 'cross' conv_mode = 'cross'
if isinstance(node.op, AbstractConv2d): if isinstance(op, AbstractConv2d):
rval = dnn_conv(inp1, inp2, rval = dnn_conv(inp1, inp2,
border_mode=node.op.border_mode, border_mode=op.border_mode,
subsample=node.op.subsample, subsample=op.subsample,
direction_hint='forward!', direction_hint='forward!',
conv_mode=conv_mode) conv_mode=conv_mode)
if isinstance(node.op, AbstractConv2d_gradWeights): elif isinstance(op, AbstractConv2d_gradWeights):
shape = (inp2.shape[1], inp1.shape[1], shape = (inp2.shape[1], inp1.shape[1],
node.inputs[2][0], node.inputs[2][1]) inputs[2][0], inputs[2][1])
rval = dnn_gradweight(inp1, inp2, shape, rval = dnn_gradweight(inp1, inp2, shape,
border_mode=node.op.border_mode, border_mode=op.border_mode,
subsample=node.op.subsample, subsample=op.subsample,
conv_mode=conv_mode) conv_mode=conv_mode)
if isinstance(node.op, AbstractConv2d_gradInputs): elif isinstance(op, AbstractConv2d_gradInputs):
shape = (inp2.shape[0], inp1.shape[1], shape = (inp2.shape[0], inp1.shape[1],
node.inputs[2][0], node.inputs[2][1]) inputs[2][0], inputs[2][1])
rval = dnn_gradinput(inp1, inp2, shape, rval = dnn_gradinput(inp1, inp2, shape,
border_mode=node.op.border_mode, border_mode=op.border_mode,
subsample=node.op.subsample, subsample=op.subsample,
conv_mode=conv_mode) conv_mode=conv_mode)
return [rval] return [rval]
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
conv_groupopt.register('local_abstractconv_cudnn', conv_groupopt.register('local_abstractconv_cudnn',
local_abstractconv_cudnn, 20, local_abstractconv_cudnn, 20,
'fast_compile', 'fast_run', 'fast_compile', 'fast_run',
...@@ -1437,17 +1488,17 @@ conv_groupopt.register('local_abstractconv_cudnn', ...@@ -1437,17 +1488,17 @@ conv_groupopt.register('local_abstractconv_cudnn',
@inplace_allocempty(GpuDnnConv, 2) @inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs): def local_dnn_conv_inplace(node, inputs):
return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)] return [gpu_dnn_conv(algo=node.op.algo, inplace=True)(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2) @inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs): def local_dnn_convgw_inplace(node, inputs):
return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)] return [gpu_dnn_conv_gradW(algo=node.op.algo, inplace=True)(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2) @inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs): def local_dnn_convgi_inplace(node, inputs):
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)] return [gpu_dnn_conv_gradI(algo=node.op.algo, inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace', optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace, tensor.opt.in2out(local_dnn_conv_inplace,
...@@ -1460,73 +1511,75 @@ optdb.register('local_dnna_conv_inplace', ...@@ -1460,73 +1511,75 @@ optdb.register('local_dnna_conv_inplace',
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5) @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs): def local_dnn_conv_alpha_merge(node, *inputs):
return [GpuDnnConv(algo=node.op.algo)(*inputs)] return [gpu_dnn_conv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5) @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs): def local_dnn_convw_alpha_merge(node, *inputs):
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)] return [gpu_dnn_conv_gradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5) @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs): def local_dnn_convi_alpha_merge(node, *inputs):
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)] return [gpu_dnn_conv_gradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2) @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs): def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(algo=node.op.algo)(*inputs)] return [gpu_dnn_conv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2) @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs): def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)] return [gpu_dnn_conv_gradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2) @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs): def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)] return [gpu_dnn_conv_gradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([Pool]) @op_lifter([Pool])
def local_pool_dnn_alternative(node, ctx_name): @register_opt2([Pool], 'fast_compile', 'cudnn')
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not node.op.ignore_border: if not op.ignore_border:
return return
img, = node.inputs img, = inputs
img = as_gpuarray_variable(img, ctx_name) img = as_gpuarray_variable(img, ctx_name)
ds = node.op.ds ds = op.ds
stride = node.op.st stride = op.st
pad = node.op.padding pad = op.padding
mode = node.op.mode mode = op.mode
return dnn_pool(gpu_contiguous(img), ds, stride=stride, pad=pad, mode=mode) return dnn_pool(gpu_contiguous(img), ds, stride=stride, pad=pad, mode=mode)
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([MaxPoolGrad]) @op_lifter([MaxPoolGrad])
def local_pool_dnn_grad_stride(node, ctx_name): @register_opt2([MaxPoolGrad], 'fast_compile', 'cudnn')
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not node.op.ignore_border: if not op.ignore_border:
return return
inp, out, out_grad = node.inputs inp, out, out_grad = inputs
inp = as_gpuarray_variable(inp, ctx_name) inp = as_gpuarray_variable(inp, ctx_name)
out = as_gpuarray_variable(out, ctx_name) out = as_gpuarray_variable(out, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name) out_grad = as_gpuarray_variable(out_grad, ctx_name)
ds = node.op.ds ds = op.ds
st = node.op.st st = op.st
pad = node.op.padding pad = op.padding
mode = node.op.mode mode = op.mode
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out), gpu_contiguous(out),
...@@ -1538,18 +1591,19 @@ def local_pool_dnn_grad_stride(node, ctx_name): ...@@ -1538,18 +1591,19 @@ def local_pool_dnn_grad_stride(node, ctx_name):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([AveragePoolGrad]) @op_lifter([AveragePoolGrad])
def local_avg_pool_dnn_grad_stride(node, ctx_name): @register_opt2([AveragePoolGrad], 'fast_compile', 'cudnn')
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not node.op.ignore_border: if not op.ignore_border:
return return
inp, out_grad = node.inputs inp, out_grad = inputs
inp = as_gpuarray_variable(inp, ctx_name) inp = as_gpuarray_variable(inp, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name) out_grad = as_gpuarray_variable(out_grad, ctx_name)
ds = node.op.ds ds = op.ds
st = node.op.st st = op.st
pad = node.op.padding pad = op.padding
mode = node.op.mode mode = op.mode
cg = gpu_contiguous(out_grad) cg = gpu_contiguous(out_grad)
...@@ -1591,9 +1645,10 @@ def local_log_softmax_dnn(node): ...@@ -1591,9 +1645,10 @@ def local_log_softmax_dnn(node):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([LogSoftmax]) @op_lifter([LogSoftmax])
def local_logsoftmax_to_dnn(node, ctx_name): @register_opt2([LogSoftmax], 'fast_compile', 'cudnn')
def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
# Transform the input in the format expected by GpuDnnSoftmax # Transform the input in the format expected by GpuDnnSoftmax
inp = node.inputs[0] inp = inputs[0]
if inp.ndim != 2: if inp.ndim != 2:
return return
if not dnn_available(ctx_name) or version(raises=False) < 3000: if not dnn_available(ctx_name) or version(raises=False) < 3000:
...@@ -1629,11 +1684,12 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') ...@@ -1629,11 +1684,12 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([SoftmaxGrad]) @op_lifter([SoftmaxGrad])
def local_softmax_dnn_grad(node, ctx_name): @register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn("cuDNN needed for SoftmaxGrad") raise_no_cudnn("cuDNN needed for SoftmaxGrad")
ins = [] ins = []
for n in node.inputs: for n in inputs:
n = as_gpuarray_variable(n, ctx_name) n = as_gpuarray_variable(n, ctx_name)
if n.ndim != 2: if n.ndim != 2:
return return
......
...@@ -2587,6 +2587,18 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2587,6 +2587,18 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return kernels return kernels
# Caching GpuCAReduceCuda
def gpu_ca_reduce_cuda(scalar_op, axis=None, reduce_mask=None, dtype=None, acc_dtype=None,
pre_scalar_op=None):
key = (scalar_op, axis, reduce_mask, dtype, acc_dtype,
pre_scalar_op)
if key not in gpu_ca_reduce_cuda.cache:
gpu_ca_reduce_cuda.cache[key] = GpuCAReduceCuda(scalar_op, axis, reduce_mask, dtype,
acc_dtype, pre_scalar_op)
return gpu_ca_reduce_cuda.cache[key]
gpu_ca_reduce_cuda.cache = {}
class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
""" """
CAReduce that reuse the python code from gpuarray. CAReduce that reuse the python code from gpuarray.
......
...@@ -2,15 +2,14 @@ from __future__ import absolute_import, print_function, division ...@@ -2,15 +2,14 @@ from __future__ import absolute_import, print_function, division
import os import os
from theano import Apply, Op from theano import Apply, Op
from theano.tensor.extra_ops import CumsumOp from theano.tensor.extra_ops import CumsumOp
from .basic_ops import infer_context_name
try: try:
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape)
infer_context_name, GpuFromHost) from .opt import register_opt, op_lifter, register_opt2
from .opt import register_opt as register_gpu_opt, op_lifter
class GpuCumsum(GpuKernelBase, Op): class GpuCumsum(GpuKernelBase, Op):
...@@ -40,7 +39,10 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -40,7 +39,10 @@ class GpuCumsum(GpuKernelBase, Op):
def make_node(self, x): def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum" assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
x = as_gpuarray_variable(x, infer_context_name(x))
context_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name)
if x.ndim > GpuCumsum.SUPPORTED_NDIMS: if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
raise NotImplementedError('Only cumsum on 1D, 2D and\ raise NotImplementedError('Only cumsum on 1D, 2D and\
...@@ -451,24 +453,23 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -451,24 +453,23 @@ class GpuCumsum(GpuKernelBase, Op):
return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code
@register_opt('fast_compile')
@op_lifter([CumsumOp]) @op_lifter([CumsumOp])
def use_gpu_cumsumop(node, ctx_name): @register_opt2([CumsumOp], 'fast_compile')
if node.inputs[0].dtype == 'float32': def local_gpua_cumsumop(op, ctx_name, inputs, outputs):
axis = node.op.axis if inputs[0].dtype == 'float32':
x = node.inputs[0] axis = op.axis
x = inputs[0]
if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS: if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
return None return None
if axis is None and x.ndim > 1: x = as_gpuarray_variable(x, ctx_name)
x = x.flatten()
x = GpuFromHost(ctx_name)(x) if axis is None and x.ndim > 1:
x = GpuReshape(1)(x, (-1,))
# ``gpu_cumsum`` assume array has been flattened if needed. # ``gpu_cumsum`` assume array has been flattened if needed.
if axis is None: if axis is None:
axis = 0 axis = 0
return GpuCumsum(axis)(x) return GpuCumsum(axis)(x)
register_gpu_opt()(use_gpu_cumsumop)
...@@ -9,7 +9,7 @@ from theano.gradient import DisconnectedType ...@@ -9,7 +9,7 @@ from theano.gradient import DisconnectedType
from theano.gpuarray import (basic_ops, GpuArrayType) from theano.gpuarray import (basic_ops, GpuArrayType)
import theano.tensor.fft import theano.tensor.fft
from .opt import register_opt, op_lifter from .opt import register_opt, op_lifter, register_opt2
try: try:
import pygpu import pygpu
...@@ -373,10 +373,12 @@ def _unitary(norm): ...@@ -373,10 +373,12 @@ def _unitary(norm):
if scikits_cuda_available: if scikits_cuda_available:
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.fft.RFFTOp]) @op_lifter([theano.tensor.fft.RFFTOp])
def local_curfft_op(node, context_name): @register_opt2([theano.tensor.fft.RFFTOp], 'fast_compile')
def local_gpua_curfft_op(op, ctx_name, inputs, outputs):
return curfft_op return curfft_op
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.fft.IRFFTOp]) @op_lifter([theano.tensor.fft.IRFFTOp])
def local_cuirfft_op(node, context_name): @register_opt2([theano.tensor.fft.IRFFTOp], 'fast_compile')
def local_gpua_cuirfft_op(op, ctx_name, inputs, outputs):
return cuirfft_op return cuirfft_op
...@@ -14,7 +14,7 @@ from theano.gof import Op ...@@ -14,7 +14,7 @@ from theano.gof import Op
from theano.tensor import NotScalarConstantError, get_scalar_constant_value from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano import gpuarray from theano import gpuarray
from .basic_ops import as_gpuarray_variable, infer_context_name from .basic_ops import as_gpuarray_variable, infer_context_name
from .opt import register_opt, op_lifter from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType from .type import GpuArrayType
...@@ -227,23 +227,24 @@ KERNEL void k_multi_warp_multinomial( ...@@ -227,23 +227,24 @@ KERNEL void k_multi_warp_multinomial(
return (1,) return (1,)
@register_opt() @register_opt('fast_compile')
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform]) @op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
def local_gpua_multinomial(node, context_name): @register_opt2([theano.sandbox.multinomial.MultinomialFromUniform], 'fast_compile')
def local_gpua_multinomial(op, context_name, inputs, outputs):
# TODO : need description for function # TODO : need description for function
if len(node.inputs) == 2: if len(inputs) == 2:
p, u = node.inputs p, u = inputs
n_samples = 1 n_samples = 1
else: else:
p, u, n_samples = node.inputs p, u, n_samples = inputs
try: try:
if get_scalar_constant_value(n_samples) != 1: if get_scalar_constant_value(n_samples) != 1:
return None return None
except NotScalarConstantError: except NotScalarConstantError:
return None return None
m, = node.outputs m, = outputs
if (p.dtype == u.dtype == m.dtype == 'float32'): if (p.dtype == u.dtype == m.dtype == 'float32'):
gpu_op = GPUAMultinomialFromUniform(node.op.odtype) gpu_op = GPUAMultinomialFromUniform(op.odtype)
return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])( return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
gpu_op(p, u)) gpu_op(p, u))
...@@ -13,7 +13,7 @@ except ImportError: ...@@ -13,7 +13,7 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name) infer_context_name)
from .opt import register_opt as register_gpu_opt, op_lifter from .opt import register_opt2, op_lifter, register_opt
from .type import GpuArrayType from .type import GpuArrayType
...@@ -468,9 +468,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -468,9 +468,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
Op.perform(self, node, inp, out, ctx) Op.perform(self, node, inp, out, ctx)
@register_opt('fast_compile')
@op_lifter([Images2Neibs]) @op_lifter([Images2Neibs])
def use_gpu_images2neibs(node, context_name): @register_opt2([Images2Neibs], 'fast_compile')
if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']: def local_gpua_images2neibs(op, context_name, inputs, outputs):
return GpuImages2Neibs(node.op.mode) if op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
return GpuImages2Neibs(op.mode)
register_gpu_opt()(use_gpu_images2neibs)
...@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant ...@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant
from . import opt from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty, from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
infer_context_name) infer_context_name, gpu_alloc_empty)
from .type import gpu_context_type from .type import gpu_context_type
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge
...@@ -147,17 +147,18 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz, ...@@ -147,17 +147,18 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
return '\n'.join(codel) return '\n'.join(codel)
@opt.register_opt() @opt.register_opt('fast_compile')
@opt.op_lifter([tensor.Dot]) @opt.op_lifter([tensor.Dot])
def local_dot_to_gemm16(node, ctx_name): @opt.register_opt2([tensor.Dot], 'fast_compile')
def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs):
if nerv is None: if nerv is None:
return return
A = node.inputs[0] A = inputs[0]
B = node.inputs[1] B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = getattr(outputs[0], 'fgraph', None)
C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)( C = gpu_alloc_empty(ctx_name, dtype='float16')(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0) return Gemm16()(C, 1.0, A, B, 0.0)
......
...@@ -3,6 +3,8 @@ import copy ...@@ -3,6 +3,8 @@ import copy
import numpy import numpy
import logging import logging
import pdb import pdb
import time
from six import iteritems
from six.moves import xrange from six.moves import xrange
import theano import theano
...@@ -10,9 +12,11 @@ from theano import tensor, scalar, gof, config ...@@ -10,9 +12,11 @@ from theano import tensor, scalar, gof, config
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer, from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
SequenceDB, Optimizer, toolbox) SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import NavigatorOptimizer
from theano.gof.optdb import LocalGroupDB from theano.gof.optdb import LocalGroupDB
from theano.ifelse import IfElse from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
from theano.scalar.basic import Scalar, Pow, Cast from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
...@@ -32,7 +36,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name, ...@@ -32,7 +36,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, gpu_contiguous, GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin) GpuEye, gpu_join, GpuJoin, gpu_alloc_empty, gpu_alloc, gpu_from_host)
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch, from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace, gpugemmbatch_no_inplace, gpugemm_no_inplace, gpugemm_inplace, gpugemmbatch_no_inplace,
gpugemv_no_inplace, gpugemv_inplace) gpugemv_no_inplace, gpugemv_inplace)
...@@ -44,7 +48,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx, ...@@ -44,7 +48,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
gpu_softmax_with_bias, gpu_softmax) gpu_softmax_with_bias, gpu_softmax)
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY) GpuCAReduceCPY, gpu_ca_reduce_cuda)
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
...@@ -57,12 +61,31 @@ _logger = logging.getLogger("theano.gpuarray.opt") ...@@ -57,12 +61,31 @@ _logger = logging.getLogger("theano.gpuarray.opt")
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB.
gpu_optimizer2 = EquilibriumDB()
class GraphToGPUDB(DB):
    """
    Retrieves the list local optimizers based on the optimizer flag's value
    from EquilibriumOptimizer by calling the method query.
    """

    def query(self, *tags, **kwtags):
        # Delegate the tag-based lookup to the EquilibriumDB and wrap the
        # selected local optimizers in a GraphToGPU optimizer instance.
        equilibrium = gpu_optimizer2.query(*tags, **kwtags)
        return GraphToGPU(equilibrium.local_optimizers_all,
                          equilibrium.local_optimizers_map)
gpu_seqopt = SequenceDB() gpu_seqopt = SequenceDB()
# Don't register this right now # Don't register this right now
conv_groupopt = LocalGroupDB() conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts" conv_groupopt.__name__ = "gpua_conv_opts"
gpu_seqopt.register('gpuarray_graph_optimization', GraphToGPUDB(), -0.5,
'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1, gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
'fast_compile', 'fast_run', 'gpuarray') 'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
...@@ -82,6 +105,28 @@ def register_opt(*tags, **kwargs): ...@@ -82,6 +105,28 @@ def register_opt(*tags, **kwargs):
return f return f
def register_opt2(tracks, *tags, **kwargs):
    '''
    Decorator for the new GraphToGPU optimizer.
    Takes an extra parameter(Op) compared to register_opt decorator.

    Parameters
    ----------
    tracks : List of Op class Or Op instance or None
        The Node's Op to which optimization is being applied.
    tags : String
        The optimization tag to which the optimizer will be registered.
    kwargs : dict
        Optional; ``name`` overrides the registered optimizer name
        (defaults to the decorated function's ``__name__``).

    '''
    def f(local_opt):
        # Use pop with a default: the original ``(kwargs and
        # kwargs.pop('name'))`` raised KeyError whenever kwargs held any
        # key other than 'name'.
        name = kwargs.pop('name', None) or local_opt.__name__
        opt = theano.gof.local_optimizer(tracks)(local_opt)
        gpu_optimizer2.register(name, opt, 'fast_run', 'gpuarray', *tags)
        return local_opt
    return f
def register_inplace(*tags, **kwargs): def register_inplace(*tags, **kwargs):
def f(local_opt): def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__ name = (kwargs and kwargs.pop('name')) or local_opt.__name__
...@@ -102,7 +147,7 @@ gpu_optimizer.register('local_remove_all_assert', ...@@ -102,7 +147,7 @@ gpu_optimizer.register('local_remove_all_assert',
def safe_to_gpu(x, ctx_name): def safe_to_gpu(x, ctx_name):
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
return GpuFromHost(ctx_name)(x) return gpu_from_host(ctx_name)(x)
else: else:
return x return x
...@@ -135,6 +180,7 @@ def op_lifter(OP, cuda_only=False): ...@@ -135,6 +180,7 @@ def op_lifter(OP, cuda_only=False):
context_name = i.owner.inputs[0].type.context_name context_name = i.owner.inputs[0].type.context_name
replace = True replace = True
break break
if not replace: if not replace:
# We replace if *all* clients are on the GPU # We replace if *all* clients are on the GPU
clients = [c for o in node.outputs for c in o.clients] clients = [c for o in node.outputs for c in o.clients]
...@@ -158,7 +204,9 @@ def op_lifter(OP, cuda_only=False): ...@@ -158,7 +204,9 @@ def op_lifter(OP, cuda_only=False):
# the context was derived from the outputs # the context was derived from the outputs
for i in node.inputs: for i in node.inputs:
i.tag.context_name = context_name i.tag.context_name = context_name
new_op = maker(node, context_name)
new_op = maker(node.op, context_name, node.inputs, node.outputs)
# This is needed as sometimes new_op inherits from OP. # This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op: if new_op and new_op != node.op:
if isinstance(new_op, theano.Op): if isinstance(new_op, theano.Op):
...@@ -195,9 +243,13 @@ class InputToGpuOptimizer(Optimizer): ...@@ -195,9 +243,13 @@ class InputToGpuOptimizer(Optimizer):
target = getattr(input.tag, 'target', None) target = getattr(input.tag, 'target', None)
if target == 'cpu': if target == 'cpu':
continue continue
# Do not move *int* scalar to the GPU.
if (isinstance(input.type, tensor.TensorType) and
input.ndim == 0 and 'int' in input.dtype):
continue
try: try:
new_input = host_from_gpu(GpuFromHost(target)(input)) new_input = host_from_gpu(gpu_from_host(target)(input))
fgraph.replace_validate(input, new_input, fgraph.replace_validate(input, new_input,
"InputToGpuOptimizer") "InputToGpuOptimizer")
except TypeError: except TypeError:
...@@ -215,6 +267,222 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(), ...@@ -215,6 +267,222 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge') 0, 'fast_run', 'fast_compile', 'merge')
class GraphToGPU(NavigatorOptimizer):
    """
    Transfer the graph as a whole to GPU instead of transfering node by node.

    Parameters
    ----------
    local_optimizers_all : List or SortedSet
        The local optimizations to apply to a node.
    local_optimizers_map : Dict
        Dictionary object containing the mapping of Op to list of
        LocalOptimizers.

    """

    def __init__(self, local_optimizers_all, local_optimizers_map):
        self.local_optimizers_all = local_optimizers_all
        self.local_optimizers_map = local_optimizers_map

    def add_requirements(self, fgraph):
        # Needed for the final replace_all_validate() call in apply().
        fgraph.attach_feature(toolbox.ReplaceValidate())

    def apply(self, fgraph):
        """Rebuild ``fgraph`` on the GPU and swap the outputs in one pass.

        Returns the profile tuple
        ``(self, toposort_timing, time_opts, node_created, process_count)``
        consumed by print_profile/merge_profile.
        """
        mapping = {}  # old variable -> its (possibly GPU) replacement
        time_opts = {}
        node_created = {}
        process_count = {}
        t_topo = time.time()
        topo = fgraph.toposort()
        toposort_timing = time.time() - t_topo

        # Building a new graph
        # Iterating through inputs of graph
        target = infer_context_name(*fgraph.inputs)
        for i in fgraph.inputs:
            # Do not move *int* scalar to the GPU.
            if (isinstance(i.type, tensor.TensorType) and
                    (i.ndim > 0 or 'int' not in i.dtype)):
                mapping[i] = i.transfer(getattr(i.tag, 'target', target))
            else:
                mapping[i] = i
        for i in fgraph.variables:
            if isinstance(i, theano.Constant):
                mapping[i] = i

        # Pre-seed the profile counters so unused optimizers still show up.
        for node in topo:
            for lopt in (self.local_optimizers_map.get(node.op, []) +
                         self.local_optimizers_map.get(type(node.op), []) +
                         self.local_optimizers_all):
                process_count.setdefault(lopt, 0)
                time_opts.setdefault(lopt, 0)
                node_created.setdefault(lopt, 0)

        for node in topo:
            if isinstance(node.op, HostFromGpu):
                mapping[node.outputs[0]] = mapping[node.inputs[0]]
                continue

            # Move only if any of the inputs are on the GPU.
            move_to_GPU = False
            context_name = None
            for i in [mapping[i] for i in node.inputs]:
                if isinstance(i.type, GpuArrayType):
                    context_name = i.type.context_name
                    move_to_GPU = True
                    break
            if (not move_to_GPU and
                isinstance(node.op, (theano.tensor.Alloc,
                                     theano.tensor.AllocEmpty,
                                     theano.tensor.basic.Eye))):
                # If the Alloc[Empty] have a client that will be moved
                # to the GPU, we should move the Alloc* on the GPU.
                # We approximate this by supposing that if we have an
                # optimization for one of the clients op, then we will
                # move the client to the GPU.
                for c, _ in node.outputs[0].clients:
                    if (c != 'output' and
                        (self.local_optimizers_map.get(c.op, []) +
                         self.local_optimizers_map.get(type(c.op), []))):
                        move_to_GPU = True
            new_ops = None

            # Apply the lifter
            if move_to_GPU:
                for lopt in (self.local_optimizers_map.get(node.op, []) +
                             self.local_optimizers_map.get(type(node.op), []) +
                             self.local_optimizers_all):
                    t_opt = time.time()
                    new_ops = lopt.transform(node.op, context_name,
                                             [mapping[i] for i in node.inputs],
                                             node.outputs)
                    time_opts[lopt] += time.time() - t_opt
                    if new_ops:
                        process_count[lopt] += 1
                        break

            # Normalize whatever the lifter returned into a list of outputs.
            outputs = []
            if isinstance(new_ops, theano.Op):
                outputs = new_ops(*[mapping[i] for i in node.inputs],
                                  return_list=True)
            elif not new_ops:
                # No lifter applied: clone the node with the mapped inputs.
                newnode = node.clone_with_new_inputs(
                    [mapping.get(i) for i in node.inputs])
                outputs = newnode.outputs
            elif isinstance(new_ops, (tuple, list)):
                outputs = new_ops
            elif isinstance(new_ops, theano.Variable):
                outputs = [new_ops]
            if new_ops:
                node_created[lopt] += len(
                    graph.ops([mapping[i] for i in node.inputs], outputs))

            # The length check used to be asserted on every loop iteration;
            # checking it once before iterating is sufficient.
            if outputs and node.outputs:
                assert len(outputs) == len(node.outputs)
            for new_o, old_o in zip(outputs, node.outputs):
                mapping[old_o] = new_o

        new_nodes = []
        for o in fgraph.outputs:
            new_o = mapping[o]
            if new_o.type != o.type:
                assert isinstance(o.type, tensor.TensorType)
                assert isinstance(new_o.type, GpuArrayType)

                # This condition is needed in the case one input is an
                # output of the graph. Without this, it would
                # introduce cycle as we don't replace correctly that
                # case. It would also add extra transfer to/from the
                # gpu.
                if (new_o.owner and
                        isinstance(new_o.owner.op, GpuFromHost) and
                        new_o.owner.inputs[0].type == o.type):
                    new_o = new_o.owner.inputs[0]
                else:
                    new_o = safe_to_cpu(new_o)
            new_nodes.append(new_o)
        fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes),
                                    reason=self.__class__.__name__)

        return (self, toposort_timing, time_opts, node_created, process_count)

    @staticmethod
    def print_profile(stream, prof, level=0):
        """Pretty-print the profile tuple returned by ``apply``."""
        (opt, toposort_timing, time_opts, node_created, process_count) = prof
        blanc = ('    ' * level)
        print(blanc, "GraphToGPUOptimizer", end=' ', file=stream)

        print(blanc, getattr(opt, "name",
                             getattr(opt, "__name__", "")), file=stream)

        print(blanc, "  time io_toposort %.3fs" % toposort_timing,
              file=stream)

        # The original called time_opts.iteritems(), a Python-2-only dict
        # method; use six.iteritems as done elsewhere in this module.
        s = sum([v for k, v in iteritems(time_opts)])
        print(blanc, "Total time taken by local optimizers %.3fs " % s,
              file=stream)

        count_opt = []
        not_used = []
        not_used_time = 0

        for o, count in iteritems(process_count):
            if count > 0:
                count_opt.append((time_opts[o], count,
                                  node_created[o], o))
            else:
                not_used.append((time_opts[o], o))
                not_used_time += time_opts[o]

        if count_opt:
            print(blanc,
                  '  times - times applied - Node created - name:',
                  file=stream)
            count_opt.sort()
            for (t, count, n_created, o) in count_opt[::-1]:
                print(blanc, '  %.3fs - %d - %d - %s' % (
                    t, count, n_created, o), file=stream)
            print(blanc, '  %.3fs - in %d optimization that were not used (display only those with a runtime > 0)' % (
                not_used_time, len(not_used)), file=stream)
            not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
            for (t, o) in not_used[::-1]:
                if t > 0:
                    # Skip opt that have 0 times, they probably wasn't even tried.
                    print(blanc + "  ", '  %.3fs - %s' % (t, o), file=stream)
            print(file=stream)

    @staticmethod
    def merge_profile(prof1, prof2):
        """Combine two profile tuples into one (used by the profiler)."""
        # (opt, toposort_timing, time_opts, node_created, process_count) = prof1
        local_optimizers = OrderedSet(prof1[0].local_optimizers_all).union(
            prof2[0].local_optimizers_all)

        def merge_dict(d1, d2):
            """
            merge 2 dicts by adding the values.
            """
            d = d1.copy()
            for k, v in iteritems(d2):
                if k in d:
                    d[k] += v
                else:
                    d[k] = v
            return d

        local_optimizers_map = merge_dict(prof1[0].local_optimizers_map,
                                          prof2[0].local_optimizers_map)
        new_opt = GraphToGPU(local_optimizers, local_optimizers_map)
        toposort_timing = prof1[1] + prof2[1]
        time_opts = merge_dict(prof1[2], prof2[2])
        node_created = merge_dict(prof1[3], prof2[3])
        process_count = merge_dict(prof1[4], prof2[4])
        return (new_opt,
                toposort_timing,
                time_opts,
                node_created,
                process_count)
@local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu]) @local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu])
def local_cut_gpu_transfers(node): def local_cut_gpu_transfers(node):
# gpu[ab] -> host -> gpub # gpu[ab] -> host -> gpub
...@@ -273,7 +541,7 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua', ...@@ -273,7 +541,7 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
@register_opt('fast_compile') @register_opt('fast_compile')
@local_optimizer([tensor.Alloc]) @local_optimizer([tensor.Alloc])
def local_gpuaalloc2(node): def local_gpua_alloc2(node):
""" """
Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...) Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...)
...@@ -292,22 +560,23 @@ def local_gpuaalloc2(node): ...@@ -292,22 +560,23 @@ def local_gpuaalloc2(node):
i.owner.op in [host_from_gpu, tensor.alloc] i.owner.op in [host_from_gpu, tensor.alloc]
for i in c.inputs[1:]) for i in c.inputs[1:])
for c, idx in node.outputs[0].clients)): for c, idx in node.outputs[0].clients)):
return [host_from_gpu(GpuAlloc(None)(*node.inputs))] return [host_from_gpu(gpu_alloc(None)(*node.inputs))]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
def local_gpuaalloc(node, context_name): @register_opt2([tensor.Alloc], 'fast_compile')
return GpuAlloc(context_name)(*node.inputs) def local_gpua_alloc(op, context_name, inputs, outputs):
return gpu_alloc(context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AllocEmpty]) @op_lifter([tensor.AllocEmpty])
def local_gpuaallocempty(node, context_name): @register_opt2([tensor.AllocEmpty], 'fast_compile')
def local_gpua_alloc_empty(op, context_name, inputs, outputs):
# We use _props_dict() to make sure that the GPU op know all the # We use _props_dict() to make sure that the GPU op know all the
# CPU op props. # CPU op props.
return GpuAllocEmpty(context_name=context_name, return gpu_alloc_empty(context_name, **op._props_dict())
**node.op._props_dict())(*node.inputs)
@register_opt() @register_opt()
...@@ -318,7 +587,7 @@ def local_gpualloc_memset_0(node): ...@@ -318,7 +587,7 @@ def local_gpualloc_memset_0(node):
if (isinstance(inp, GpuArrayConstant) and if (isinstance(inp, GpuArrayConstant) and
inp.data.size == 1 and inp.data.size == 1 and
(numpy.asarray(inp.data) == 0).all()): (numpy.asarray(inp.data) == 0).all()):
new_op = GpuAlloc(node.op.context_name, memset_0=True) new_op = gpu_alloc(node.op.context_name, memset_0=True)
return [new_op(*node.inputs)] return [new_op(*node.inputs)]
...@@ -328,8 +597,8 @@ def local_gpua_alloc_empty_to_zeros(node): ...@@ -328,8 +597,8 @@ def local_gpua_alloc_empty_to_zeros(node):
if isinstance(node.op, GpuAllocEmpty): if isinstance(node.op, GpuAllocEmpty):
context_name = infer_context_name(*node.inputs) context_name = infer_context_name(*node.inputs)
z = numpy.asarray(0, dtype=node.outputs[0].dtype) z = numpy.asarray(0, dtype=node.outputs[0].dtype)
return [GpuAlloc()(as_gpuarray_variable(z, context_name), return [gpu_alloc(context_name)(as_gpuarray_variable(z, context_name),
*node.inputs)] *node.inputs)]
optdb.register('local_gpua_alloc_empty_to_zeros', optdb.register('local_gpua_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros), theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace. # After move to gpu and merge2, before inplace.
...@@ -352,14 +621,15 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -352,14 +621,15 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.extra_ops.CpuContiguous]) @op_lifter([tensor.extra_ops.CpuContiguous])
def local_gpu_contiguous(node, context_name): @register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile')
def local_gpua_contiguous(op, context_name, inputs, outputs):
return gpu_contiguous return gpu_contiguous
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
def local_gpureshape(node, context_name): @register_opt2([tensor.Reshape], 'fast_compile')
op = node.op def local_gpua_reshape(op, context_name, inputs, outputs):
name = op.name name = op.name
if name: if name:
name = 'Gpu' + name name = 'Gpu' + name
...@@ -369,32 +639,33 @@ def local_gpureshape(node, context_name): ...@@ -369,32 +639,33 @@ def local_gpureshape(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Rebroadcast]) @op_lifter([tensor.Rebroadcast])
def local_gpu_rebroadcast(node, context_name): @register_opt2([tensor.Rebroadcast], 'fast_compile')
return node.op(as_gpuarray_variable(node.inputs[0], context_name)) def local_gpua_rebroadcast(op, context_name, inputs, outputs):
return op(as_gpuarray_variable(inputs[0], context_name))
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Flatten]) @op_lifter([tensor.Flatten])
def local_gpuflatten(node, context_name): @register_opt2([tensor.Flatten], 'fast_compile')
op = node.op def local_gpua_flatten(op, context_name, inputs, outputs):
shp = [] shp = []
if op.outdim != 1: if op.outdim != 1:
shp = [node.inputs[0].shape[i] for i in range(op.outdim - 1)] shp = [inputs[0].shape[i] for i in range(op.outdim - 1)]
shp += [-1] shp += [-1]
res = GpuReshape(op.outdim, None) res = GpuReshape(op.outdim, None)
o = res(node.inputs[0], theano.tensor.as_tensor_variable(shp)) o = res(inputs[0], theano.tensor.as_tensor_variable(shp))
return o return o
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Elemwise]) @op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node, context_name): @register_opt2([tensor.Elemwise], 'fast_compile')
op = node.op def local_gpua_elemwise(op, context_name, inputs, outputs):
scal_op = op.scalar_op scal_op = op.scalar_op
name = op.name name = op.name
if name: if name:
name = 'Gpu' + name name = 'Gpu' + name
if len(node.outputs) > 1: if len(outputs) > 1:
return return
res = GpuElemwise(scal_op, name=name, res = GpuElemwise(scal_op, name=name,
inplace_pattern=copy.copy(op.inplace_pattern), inplace_pattern=copy.copy(op.inplace_pattern),
...@@ -407,13 +678,13 @@ def local_gpu_elemwise(node, context_name): ...@@ -407,13 +678,13 @@ def local_gpu_elemwise(node, context_name):
# Only transfer the computation on the gpu if the output dtype is # Only transfer the computation on the gpu if the output dtype is
# floating point. Else, give up on the transfer to the gpu. # floating point. Else, give up on the transfer to the gpu.
out_dtype = node.outputs[0].dtype out_dtype = outputs[0].dtype
if out_dtype not in ['float16', 'float32', 'float64']: if out_dtype not in ['float16', 'float32', 'float64']:
return return
# Transfer the inputs on the GPU and cast them to the right dtype. # Transfer the inputs on the GPU and cast them to the right dtype.
new_inputs = [] new_inputs = []
for inp in node.inputs: for inp in inputs:
if inp.dtype != out_dtype: if inp.dtype != out_dtype:
gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype))) gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype)))
new_inputs.append(gpu_cast_op(as_gpuarray_variable(inp, context_name))) new_inputs.append(gpu_cast_op(as_gpuarray_variable(inp, context_name)))
...@@ -423,8 +694,7 @@ def local_gpu_elemwise(node, context_name): ...@@ -423,8 +694,7 @@ def local_gpu_elemwise(node, context_name):
# Perform the exponent on the gpu and transfer the output back to the # Perform the exponent on the gpu and transfer the output back to the
# cpu. # cpu.
gpu_output = res(*new_inputs) gpu_output = res(*new_inputs)
cpu_output = host_from_gpu(gpu_output) return [gpu_output]
return [cpu_output]
else: else:
return res return res
...@@ -461,29 +731,41 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75, ...@@ -461,29 +731,41 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.DimShuffle]) @op_lifter([tensor.DimShuffle])
def local_gpua_dimshuffle(node, context_name): @register_opt2([tensor.DimShuffle], 'fast_compile')
return GpuDimShuffle(node.op.input_broadcastable, def local_gpua_dimshuffle(op, context_name, inputs, outputs):
node.op.new_order) return GpuDimShuffle(op.input_broadcastable,
op.new_order)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.SpecifyShape]) @op_lifter([tensor.SpecifyShape])
def local_gpua_specifyShape(node, context_name): @register_opt2([tensor.SpecifyShape], 'fast_compile')
if isinstance(node.inputs[0].type, GpuArrayType): def local_gpua_specifyShape(op, context_name, inputs, outputs):
if isinstance(inputs[0].type, GpuArrayType):
return return
inp = [as_gpuarray_variable(node.inputs[0], context_name)] return local_gpua_specifyShape_graph(op, context_name, inputs, outputs)
inp += node.inputs[1:]
@register_opt2([tensor.SpecifyShape], 'fast_compile')
def local_gpua_specifyShape_graph(op, context_name, inputs, outputs):
inp = [as_gpuarray_variable(inputs[0], context_name)]
inp += inputs[1:]
return tensor.specify_shape(*inp) return tensor.specify_shape(*inp)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.compile.ops.Shape]) @op_lifter([theano.compile.ops.Shape])
def local_gpua_shape(node, context_name): def local_gpua_shape(op, context_name, inputs, outputs):
# op_lifter will call this opt too frequently as the output is # op_lifter will call this opt too frequently as the output is
# always on the CPU. # always on the CPU.
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(inputs[0].type, GpuArrayType):
return return
return [as_gpuarray_variable(node.inputs[0], context_name).shape] return local_gpua_shape_graph(op, context_name, inputs, outputs)
# Fixed the track: ``tensor.compile.ops.Shape`` does not exist
# (``theano.tensor`` has no ``compile`` attribute); the op_lifter
# registration for local_gpua_shape above tracks ``theano.compile.ops.Shape``.
@register_opt2([theano.compile.ops.Shape], 'fast_compile')
def local_gpua_shape_graph(op, context_name, inputs, outputs):
    """GraphToGPU path: compute the shape from the GPU-transferred input."""
    return [as_gpuarray_variable(inputs[0], context_name).shape]
def gpu_print_wrapper(op, cnda): def gpu_print_wrapper(op, cnda):
...@@ -492,11 +774,12 @@ def gpu_print_wrapper(op, cnda): ...@@ -492,11 +774,12 @@ def gpu_print_wrapper(op, cnda):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.printing.Print]) @op_lifter([tensor.printing.Print])
def local_gpu_print_op(node, context_name): @register_opt2([tensor.printing.Print], 'fast_compile')
x, = node.inputs def local_gpua_print_op(op, context_name, inputs, outputs):
x, = inputs
gpu_x = as_gpuarray_variable(x, context_name=context_name) gpu_x = as_gpuarray_variable(x, context_name=context_name)
new_op = node.op.__class__(global_fn=gpu_print_wrapper) new_op = op.__class__(global_fn=gpu_print_wrapper)
new_op.old_op = node.op new_op.old_op = op
return new_op(gpu_x) return new_op(gpu_x)
...@@ -570,28 +853,30 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -570,28 +853,30 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([IfElse]) @op_lifter([IfElse])
def local_gpua_lazy_ifelse(node, context_name): @register_opt2([IfElse], 'fast_compile')
if node.op.gpu: def local_gpua_lazy_ifelse(op, context_name, inputs, outputs):
if op.gpu:
return return
c = node.inputs[0] c = inputs[0]
inps = [] inps = []
for v in node.inputs[1:]: for v in inputs[1:]:
if isinstance(v.type, (tensor.TensorType, GpuArrayType)): if isinstance(v.type, tensor.TensorType):
inps.append(as_gpuarray_variable(v, context_name)) inps.append(as_gpuarray_variable(v, context_name))
else: else:
inps.append(v) inps.append(v)
return IfElse(node.op.n_outs, gpu=True)(c, *inps, return_list=True) return IfElse(op.n_outs, gpu=True)(c, *inps, return_list=True)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Join]) @op_lifter([tensor.Join])
def local_gpua_join(node, context_name): @register_opt2([tensor.Join], 'fast_compile')
def local_gpua_join(op, context_name, inputs, outputs):
return gpu_join return gpu_join
@register_opt('fast_compile') @register_opt('fast_compile')
@local_optimizer([GpuJoin]) @local_optimizer([GpuJoin])
def local_gpuajoin_1(node): def local_gpua_join_1(node):
# join of a single element # join of a single element
if (isinstance(node.op, GpuJoin) and if (isinstance(node.op, GpuJoin) and
len(node.inputs) == 2): len(node.inputs) == 2):
...@@ -600,14 +885,16 @@ def local_gpuajoin_1(node): ...@@ -600,14 +885,16 @@ def local_gpuajoin_1(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Split]) @op_lifter([tensor.Split])
def local_gpua_split(node, context_name): @register_opt2([tensor.Split], 'fast_compile')
return GpuSplit(node.op.len_splits) def local_gpua_split(op, context_name, inputs, outputs):
# TODO use props
return GpuSplit(op.len_splits)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Subtensor]) @op_lifter([tensor.Subtensor])
def local_gpua_subtensor(node, context_name): def local_gpua_subtensor(op, context_name, inputs, outputs):
x = node.inputs[0] x = inputs[0]
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_x = x.owner.inputs[0] gpu_x = x.owner.inputs[0]
if (gpu_x.owner and if (gpu_x.owner and
...@@ -617,41 +904,68 @@ def local_gpua_subtensor(node, context_name): ...@@ -617,41 +904,68 @@ def local_gpua_subtensor(node, context_name):
if len(x.clients) == 1: if len(x.clients) == 1:
if any([n == 'output' or any([isinstance(v.type, GpuArrayType) if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
for v in n.inputs + n.outputs]) for v in n.inputs + n.outputs])
for n, _ in node.outputs[0].clients]): for n, _ in outputs[0].clients]):
return return
else: else:
return [host_from_gpu(gpu_x.owner.op(node.outputs[0]))] return [host_from_gpu(gpu_x.owner.op(outputs[0]))]
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(op.idx_list)
@register_opt2([tensor.Subtensor], 'fast_compile')
def local_gpua_subtensor_graph(op, context_name, inputs, outputs):
# We need different code as the condition is different as inputs
# aren't the same.
x = inputs[0]
# We don't want to move the subtensor to the GPU if the inputs is
# on the CPU and the only client of the CPU node is this
# subtensor. This allow to have a smaller transfer.
if (x.owner and isinstance(x.owner.op, GpuFromHost)):
cpu_x = x.owner.inputs[0]
# And it is a shared var or an input of the graph.
# and is used by only 1 node.
# x is in the new graph, so we can't tests its number of clients.
if not cpu_x.owner and len(cpu_x.clients) == 1:
c = outputs[0].clients
# If the subtensor have only 1 client, do it on the CPU.
# We let the other optimization to take care to move the
# next node or not.
if len(c) == 1:
return
return GpuSubtensor(op.idx_list)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor]) @op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node, context_name): @register_opt2([tensor.IncSubtensor], 'fast_compile')
op = GpuIncSubtensor(node.op.idx_list, node.op.inplace, def local_gpua_inc_subtensor(op, context_name, inputs, outputs):
node.op.set_instead_of_inc, op = GpuIncSubtensor(op.idx_list, op.inplace,
node.op.destroyhandler_tolerate_aliased) op.set_instead_of_inc,
ret = op(*node.inputs) op.destroyhandler_tolerate_aliased)
val = getattr(node.outputs[0].tag, 'nan_guard_mode_check', True) ret = op(*inputs)
val = getattr(outputs[0].tag, 'nan_guard_mode_check', True)
ret.tag.nan_guard_mode_check = val ret.tag.nan_guard_mode_check = val
return ret return ret
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1]) @op_lifter([tensor.AdvancedSubtensor1])
def local_gpua_advanced_subtensor(node, context_name): @register_opt2([tensor.AdvancedSubtensor1], 'fast_compile')
def local_gpua_advanced_subtensor(op, context_name, inputs, outputs):
return GpuAdvancedSubtensor1() return GpuAdvancedSubtensor1()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node, context_name): @register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
context = get_context(context_name) context = get_context(context_name)
# This is disabled on non-cuda contexts # This is disabled on non-cuda contexts
if context.kind != b'cuda': if context.kind != b'cuda':
return None return None
x, y, ilist = node.inputs x, y, ilist = inputs
# Gpu Ops needs both inputs to have the same dtype # Gpu Ops needs both inputs to have the same dtype
if (x.type.dtype != y.type.dtype): if (x.type.dtype != y.type.dtype):
...@@ -661,7 +975,7 @@ def local_gpua_advanced_incsubtensor(node, context_name): ...@@ -661,7 +975,7 @@ def local_gpua_advanced_incsubtensor(node, context_name):
if y.type.dtype != dtype: if y.type.dtype != dtype:
y = tensor.cast(y, dtype) y = tensor.cast(y, dtype)
set_instead_of_inc = node.op.set_instead_of_inc set_instead_of_inc = op.set_instead_of_inc
compute_capability = int(context.bin_id[-2]) compute_capability = int(context.bin_id[-2])
...@@ -684,29 +998,31 @@ def local_advincsub1_gpua_inplace(node): ...@@ -684,29 +998,31 @@ def local_advincsub1_gpua_inplace(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod]) @op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
def local_gpua_careduce(node, context_name): @register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod], 'fast_compile')
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul, def local_gpua_careduce(op, context_name, inputs, outputs):
scalar.Maximum, scalar.Minimum)): if isinstance(op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)):
ctx = get_context(context_name) ctx = get_context(context_name)
if ctx.kind == b'opencl': if ctx.kind == b'opencl':
op = GpuCAReduceCPY op2 = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]: if op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code. # We don't support yet all reduction with cpy code.
return return
elif ctx.kind == b'cuda': elif ctx.kind == b'cuda':
op = GpuCAReduceCuda op2 = GpuCAReduceCuda
else: else:
return False return False
x, = node.inputs x, = inputs
greduce = op( greduce = op2(
node.op.scalar_op, axis=node.op.axis, op.scalar_op, axis=op.axis,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(op, 'acc_dtype', None))
gvar = greduce(x) gvar = greduce(x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if (op is GpuCAReduceCPY or if (op2 is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([ gvar.owner.op.supports_c_code([
as_gpuarray_variable(x, context_name)])): as_gpuarray_variable(x, context_name)])):
return greduce return greduce
...@@ -717,11 +1033,11 @@ def local_gpua_careduce(node, context_name): ...@@ -717,11 +1033,11 @@ def local_gpua_careduce(node, context_name):
# to make them a single dimension, do the reduction, and # to make them a single dimension, do the reduction, and
# then reshape to get them back. # then reshape to get them back.
if node.op.axis is None: if op.axis is None:
reduce_mask = [1] * x.type.ndim reduce_mask = [1] * x.type.ndim
else: else:
reduce_mask = [0] * x.type.ndim reduce_mask = [0] * x.type.ndim
for a in node.op.axis: for a in op.axis:
assert reduce_mask[a] == 0 assert reduce_mask[a] == 0
reduce_mask[a] = 1 reduce_mask[a] = 1
...@@ -737,11 +1053,11 @@ def local_gpua_careduce(node, context_name): ...@@ -737,11 +1053,11 @@ def local_gpua_careduce(node, context_name):
for idx, m in enumerate(new_mask): for idx, m in enumerate(new_mask):
if m == 1: if m == 1:
new_axis.append(idx) new_axis.append(idx)
greduce = op( greduce = op2(
node.op.scalar_op, op.scalar_op,
axis=new_axis, reduce_mask=new_mask, axis=new_axis, reduce_mask=new_mask,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp)) reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name) gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
...@@ -750,16 +1066,15 @@ def local_gpua_careduce(node, context_name): ...@@ -750,16 +1066,15 @@ def local_gpua_careduce(node, context_name):
# be None # be None
reshaped_gpu_inputs = [gpu_reshaped_x] reshaped_gpu_inputs = [gpu_reshaped_x]
if greduce.supports_c_code(reshaped_gpu_inputs): if greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = host_from_gpu( reduce_reshaped_x = greduce(gpu_reshaped_x)
greduce(gpu_reshaped_x))
if reduce_reshaped_x.ndim != node.outputs[0].ndim: if reduce_reshaped_x.ndim != outputs[0].ndim:
out_shp = [] out_shp = []
for i in range(x.ndim): for i in range(x.ndim):
if i not in node.op.axis: if i not in op.axis:
out_shp.append(shape_i(x, i)) out_shp.append(shape_i(x, i))
unreshaped_reduce = reduce_reshaped_x.reshape( unreshaped_reduce = GpuReshape(len(out_shp))(reduce_reshaped_x,
tensor.stack(out_shp)) tensor.stack(out_shp))
else: else:
unreshaped_reduce = reduce_reshaped_x unreshaped_reduce = reduce_reshaped_x
return [unreshaped_reduce] return [unreshaped_reduce]
...@@ -767,8 +1082,9 @@ def local_gpua_careduce(node, context_name): ...@@ -767,8 +1082,9 @@ def local_gpua_careduce(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv]) @op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
def local_gpua_gemv(node, context_name): @register_opt2([tensor.blas.Gemv], 'fast_compile')
if node.op.inplace: def local_gpua_gemv(op, context_name, inputs, outputs):
if op.inplace:
return gpugemv_inplace return gpugemv_inplace
else: else:
return gpugemv_no_inplace return gpugemv_no_inplace
...@@ -776,8 +1092,9 @@ def local_gpua_gemv(node, context_name): ...@@ -776,8 +1092,9 @@ def local_gpua_gemv(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemm]) @op_lifter([tensor.blas.Gemm])
def local_gpua_gemm(node, context_name): @register_opt2([tensor.blas.Gemm], 'fast_compile')
if node.op.inplace: def local_gpua_gemm(op, context_name, inputs, outputs):
if op.inplace:
return gpugemm_inplace return gpugemm_inplace
else: else:
return gpugemm_no_inplace return gpugemm_no_inplace
...@@ -785,27 +1102,29 @@ def local_gpua_gemm(node, context_name): ...@@ -785,27 +1102,29 @@ def local_gpua_gemm(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.BatchedDot]) @op_lifter([tensor.blas.BatchedDot])
def local_gpua_gemmbatch(node, context_name): @register_opt2([tensor.blas.BatchedDot], 'fast_compile')
a, b = node.inputs def local_gpua_gemmbatch(op, context_name, inputs, outputs):
a, b = inputs
c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2]) c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0) return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Dot]) @op_lifter([tensor.basic.Dot])
def local_gpua_hgemm(node, context_name): @register_opt2([tensor.basic.Dot], 'fast_compile')
def local_gpua_hgemm(op, context_name, inputs, outputs):
from theano.sandbox.cuda import nvcc_compiler from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5': if nvcc_compiler.nvcc_version < '7.5':
_logger.warning("Not performing dot of float16 on the GPU since " _logger.warning("Not performing dot of float16 on the GPU since "
"cuda 7.5 is not available. Updating could speed up " "cuda 7.5 is not available. Updating could speed up "
"your code.") "your code.")
return return
A = node.inputs[0] A = inputs[0]
B = node.inputs[1] B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = outputs[0].fgraph
C = GpuAllocEmpty(dtype='float16', context_name=context_name)( C = gpu_alloc_empty(context_name, dtype='float16')(
shape_i(A, 0, fgraph), shape_i(A, 0, fgraph),
shape_i(B, 1, fgraph)) shape_i(B, 1, fgraph))
return gpugemm_no_inplace(C, 1.0, A, B, 0.0) return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
...@@ -813,95 +1132,106 @@ def local_gpua_hgemm(node, context_name): ...@@ -813,95 +1132,106 @@ def local_gpua_hgemm(node, context_name):
@register_opt() @register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4) @alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
def local_gpuagemm_alpha_merge(node, *inputs): def local_gpua_gemm_alpha_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)] return [gpugemm_no_inplace(*inputs)]
@register_opt() @register_opt()
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0) @output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemm_output_merge(node, *inputs): def local_gpua_gemm_output_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)] return [gpugemm_no_inplace(*inputs)]
@register_opt() @register_opt()
@alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4) @alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4)
def local_gpuagemmbatch_alpha_merge(node, *inputs): def local_gpua_gemmbatch_alpha_merge(node, *inputs):
return [gpugemmbatch_no_inplace(*inputs)] return [gpugemmbatch_no_inplace(*inputs)]
@register_opt() @register_opt()
@output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0) @output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemmbatch_output_merge(node, *inputs): def local_gpua_gemmbatch_output_merge(node, *inputs):
return [gpugemmbatch_no_inplace(*inputs)] return [gpugemmbatch_no_inplace(*inputs)]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer]) @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node, context_name): @register_opt2([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer], 'fast_compile')
return GpuGer(inplace=node.op.destructive) def local_gpua_ger(op, context_name, inputs, outputs):
return GpuGer(inplace=op.destructive)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22]) @op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node, context_name): @register_opt2([tensor.blas.Dot22], 'fast_compile')
def local_gpua_dot22(op, context_name, inputs, outputs):
return gpu_dot22 return gpu_dot22
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22Scalar]) @op_lifter([tensor.blas.Dot22Scalar])
def local_gpua_dot22scalar(node, context_name): @register_opt2([tensor.blas.Dot22Scalar], 'fast_compile')
x, y, a = node.inputs def local_gpua_dot22scalar(op, context_name, inputs, outputs):
x, y, a = inputs
x = as_gpuarray_variable(x, context_name) x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name) y = as_gpuarray_variable(y, context_name)
z = GpuAllocEmpty(x.dtype, context_name)(x.shape[0], y.shape[1]) z = gpu_alloc_empty(context_name, dtype=x.dtype)(x.shape[0], y.shape[1])
return [gpugemm_no_inplace(z, a, x, y, 0)] return [gpugemm_no_inplace(z, a, x, y, 0)]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Eye]) @op_lifter([tensor.basic.Eye])
def local_gpua_eye(node, context_name): @register_opt2([tensor.basic.Eye], 'fast_compile')
return GpuEye(dtype=node.op.dtype, context_name=context_name) def local_gpua_eye(op, context_name, inputs, outputs):
return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node, context_name): @register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile')
def local_gpua_crossentropysoftmaxargmax1hotwithbias(op, context_name, inputs, outputs):
return gpu_crossentropy_softmax_argmax_1hot_with_bias return gpu_crossentropy_softmax_argmax_1hot_with_bias
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
def local_gpua_crossentropysoftmax1hotwithbiasdx(node, context_name): @register_opt2([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], 'fast_compile')
def local_gpua_crossentropysoftmax1hotwithbiasdx(op, context_name, inputs, outputs):
return gpu_crossentropy_softmax_1hot_with_bias_dx return gpu_crossentropy_softmax_1hot_with_bias_dx
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax], cuda_only=True) @op_lifter([tensor.nnet.Softmax], cuda_only=True)
def local_gpua_softmax(node, context_name): @register_opt2([tensor.nnet.Softmax], 'fast_compile')
def local_gpua_softmax(op, context_name, inputs, outputs):
return gpu_softmax return gpu_softmax
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True) @op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
def local_gpua_softmaxwithbias(node, context_name): @register_opt2([tensor.nnet.SoftmaxWithBias], 'fast_compile')
def local_gpua_softmaxwithbias(op, context_name, inputs, outputs):
return gpu_softmax_with_bias return gpu_softmax_with_bias
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.opt.Assert]) @op_lifter([theano.tensor.opt.Assert])
def local_assert(node, context_name): def local_gpua_assert(op, context_name, inputs, outputs):
if isinstance(inputs[0].type, GpuArrayType):
# Check if input nodes are already on the GPU
if isinstance(node.inputs[0].type, GpuArrayType):
return return
return [host_from_gpu(node.op(as_gpuarray_variable(node.inputs[0], return local_gpua_assert_graph(op, context_name, inputs, outputs)
context_name),
*node.inputs[1:]))]
@register_opt2([theano.tensor.opt.Assert], 'fast_compile')
def local_gpua_assert_graph(op, context_name, inputs, outputs):
return [op(as_gpuarray_variable(inputs[0], context_name),
*inputs[1:])]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([ConvOp]) @op_lifter([ConvOp])
def local_error_convop(node, context_name): @register_opt2([ConvOp], 'fast_compile')
def local_gpua_error_convop(op, context_name, inputs, outputs):
assert False, """ assert False, """
ConvOp does not work with the gpuarray backend. ConvOp does not work with the gpuarray backend.
...@@ -912,8 +1242,9 @@ theano.tensor.nnet.conv2d() ...@@ -912,8 +1242,9 @@ theano.tensor.nnet.conv2d()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([SparseBlockGemv]) @op_lifter([SparseBlockGemv])
def local_lift_sparseblockgemv(node, context_name): @register_opt2([SparseBlockGemv], 'fast_compile')
if node.op.inplace: def local_gpua_sparseblockgemv(op, context_name, inputs, outputs):
if op.inplace:
return gpu_sparse_block_gemv_inplace return gpu_sparse_block_gemv_inplace
else: else:
return gpu_sparse_block_gemv return gpu_sparse_block_gemv
...@@ -921,8 +1252,9 @@ def local_lift_sparseblockgemv(node, context_name): ...@@ -921,8 +1252,9 @@ def local_lift_sparseblockgemv(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([SparseBlockOuter]) @op_lifter([SparseBlockOuter])
def local_lift_sparseblockouter(node, context_name): @register_opt2([SparseBlockOuter], 'fast_compile')
if node.op.inplace: def local_gpua_sparseblockouter(op, context_name, inputs, outputs):
if op.inplace:
return gpu_sparse_block_outer_inplace return gpu_sparse_block_outer_inplace
else: else:
return gpu_sparse_block_outer return gpu_sparse_block_outer
...@@ -943,20 +1275,27 @@ def local_inplace_sparseblockouter(node): ...@@ -943,20 +1275,27 @@ def local_inplace_sparseblockouter(node):
# This deals with any abstract convs that have a transfer somewhere # This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile') @register_opt('fast_compile', 'conv_dnn', 'cudnn')
@op_lifter([AbstractConv2d, @op_lifter([AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs]) AbstractConv2d_gradInputs])
def local_lift_abstractconv2d(node, context_name): def local_gpua_abstractconv2d(op, context_name, inputs, outputs):
if isinstance(node.outputs[0].type, GpuArrayType): if isinstance(outputs[0].type, GpuArrayType):
# Don't handle this node here, it's already on the GPU. # Don't handle this node here, it's already on the GPU.
return return
inps = list(node.inputs) return local_gpua_lift_abstractconv2d_graph(op, context_name, inputs, outputs)
inps[0] = as_gpuarray_variable(node.inputs[0],
@register_opt2([AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_gpua_lift_abstractconv2d_graph(op, context_name, inputs, outputs):
inps = list(inputs)
inps[0] = as_gpuarray_variable(inputs[0],
context_name=context_name) context_name=context_name)
inps[1] = as_gpuarray_variable(node.inputs[1], inps[1] = as_gpuarray_variable(inputs[1],
context_name=context_name) context_name=context_name)
return [node.op(*inps)] return [op(*inps)]
# Register this here so that it goes after the abstract lifting # Register this here so that it goes after the abstract lifting
register_opt('fast_compile')(conv_groupopt) register_opt('fast_compile')(conv_groupopt)
...@@ -980,10 +1319,10 @@ def local_gpu_elemwise_careduce(node): ...@@ -980,10 +1319,10 @@ def local_gpu_elemwise_careduce(node):
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)): isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
op = node.op op = node.op
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
return [GpuCAReduceCuda(scalar_op=op.scalar_op, return [gpu_ca_reduce_cuda(scalar_op=op.scalar_op,
axis=op.axis, axis=op.axis,
reduce_mask=op.reduce_mask, reduce_mask=op.reduce_mask,
pre_scalar_op=scalar.basic.sqr)(inp)] pre_scalar_op=scalar.basic.sqr)(inp)]
@local_optimizer(None) @local_optimizer(None)
...@@ -1063,35 +1402,36 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -1063,35 +1402,36 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt('scan', 'fast_compile') @register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan]) @op_lifter([scan_op.Scan])
def local_scan_to_gpua(node, context_name): @register_opt2([scan_op.Scan], 'fast_compile')
info = copy.deepcopy(node.op.info) def local_gpua_scan_to_gpua(op, context_name, inputs, outputs):
info = copy.deepcopy(op.info)
if info.get('gpua', False): if info.get('gpua', False):
return return
info['gpua'] = True info['gpua'] = True
nw_ins = [node.inputs[0]] nw_ins = [inputs[0]]
e = (1 + e = (1 +
node.op.n_seqs + op.n_seqs +
node.op.n_mit_mot + op.n_mit_mot +
node.op.n_mit_sot + op.n_mit_sot +
node.op.n_sit_sot + op.n_sit_sot +
node.op.n_shared_outs) op.n_shared_outs)
nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[1:e]] nw_ins += [safe_to_gpu(x, context_name) for x in inputs[1:e]]
b = e b = e
e = e + node.op.n_nit_sot e = e + op.n_nit_sot
nw_ins += node.inputs[b:e] nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[e:]] nw_ins += [safe_to_gpu(x, context_name) for x in inputs[e:]]
scan_ins = [tensor_to_gpu(x, context_name) for x in node.op.inputs] scan_ins = [tensor_to_gpu(x, context_name) for x in op.inputs]
# The inner output corresponding to the looping condition should not be # The inner output corresponding to the looping condition should not be
# moved to the gpu # moved to the gpu
if node.op.info['as_while']: if op.info['as_while']:
scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs[:-1]] scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs[:-1]]
scan_outs += [node.op.outputs[-1]] scan_outs += [op.outputs[-1]]
else: else:
scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs] scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs]
scan_outs = scan_utils.clone( scan_outs = scan_utils.clone(
scan_outs, scan_outs,
replace=list(zip(node.op.inputs, replace=list(zip(op.inputs,
(safe_to_cpu(x) for x in scan_ins)))) (safe_to_cpu(x) for x in scan_ins))))
# We need to construct the hash here, because scan # We need to construct the hash here, because scan
......
...@@ -8,7 +8,7 @@ from theano.gof import local_optimizer ...@@ -8,7 +8,7 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value, from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError) NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, gpu_alloc_empty
from .elemwise import GpuDimShuffle, GpuElemwise from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32')) _one = scal.constant(numpy.asarray(1.0, dtype='float32'))
...@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx): ...@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx):
if (alloc.owner and if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1): len(alloc.clients) > 1):
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype, alloc_op = gpu_alloc_empty(alloc.owner.op.context_name, dtype=alloc.owner.op.dtype)
alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs) inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs) return maker(node, inputs)
return opt return opt
......
...@@ -26,9 +26,11 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d): ...@@ -26,9 +26,11 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
if not dnn_available(test_ctx_name): if not dnn_available(test_ctx_name):
raise SkipTest(dnn_available.msg) raise SkipTest(dnn_available.msg)
mode = mode_with_gpu mode = mode_with_gpu
if fd != (1, 1): if fd != (1, 1):
raise SkipTest("Doesn't have CUDNN implementation") raise SkipTest("Doesn't have CUDNN implementation")
o = self.get_output_shape(i, f, s, b, fd) o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s, self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode, verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b, provide_shape=provide_shape, border_mode=b,
......
...@@ -396,7 +396,7 @@ def test_gpueye(): ...@@ -396,7 +396,7 @@ def test_gpueye():
k_symb = numpy.asarray(0) k_symb = numpy.asarray(0)
out = T.eye(N_symb, M_symb, k_symb, dtype=dtype) out = T.eye(N_symb, M_symb, k_symb, dtype=dtype)
f = theano.function([N_symb, M_symb], f = theano.function([N_symb, M_symb],
out, T.stack(out),
mode=mode_with_gpu) mode=mode_with_gpu)
result = numpy.asarray(f(N, M)) result = numpy.asarray(f(N, M))
assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype)) assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype))
......
...@@ -138,11 +138,21 @@ def test_local_gpualloc_memset_0(): ...@@ -138,11 +138,21 @@ def test_local_gpualloc_memset_0():
ones = numpy.ones((2,), dtype='float32') ones = numpy.ones((2,), dtype='float32')
# Test with 0 from CPU op. # Test with 0 from CPU op.
# Should not be transfered as the only client is the output
a = tensor.alloc(z, i) a = tensor.alloc(z, i)
f = theano.function([i], a, mode=mode_with_gpu) f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 1
assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0 assert isinstance(topo[0].op, theano.tensor.Alloc)
assert (numpy.asarray(f(6)) == 0).all()
# Test with 0 from CPU op.
# Should be transfered as it is used by another op.
a = tensor.alloc(z, i)
f = theano.function([i], a.cumsum(), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[0].op, GpuAlloc)
assert (numpy.asarray(f(6)) == 0).all() assert (numpy.asarray(f(6)) == 0).all()
# Test with 0 # Test with 0
...@@ -177,19 +187,30 @@ def test_local_gpualloc_empty(): ...@@ -177,19 +187,30 @@ def test_local_gpualloc_empty():
ii = theano.tensor.iscalar() ii = theano.tensor.iscalar()
# Test with vector # Test with vector
# Should not be moved as the only client is the output
a = tensor.AllocEmpty('float32')(i) a = tensor.AllocEmpty('float32')(i)
f = theano.function([i], a, mode=mode_with_gpu) f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 1
assert isinstance(topo[0].op, theano.tensor.AllocEmpty)
# This return not initilized data, so we can only check the shape
assert f(3).shape == (3,)
# Test with vector
# Should be moved
a = tensor.AllocEmpty('float32')(i)
f = theano.function([i], a.cumsum(), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[0].op, GpuAllocEmpty) assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3).shape == (3,) assert f(3).shape == (3,)
# Test with matrix # Test with matrix
a = tensor.AllocEmpty('float32')(i, ii) a = tensor.AllocEmpty('float32')(i, ii)
f = theano.function([i, ii], a, mode=mode_with_gpu) f = theano.function([i, ii], a.cumsum(axis=0), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 3
assert isinstance(topo[0].op, GpuAllocEmpty) assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3, 4).shape == (3, 4) assert f(3, 4).shape == (3, 4)
...@@ -334,7 +355,10 @@ def test_local_gpu_subtensor(): ...@@ -334,7 +355,10 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert any([isinstance(node.op, GpuElemwise) for node in topo]) # Our optimizer isn't smart enough to move to the GPU Elemwise.
# If it where just a little bit smarter, it could wrongly move it to the GPU.
# If it where super smart, it would know it should not move it to the GPU.
assert any([isinstance(node.op, tensor.Elemwise) for node in topo])
def test_local_gpu_elemwise(): def test_local_gpu_elemwise():
...@@ -427,7 +451,7 @@ def test_local_assert_no_cpu_op(): ...@@ -427,7 +451,7 @@ def test_local_assert_no_cpu_op():
out = theano.tensor.tanh(ms).dot(ms.T) out = theano.tensor.tanh(ms).dot(ms.T)
mode_local_assert = mode_with_gpu.including("assert_no_cpu_op") mode_local_assert = mode_with_gpu.including("assert_no_cpu_op")
mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise") mode_local_assert = mode_local_assert.excluding("local_gpua_elemwise")
old = theano.config.assert_no_cpu_op old = theano.config.assert_no_cpu_op
old2 = theano.config.on_opt_error old2 = theano.config.on_opt_error
......
...@@ -233,7 +233,7 @@ class GpuArrayType(Type): ...@@ -233,7 +233,7 @@ class GpuArrayType(Type):
return data return data
def filter_variable(self, other, allow_convert=True): def filter_variable(self, other, allow_convert=True):
from theano.gpuarray import GpuFromHost from theano.gpuarray.basic_ops import gpu_from_host
if hasattr(other, '_as_GpuArrayVariable'): if hasattr(other, '_as_GpuArrayVariable'):
other = other._as_GpuArrayVariable(self.context_name) other = other._as_GpuArrayVariable(self.context_name)
...@@ -265,7 +265,7 @@ class GpuArrayType(Type): ...@@ -265,7 +265,7 @@ class GpuArrayType(Type):
str(self.broadcastable))) str(self.broadcastable)))
other = other2 other = other2
return GpuFromHost(self.context_name)(other) return gpu_from_host(self.context_name)(other)
@staticmethod @staticmethod
def values_eq(a, b, force_same_dtype=True): def values_eq(a, b, force_same_dtype=True):
......
...@@ -24,10 +24,11 @@ from . import multinomial ...@@ -24,10 +24,11 @@ from . import multinomial
import theano.sandbox.cuda import theano.sandbox.cuda
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name
from theano.gpuarray.type import GpuArrayType from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua, from theano.gpuarray.opt import (register_opt as register_gpua,
register_opt2,
host_from_gpu as host_from_gpua) host_from_gpu as host_from_gpua)
if theano.sandbox.cuda.cuda_available: if theano.sandbox.cuda.cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType, from theano.sandbox.cuda import (CudaNdarrayType,
...@@ -1551,17 +1552,22 @@ class MRG_RandomStreams(object): ...@@ -1551,17 +1552,22 @@ class MRG_RandomStreams(object):
return final_samples return final_samples
@register_opt2([mrg_uniform], 'fast_compile')
def local_gpua_mrg_graph(op, context_name, inputs, outputs):
if (type(op) == mrg_uniform and
isinstance(inputs[0].type, GpuArrayType)):
outs = GPUA_mrg_uniform.new(inputs[0],
op.output_type.ndim,
op.output_type.dtype,
inputs[1])
return [outs[0], host_from_gpua(outs[1])]
@register_gpua('fast_compile') @register_gpua('fast_compile')
@local_optimizer([mrg_uniform]) @local_optimizer([mrg_uniform])
def local_gpua_mrg(node): def local_gpua_mrg(node):
# TODO : need description for function context_name = infer_context_name(*node.inputs)
if (type(node.op) == mrg_uniform and return local_gpua_mrg_graph(node.op, context_name, node.inputs, node.outputs)
isinstance(node.inputs[0].type, GpuArrayType)):
outs = GPUA_mrg_uniform.new(node.inputs[0],
node.op.output_type.ndim,
node.op.output_type.dtype,
node.inputs[1])
return [outs[0], host_from_gpua(outs[1])]
MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform) MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)
......
...@@ -152,13 +152,15 @@ def traverse(out, x, x_copy, d, visited=None): ...@@ -152,13 +152,15 @@ def traverse(out, x, x_copy, d, visited=None):
return d return d
visited.add(out) visited.add(out)
from theano.sandbox import cuda from theano.sandbox import cuda
from theano import gpuarray from theano.gpuarray.basic_ops import gpu_from_host, host_from_gpu
from theano.gpuarray import pygpu_activated
from theano.gpuarray.type import GpuArrayType
if out == x: if out == x:
if isinstance(x.type, cuda.CudaNdarrayType): if isinstance(x.type, cuda.CudaNdarrayType):
d[out] = cuda.gpu_from_host(x_copy) d[out] = cuda.gpu_from_host(x_copy)
else: else:
assert isinstance(x.type, gpuarray.GpuArrayType) assert isinstance(x.type, GpuArrayType)
d[out] = gpuarray.GpuFromHost(x.type.context_name)(x_copy) d[out] = gpu_from_host(x.type.context_name)(x_copy)
return d return d
elif out.owner is None: elif out.owner is None:
return d return d
...@@ -167,8 +169,8 @@ def traverse(out, x, x_copy, d, visited=None): ...@@ -167,8 +169,8 @@ def traverse(out, x, x_copy, d, visited=None):
out.owner.inputs == [x]): out.owner.inputs == [x]):
d[out] = tensor.as_tensor_variable(x_copy) d[out] = tensor.as_tensor_variable(x_copy)
return d return d
elif (gpuarray.pygpu_activated and elif (pygpu_activated and
out.owner.op == gpuarray.host_from_gpu and out.owner.op == host_from_gpu and
out.owner.inputs == [x]): out.owner.inputs == [x]):
d[out] = tensor.as_tensor_variable(x_copy) d[out] = tensor.as_tensor_variable(x_copy)
return d return d
......
...@@ -630,9 +630,15 @@ def get_scalar_constant_value(orig_v, elemwise=True, ...@@ -630,9 +630,15 @@ def get_scalar_constant_value(orig_v, elemwise=True,
v = v.owner.inputs[0] v = v.owner.inputs[0]
continue continue
elif isinstance(v.owner.op, theano.compile.ops.Shape_i): elif isinstance(v.owner.op, theano.compile.ops.Shape_i):
if isinstance(v.owner.inputs[0], Constant): i = v.owner.op.i
return numpy.asarray( inp = v.owner.inputs[0]
v.owner.inputs[0].data.shape[v.owner.op.i]) if isinstance(inp, Constant):
return numpy.asarray(inp.data.shape[i])
# The shape of a broadcastable dimension is 1
if (hasattr(inp.type, 'broadcastable') and
inp.type.broadcastable[i]):
return numpy.asarray(1)
# Don't act as the constant_folding optimization here as this # Don't act as the constant_folding optimization here as this
# fct is used too early in the optimization phase. This would # fct is used too early in the optimization phase. This would
# mess with the stabilization optimization and be too slow. # mess with the stabilization optimization and be too slow.
...@@ -2690,15 +2696,18 @@ class Alloc(gof.Op): ...@@ -2690,15 +2696,18 @@ class Alloc(gof.Op):
sh = [as_tensor_variable(s) for s in shape] sh = [as_tensor_variable(s) for s in shape]
bcast = [] bcast = []
for i, s in enumerate(sh): for i, s in enumerate(sh):
if config.exception_verbosity == 'high': def err_str():
s_as_str = '\n' + min_informative_str(s) if config.exception_verbosity == 'high':
else: return '\n' + min_informative_str(s)
s_as_str = str(s) else:
return str(s)
if s.type.dtype[:3] not in ('int', 'uin'): if s.type.dtype[:3] not in ('int', 'uin'):
s_as_str = err_str()
raise TypeError('Shape arguments to Alloc must be integers, ' raise TypeError('Shape arguments to Alloc must be integers, '
'but argument %s is not for apply node: %s' % 'but argument %s is not for apply node: %s' %
(i, s_as_str)) (i, s_as_str))
if s.ndim != 0: if s.ndim != 0:
s_as_str = err_str()
raise TypeError( raise TypeError(
"Each shape dimension to Alloc must be a scalar, ", "Each shape dimension to Alloc must be a scalar, ",
'but dimension %s have %d dimensions for apply node: %s' % 'but dimension %s have %d dimensions for apply node: %s' %
......
...@@ -66,8 +66,10 @@ def get_conv_output_shape(image_shape, kernel_shape, ...@@ -66,8 +66,10 @@ def get_conv_output_shape(image_shape, kernel_shape,
""" """
bsize, imshp = image_shape[0], image_shape[2:] bsize, imshp = image_shape[0], image_shape[2:]
nkern, kshp = kernel_shape[0], kernel_shape[2:] nkern, kshp = kernel_shape[0], kernel_shape[2:]
if filter_dilation is None: if filter_dilation is None:
filter_dilation = numpy.ones(len(subsample), dtype='int') filter_dilation = numpy.ones(len(subsample), dtype='int')
if isinstance(border_mode, tuple): if isinstance(border_mode, tuple):
out_shp = tuple(get_conv_shape_1axis( out_shp = tuple(get_conv_shape_1axis(
imshp[i], kshp[i], border_mode[i], imshp[i], kshp[i], border_mode[i],
...@@ -121,7 +123,16 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode, ...@@ -121,7 +123,16 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
pad = border_mode pad = border_mode
if pad < 0: if pad < 0:
raise ValueError("border_mode must be >= 0") raise ValueError("border_mode must be >= 0")
out_shp = (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
# In case of symbolic shape, we want to build the smallest graph
# (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
if pad == 0:
out_shp = (image_shape - dil_kernel_shape)
else:
out_shp = (image_shape + 2 * pad - dil_kernel_shape)
if subsample != 1:
out_shp = out_shp // subsample
out_shp = out_shp + 1
return out_shp return out_shp
......
...@@ -7003,6 +7003,9 @@ class T_get_scalar_constant_value(unittest.TestCase): ...@@ -7003,6 +7003,9 @@ class T_get_scalar_constant_value(unittest.TestCase):
assert get_scalar_constant_value(s) == 3 assert get_scalar_constant_value(s) == 3
s = opt.Shape_i(1)(c) s = opt.Shape_i(1)(c)
assert get_scalar_constant_value(s) == 4 assert get_scalar_constant_value(s) == 4
d = theano.shared(numpy.random.randn(1,1), broadcastable=(True, True))
f = theano.tensor.basic.ScalarFromTensor()(opt.Shape_i(0)(d))
assert get_scalar_constant_value(f) == 1
def test_elemwise(self): def test_elemwise(self):
# We test only for a few elemwise, the list of all supported # We test only for a few elemwise, the list of all supported
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论