提交 a24fd9bb authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #4570 from Sentient07/new_graph2gpu

New graph2gpu
...@@ -402,6 +402,14 @@ class Shape_i(gof.Op): ...@@ -402,6 +402,14 @@ class Shape_i(gof.Op):
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
return [()] return [()]
def connection_pattern(self, node):
    """Declare the input as disconnected for gradient purposes.

    Shape_i's output depends only on the *shape* of its input tensor,
    never on the element values, so the elements do not participate in
    the computation and the graph connection is reported as absent.
    """
    return [[False]]
def grad(self, inp, grads): def grad(self, inp, grads):
return [theano.gradient.grad_not_implemented( return [theano.gradient.grad_not_implemented(
op=self, x_pos=0, x=inp[0], op=self, x_pos=0, x=inp[0],
...@@ -455,6 +463,14 @@ def shape_i(var, i, fgraph=None): ...@@ -455,6 +463,14 @@ def shape_i(var, i, fgraph=None):
return var.shape[i] return var.shape[i]
def shape_i_op(i):
    """Return a cached ``Shape_i(i)`` instance, creating it on first use.

    Reusing a single op instance per dimension index lets the merge
    optimizer identify identical shape computations.
    """
    try:
        return shape_i_op.cache[i]
    except KeyError:
        op = Shape_i(i)
        shape_i_op.cache[i] = op
        return op
shape_i_op.cache = {}
def register_shape_i_c_code(typ, code, check_input, version=()): def register_shape_i_c_code(typ, code, check_input, version=()):
""" """
Tell Shape_i how to generate C code for a Theano Type. Tell Shape_i how to generate C code for a Theano Type.
......
...@@ -54,7 +54,7 @@ def _atexit_print_fn(): ...@@ -54,7 +54,7 @@ def _atexit_print_fn():
destination_file = open(config.profiling.destination, 'w') destination_file = open(config.profiling.destination, 'w')
for ps in _atexit_print_list: for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0: if ps.fct_callcount >= 1 or ps.compile_time > 1:
ps.summary(file=destination_file, ps.summary(file=destination_file,
n_ops_to_print=config.profiling.n_ops, n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply) n_apply_to_print=config.profiling.n_apply)
......
...@@ -2413,7 +2413,7 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -2413,7 +2413,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
for (t, count, n_created, o) in count_opt[::-1]: for (t, count, n_created, o) in count_opt[::-1]:
print(blanc, ' %.3fs - %d - %d - %s' % ( print(blanc, ' %.3fs - %d - %d - %s' % (
t, count, n_created, o), file=stream) t, count, n_created, o), file=stream)
print(blanc, ' %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % ( print(blanc, ' %.3fs - in %d optimization that were not used (display only those with a runtime > 0)' % (
not_used_time, len(not_used)), file=stream) not_used_time, len(not_used)), file=stream)
not_used.sort(key=lambda nu: (nu[0], str(nu[1]))) not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
for (t, o) in not_used[::-1]: for (t, o) in not_used[::-1]:
......
...@@ -70,7 +70,7 @@ def as_gpuarray_variable(x, context_name): ...@@ -70,7 +70,7 @@ def as_gpuarray_variable(x, context_name):
# If we couldn't deal with transfers, then maybe it's a tensor # If we couldn't deal with transfers, then maybe it's a tensor
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
return GpuFromHost(context_name)(x) return gpu_from_host(context_name)(x)
# Try _as_GpuArrayVariable if possible # Try _as_GpuArrayVariable if possible
if hasattr(x, '_as_GpuArrayVariable'): if hasattr(x, '_as_GpuArrayVariable'):
...@@ -544,7 +544,7 @@ class HostFromGpu(Op): ...@@ -544,7 +544,7 @@ class HostFromGpu(Op):
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [GpuFromHost(inputs[0].type.context_name)(gz)] return [gpu_from_host(inputs[0].type.context_name)(gz)]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
...@@ -647,6 +647,14 @@ class GpuFromHost(Op): ...@@ -647,6 +647,14 @@ class GpuFromHost(Op):
return (9,) return (9,)
# Caching GpuFromHost
def gpu_from_host(ctx):
    """Return a cached ``GpuFromHost`` op for the given context name.

    Bug fix: the membership test previously looked in ``gpu_alloc.cache``
    (whose keys are ``(ctx, memset_0)`` tuples), so it was always True and
    a new ``GpuFromHost`` instance was built on every call, defeating the
    cache. Test membership in this function's own cache instead.
    """
    if ctx not in gpu_from_host.cache:
        gpu_from_host.cache[ctx] = GpuFromHost(ctx)
    return gpu_from_host.cache[ctx]
gpu_from_host.cache = {}
class GpuToGpu(Op): class GpuToGpu(Op):
""" """
Transfer data between GPUs. Transfer data between GPUs.
...@@ -870,6 +878,15 @@ class GpuAlloc(HideC, Alloc): ...@@ -870,6 +878,15 @@ class GpuAlloc(HideC, Alloc):
return True return True
# Caching GpuAlloc
def gpu_alloc(ctx, memset_0=False):
    """Return a cached ``GpuAlloc`` op for ``(ctx, memset_0)``.

    A single instance is kept per (context, memset flag) pair so that
    repeated lookups yield the same op object.
    """
    cache_key = (ctx, memset_0)
    op = gpu_alloc.cache.get(cache_key)
    if op is None:
        op = GpuAlloc(ctx, memset_0)
        gpu_alloc.cache[cache_key] = op
    return op
gpu_alloc.cache = {}
class GpuAllocEmpty(HideC, Alloc): class GpuAllocEmpty(HideC, Alloc):
""" """
Allocate uninitialized memory on the GPU. Allocate uninitialized memory on the GPU.
...@@ -956,6 +973,14 @@ def empty_like(var): ...@@ -956,6 +973,14 @@ def empty_like(var):
return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape) return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)
def gpu_alloc_empty(ctx, dtype):
    """Return a cached ``GpuAllocEmpty`` op for ``(dtype, ctx)``.

    One instance is kept per (dtype, context) pair and reused across
    calls.
    """
    cache_key = (dtype, ctx)
    try:
        return gpu_alloc_empty.cache[cache_key]
    except KeyError:
        op = GpuAllocEmpty(dtype, ctx)
        gpu_alloc_empty.cache[cache_key] = op
        return op
gpu_alloc_empty.cache = {}
class GpuContiguous(Op): class GpuContiguous(Op):
""" """
Return a C contiguous version of the input. Return a C contiguous version of the input.
...@@ -1031,6 +1056,7 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -1031,6 +1056,7 @@ class GpuReshape(HideC, tensor.Reshape):
def make_node(self, x, shp): def make_node(self, x, shp):
ctx_name = infer_context_name(x) ctx_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name=ctx_name) x = as_gpuarray_variable(x, context_name=ctx_name)
shp = tensor.as_tensor_variable(shp)
res = host_from_gpu(x).reshape(shp, ndim=self.ndim) res = host_from_gpu(x).reshape(shp, ndim=self.ndim)
otype = GpuArrayType(dtype=res.dtype, otype = GpuArrayType(dtype=res.dtype,
broadcastable=res.broadcastable, broadcastable=res.broadcastable,
......
差异被折叠。
...@@ -2587,6 +2587,18 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2587,6 +2587,18 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return kernels return kernels
# Caching GpuCAReduceCuda
def gpu_ca_reduce_cuda(scalar_op, axis=None, reduce_mask=None, dtype=None,
                       acc_dtype=None, pre_scalar_op=None):
    """Return a cached ``GpuCAReduceCuda`` op for the given parameters.

    The full parameter tuple is used as the cache key so each distinct
    reduction configuration maps to exactly one op instance.
    """
    cache_key = (scalar_op, axis, reduce_mask, dtype, acc_dtype,
                 pre_scalar_op)
    try:
        return gpu_ca_reduce_cuda.cache[cache_key]
    except KeyError:
        op = GpuCAReduceCuda(scalar_op, axis, reduce_mask, dtype,
                             acc_dtype, pre_scalar_op)
        gpu_ca_reduce_cuda.cache[cache_key] = op
        return op
gpu_ca_reduce_cuda.cache = {}
class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
""" """
CAReduce that reuse the python code from gpuarray. CAReduce that reuse the python code from gpuarray.
......
...@@ -2,15 +2,14 @@ from __future__ import absolute_import, print_function, division ...@@ -2,15 +2,14 @@ from __future__ import absolute_import, print_function, division
import os import os
from theano import Apply, Op from theano import Apply, Op
from theano.tensor.extra_ops import CumsumOp from theano.tensor.extra_ops import CumsumOp
from .basic_ops import infer_context_name
try: try:
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape)
infer_context_name, GpuFromHost) from .opt import register_opt, op_lifter, register_opt2
from .opt import register_opt as register_gpu_opt, op_lifter
class GpuCumsum(GpuKernelBase, Op): class GpuCumsum(GpuKernelBase, Op):
...@@ -40,7 +39,10 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -40,7 +39,10 @@ class GpuCumsum(GpuKernelBase, Op):
def make_node(self, x): def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum" assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
x = as_gpuarray_variable(x, infer_context_name(x))
context_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name)
if x.ndim > GpuCumsum.SUPPORTED_NDIMS: if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
raise NotImplementedError('Only cumsum on 1D, 2D and\ raise NotImplementedError('Only cumsum on 1D, 2D and\
...@@ -451,24 +453,23 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -451,24 +453,23 @@ class GpuCumsum(GpuKernelBase, Op):
return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code
@register_opt('fast_compile')
@op_lifter([CumsumOp]) @op_lifter([CumsumOp])
def use_gpu_cumsumop(node, ctx_name): @register_opt2([CumsumOp], 'fast_compile')
if node.inputs[0].dtype == 'float32': def local_gpua_cumsumop(op, ctx_name, inputs, outputs):
axis = node.op.axis if inputs[0].dtype == 'float32':
x = node.inputs[0] axis = op.axis
x = inputs[0]
if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS: if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
return None return None
if axis is None and x.ndim > 1: x = as_gpuarray_variable(x, ctx_name)
x = x.flatten()
x = GpuFromHost(ctx_name)(x) if axis is None and x.ndim > 1:
x = GpuReshape(1)(x, (-1,))
# ``gpu_cumsum`` assume array has been flattened if needed. # ``gpu_cumsum`` assume array has been flattened if needed.
if axis is None: if axis is None:
axis = 0 axis = 0
return GpuCumsum(axis)(x) return GpuCumsum(axis)(x)
register_gpu_opt()(use_gpu_cumsumop)
...@@ -9,7 +9,7 @@ from theano.gradient import DisconnectedType ...@@ -9,7 +9,7 @@ from theano.gradient import DisconnectedType
from theano.gpuarray import (basic_ops, GpuArrayType) from theano.gpuarray import (basic_ops, GpuArrayType)
import theano.tensor.fft import theano.tensor.fft
from .opt import register_opt, op_lifter from .opt import register_opt, op_lifter, register_opt2
try: try:
import pygpu import pygpu
...@@ -373,10 +373,12 @@ def _unitary(norm): ...@@ -373,10 +373,12 @@ def _unitary(norm):
if scikits_cuda_available: if scikits_cuda_available:
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.fft.RFFTOp]) @op_lifter([theano.tensor.fft.RFFTOp])
def local_curfft_op(node, context_name): @register_opt2([theano.tensor.fft.RFFTOp], 'fast_compile')
def local_gpua_curfft_op(op, ctx_name, inputs, outputs):
return curfft_op return curfft_op
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.fft.IRFFTOp]) @op_lifter([theano.tensor.fft.IRFFTOp])
def local_cuirfft_op(node, context_name): @register_opt2([theano.tensor.fft.IRFFTOp], 'fast_compile')
def local_gpua_cuirfft_op(op, ctx_name, inputs, outputs):
return cuirfft_op return cuirfft_op
...@@ -14,7 +14,7 @@ from theano.gof import Op ...@@ -14,7 +14,7 @@ from theano.gof import Op
from theano.tensor import NotScalarConstantError, get_scalar_constant_value from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano import gpuarray from theano import gpuarray
from .basic_ops import as_gpuarray_variable, infer_context_name from .basic_ops import as_gpuarray_variable, infer_context_name
from .opt import register_opt, op_lifter from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType from .type import GpuArrayType
...@@ -227,23 +227,24 @@ KERNEL void k_multi_warp_multinomial( ...@@ -227,23 +227,24 @@ KERNEL void k_multi_warp_multinomial(
return (1,) return (1,)
@register_opt() @register_opt('fast_compile')
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform]) @op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
def local_gpua_multinomial(node, context_name): @register_opt2([theano.sandbox.multinomial.MultinomialFromUniform], 'fast_compile')
def local_gpua_multinomial(op, context_name, inputs, outputs):
# TODO : need description for function # TODO : need description for function
if len(node.inputs) == 2: if len(inputs) == 2:
p, u = node.inputs p, u = inputs
n_samples = 1 n_samples = 1
else: else:
p, u, n_samples = node.inputs p, u, n_samples = inputs
try: try:
if get_scalar_constant_value(n_samples) != 1: if get_scalar_constant_value(n_samples) != 1:
return None return None
except NotScalarConstantError: except NotScalarConstantError:
return None return None
m, = node.outputs m, = outputs
if (p.dtype == u.dtype == m.dtype == 'float32'): if (p.dtype == u.dtype == m.dtype == 'float32'):
gpu_op = GPUAMultinomialFromUniform(node.op.odtype) gpu_op = GPUAMultinomialFromUniform(op.odtype)
return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])( return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
gpu_op(p, u)) gpu_op(p, u))
...@@ -13,7 +13,7 @@ except ImportError: ...@@ -13,7 +13,7 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name) infer_context_name)
from .opt import register_opt as register_gpu_opt, op_lifter from .opt import register_opt2, op_lifter, register_opt
from .type import GpuArrayType from .type import GpuArrayType
...@@ -468,9 +468,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -468,9 +468,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
Op.perform(self, node, inp, out, ctx) Op.perform(self, node, inp, out, ctx)
@register_opt('fast_compile')
@op_lifter([Images2Neibs]) @op_lifter([Images2Neibs])
def use_gpu_images2neibs(node, context_name): @register_opt2([Images2Neibs], 'fast_compile')
if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']: def local_gpua_images2neibs(op, context_name, inputs, outputs):
return GpuImages2Neibs(node.op.mode) if op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
return GpuImages2Neibs(op.mode)
register_gpu_opt()(use_gpu_images2neibs)
...@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant ...@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant
from . import opt from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty, from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
infer_context_name) infer_context_name, gpu_alloc_empty)
from .type import gpu_context_type from .type import gpu_context_type
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge
...@@ -147,17 +147,18 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz, ...@@ -147,17 +147,18 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
return '\n'.join(codel) return '\n'.join(codel)
@opt.register_opt() @opt.register_opt('fast_compile')
@opt.op_lifter([tensor.Dot]) @opt.op_lifter([tensor.Dot])
def local_dot_to_gemm16(node, ctx_name): @opt.register_opt2([tensor.Dot], 'fast_compile')
def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs):
if nerv is None: if nerv is None:
return return
A = node.inputs[0] A = inputs[0]
B = node.inputs[1] B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = getattr(outputs[0], 'fgraph', None)
C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)( C = gpu_alloc_empty(ctx_name, dtype='float16')(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0) return Gemm16()(C, 1.0, A, B, 0.0)
......
差异被折叠。
...@@ -8,7 +8,7 @@ from theano.gof import local_optimizer ...@@ -8,7 +8,7 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value, from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError) NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, gpu_alloc_empty
from .elemwise import GpuDimShuffle, GpuElemwise from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32')) _one = scal.constant(numpy.asarray(1.0, dtype='float32'))
...@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx): ...@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx):
if (alloc.owner and if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1): len(alloc.clients) > 1):
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype, alloc_op = gpu_alloc_empty(alloc.owner.op.context_name, dtype=alloc.owner.op.dtype)
alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs) inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs) return maker(node, inputs)
return opt return opt
......
...@@ -26,9 +26,11 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d): ...@@ -26,9 +26,11 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
if not dnn_available(test_ctx_name): if not dnn_available(test_ctx_name):
raise SkipTest(dnn_available.msg) raise SkipTest(dnn_available.msg)
mode = mode_with_gpu mode = mode_with_gpu
if fd != (1, 1): if fd != (1, 1):
raise SkipTest("Doesn't have CUDNN implementation") raise SkipTest("Doesn't have CUDNN implementation")
o = self.get_output_shape(i, f, s, b, fd) o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s, self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode, verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b, provide_shape=provide_shape, border_mode=b,
......
...@@ -396,7 +396,7 @@ def test_gpueye(): ...@@ -396,7 +396,7 @@ def test_gpueye():
k_symb = numpy.asarray(0) k_symb = numpy.asarray(0)
out = T.eye(N_symb, M_symb, k_symb, dtype=dtype) out = T.eye(N_symb, M_symb, k_symb, dtype=dtype)
f = theano.function([N_symb, M_symb], f = theano.function([N_symb, M_symb],
out, T.stack(out),
mode=mode_with_gpu) mode=mode_with_gpu)
result = numpy.asarray(f(N, M)) result = numpy.asarray(f(N, M))
assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype)) assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype))
......
...@@ -138,11 +138,21 @@ def test_local_gpualloc_memset_0(): ...@@ -138,11 +138,21 @@ def test_local_gpualloc_memset_0():
ones = numpy.ones((2,), dtype='float32') ones = numpy.ones((2,), dtype='float32')
# Test with 0 from CPU op. # Test with 0 from CPU op.
# Should not be transfered as the only client is the output
a = tensor.alloc(z, i) a = tensor.alloc(z, i)
f = theano.function([i], a, mode=mode_with_gpu) f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 1
assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0 assert isinstance(topo[0].op, theano.tensor.Alloc)
assert (numpy.asarray(f(6)) == 0).all()
# Test with 0 from CPU op.
# Should be transfered as it is used by another op.
a = tensor.alloc(z, i)
f = theano.function([i], a.cumsum(), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[0].op, GpuAlloc)
assert (numpy.asarray(f(6)) == 0).all() assert (numpy.asarray(f(6)) == 0).all()
# Test with 0 # Test with 0
...@@ -177,19 +187,30 @@ def test_local_gpualloc_empty(): ...@@ -177,19 +187,30 @@ def test_local_gpualloc_empty():
ii = theano.tensor.iscalar() ii = theano.tensor.iscalar()
# Test with vector # Test with vector
# Should not be moved as the only client is the output
a = tensor.AllocEmpty('float32')(i) a = tensor.AllocEmpty('float32')(i)
f = theano.function([i], a, mode=mode_with_gpu) f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 1
assert isinstance(topo[0].op, theano.tensor.AllocEmpty)
# This return not initilized data, so we can only check the shape
assert f(3).shape == (3,)
# Test with vector
# Should be moved
a = tensor.AllocEmpty('float32')(i)
f = theano.function([i], a.cumsum(), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[0].op, GpuAllocEmpty) assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3).shape == (3,) assert f(3).shape == (3,)
# Test with matrix # Test with matrix
a = tensor.AllocEmpty('float32')(i, ii) a = tensor.AllocEmpty('float32')(i, ii)
f = theano.function([i, ii], a, mode=mode_with_gpu) f = theano.function([i, ii], a.cumsum(axis=0), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 3
assert isinstance(topo[0].op, GpuAllocEmpty) assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3, 4).shape == (3, 4) assert f(3, 4).shape == (3, 4)
...@@ -334,7 +355,10 @@ def test_local_gpu_subtensor(): ...@@ -334,7 +355,10 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert any([isinstance(node.op, GpuElemwise) for node in topo]) # Our optimizer isn't smart enough to move to the GPU Elemwise.
# If it where just a little bit smarter, it could wrongly move it to the GPU.
# If it where super smart, it would know it should not move it to the GPU.
assert any([isinstance(node.op, tensor.Elemwise) for node in topo])
def test_local_gpu_elemwise(): def test_local_gpu_elemwise():
...@@ -427,7 +451,7 @@ def test_local_assert_no_cpu_op(): ...@@ -427,7 +451,7 @@ def test_local_assert_no_cpu_op():
out = theano.tensor.tanh(ms).dot(ms.T) out = theano.tensor.tanh(ms).dot(ms.T)
mode_local_assert = mode_with_gpu.including("assert_no_cpu_op") mode_local_assert = mode_with_gpu.including("assert_no_cpu_op")
mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise") mode_local_assert = mode_local_assert.excluding("local_gpua_elemwise")
old = theano.config.assert_no_cpu_op old = theano.config.assert_no_cpu_op
old2 = theano.config.on_opt_error old2 = theano.config.on_opt_error
......
...@@ -233,7 +233,7 @@ class GpuArrayType(Type): ...@@ -233,7 +233,7 @@ class GpuArrayType(Type):
return data return data
def filter_variable(self, other, allow_convert=True): def filter_variable(self, other, allow_convert=True):
from theano.gpuarray import GpuFromHost from theano.gpuarray.basic_ops import gpu_from_host
if hasattr(other, '_as_GpuArrayVariable'): if hasattr(other, '_as_GpuArrayVariable'):
other = other._as_GpuArrayVariable(self.context_name) other = other._as_GpuArrayVariable(self.context_name)
...@@ -265,7 +265,7 @@ class GpuArrayType(Type): ...@@ -265,7 +265,7 @@ class GpuArrayType(Type):
str(self.broadcastable))) str(self.broadcastable)))
other = other2 other = other2
return GpuFromHost(self.context_name)(other) return gpu_from_host(self.context_name)(other)
@staticmethod @staticmethod
def values_eq(a, b, force_same_dtype=True): def values_eq(a, b, force_same_dtype=True):
......
...@@ -24,10 +24,11 @@ from . import multinomial ...@@ -24,10 +24,11 @@ from . import multinomial
import theano.sandbox.cuda import theano.sandbox.cuda
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name
from theano.gpuarray.type import GpuArrayType from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua, from theano.gpuarray.opt import (register_opt as register_gpua,
register_opt2,
host_from_gpu as host_from_gpua) host_from_gpu as host_from_gpua)
if theano.sandbox.cuda.cuda_available: if theano.sandbox.cuda.cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType, from theano.sandbox.cuda import (CudaNdarrayType,
...@@ -1551,17 +1552,22 @@ class MRG_RandomStreams(object): ...@@ -1551,17 +1552,22 @@ class MRG_RandomStreams(object):
return final_samples return final_samples
@register_opt2([mrg_uniform], 'fast_compile')
def local_gpua_mrg_graph(op, context_name, inputs, outputs):
    """Lift a CPU ``mrg_uniform`` whose state already lives on the GPU.

    Fires only when the node is exactly ``mrg_uniform`` (exact type
    check, so GPU subclasses are excluded) and its first input (the RNG
    state) is already a ``GpuArrayType`` variable. Returns the GPU state
    update plus the samples transferred back to the host; returns None
    (implicitly) otherwise so the optimizer skips the node.
    """
    if (type(op) == mrg_uniform and
        isinstance(inputs[0].type, GpuArrayType)):
        # Build the GPU sampler with the same output ndim/dtype as the
        # CPU op; inputs[1] is the requested output shape.
        outs = GPUA_mrg_uniform.new(inputs[0],
                                    op.output_type.ndim,
                                    op.output_type.dtype,
                                    inputs[1])
        # outs[0]: updated GPU rstate; outs[1]: samples, moved to host
        # so downstream CPU consumers keep working.
        return [outs[0], host_from_gpua(outs[1])]
@register_gpua('fast_compile') @register_gpua('fast_compile')
@local_optimizer([mrg_uniform]) @local_optimizer([mrg_uniform])
def local_gpua_mrg(node): def local_gpua_mrg(node):
# TODO : need description for function context_name = infer_context_name(*node.inputs)
if (type(node.op) == mrg_uniform and return local_gpua_mrg_graph(node.op, context_name, node.inputs, node.outputs)
isinstance(node.inputs[0].type, GpuArrayType)):
outs = GPUA_mrg_uniform.new(node.inputs[0],
node.op.output_type.ndim,
node.op.output_type.dtype,
node.inputs[1])
return [outs[0], host_from_gpua(outs[1])]
MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform) MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)
......
...@@ -152,13 +152,15 @@ def traverse(out, x, x_copy, d, visited=None): ...@@ -152,13 +152,15 @@ def traverse(out, x, x_copy, d, visited=None):
return d return d
visited.add(out) visited.add(out)
from theano.sandbox import cuda from theano.sandbox import cuda
from theano import gpuarray from theano.gpuarray.basic_ops import gpu_from_host, host_from_gpu
from theano.gpuarray import pygpu_activated
from theano.gpuarray.type import GpuArrayType
if out == x: if out == x:
if isinstance(x.type, cuda.CudaNdarrayType): if isinstance(x.type, cuda.CudaNdarrayType):
d[out] = cuda.gpu_from_host(x_copy) d[out] = cuda.gpu_from_host(x_copy)
else: else:
assert isinstance(x.type, gpuarray.GpuArrayType) assert isinstance(x.type, GpuArrayType)
d[out] = gpuarray.GpuFromHost(x.type.context_name)(x_copy) d[out] = gpu_from_host(x.type.context_name)(x_copy)
return d return d
elif out.owner is None: elif out.owner is None:
return d return d
...@@ -167,8 +169,8 @@ def traverse(out, x, x_copy, d, visited=None): ...@@ -167,8 +169,8 @@ def traverse(out, x, x_copy, d, visited=None):
out.owner.inputs == [x]): out.owner.inputs == [x]):
d[out] = tensor.as_tensor_variable(x_copy) d[out] = tensor.as_tensor_variable(x_copy)
return d return d
elif (gpuarray.pygpu_activated and elif (pygpu_activated and
out.owner.op == gpuarray.host_from_gpu and out.owner.op == host_from_gpu and
out.owner.inputs == [x]): out.owner.inputs == [x]):
d[out] = tensor.as_tensor_variable(x_copy) d[out] = tensor.as_tensor_variable(x_copy)
return d return d
......
...@@ -630,9 +630,15 @@ def get_scalar_constant_value(orig_v, elemwise=True, ...@@ -630,9 +630,15 @@ def get_scalar_constant_value(orig_v, elemwise=True,
v = v.owner.inputs[0] v = v.owner.inputs[0]
continue continue
elif isinstance(v.owner.op, theano.compile.ops.Shape_i): elif isinstance(v.owner.op, theano.compile.ops.Shape_i):
if isinstance(v.owner.inputs[0], Constant): i = v.owner.op.i
return numpy.asarray( inp = v.owner.inputs[0]
v.owner.inputs[0].data.shape[v.owner.op.i]) if isinstance(inp, Constant):
return numpy.asarray(inp.data.shape[i])
# The shape of a broadcastable dimension is 1
if (hasattr(inp.type, 'broadcastable') and
inp.type.broadcastable[i]):
return numpy.asarray(1)
# Don't act as the constant_folding optimization here as this # Don't act as the constant_folding optimization here as this
# fct is used too early in the optimization phase. This would # fct is used too early in the optimization phase. This would
# mess with the stabilization optimization and be too slow. # mess with the stabilization optimization and be too slow.
...@@ -2690,15 +2696,18 @@ class Alloc(gof.Op): ...@@ -2690,15 +2696,18 @@ class Alloc(gof.Op):
sh = [as_tensor_variable(s) for s in shape] sh = [as_tensor_variable(s) for s in shape]
bcast = [] bcast = []
for i, s in enumerate(sh): for i, s in enumerate(sh):
if config.exception_verbosity == 'high': def err_str():
s_as_str = '\n' + min_informative_str(s) if config.exception_verbosity == 'high':
else: return '\n' + min_informative_str(s)
s_as_str = str(s) else:
return str(s)
if s.type.dtype[:3] not in ('int', 'uin'): if s.type.dtype[:3] not in ('int', 'uin'):
s_as_str = err_str()
raise TypeError('Shape arguments to Alloc must be integers, ' raise TypeError('Shape arguments to Alloc must be integers, '
'but argument %s is not for apply node: %s' % 'but argument %s is not for apply node: %s' %
(i, s_as_str)) (i, s_as_str))
if s.ndim != 0: if s.ndim != 0:
s_as_str = err_str()
raise TypeError( raise TypeError(
"Each shape dimension to Alloc must be a scalar, ", "Each shape dimension to Alloc must be a scalar, ",
'but dimension %s have %d dimensions for apply node: %s' % 'but dimension %s have %d dimensions for apply node: %s' %
......
...@@ -66,8 +66,10 @@ def get_conv_output_shape(image_shape, kernel_shape, ...@@ -66,8 +66,10 @@ def get_conv_output_shape(image_shape, kernel_shape,
""" """
bsize, imshp = image_shape[0], image_shape[2:] bsize, imshp = image_shape[0], image_shape[2:]
nkern, kshp = kernel_shape[0], kernel_shape[2:] nkern, kshp = kernel_shape[0], kernel_shape[2:]
if filter_dilation is None: if filter_dilation is None:
filter_dilation = numpy.ones(len(subsample), dtype='int') filter_dilation = numpy.ones(len(subsample), dtype='int')
if isinstance(border_mode, tuple): if isinstance(border_mode, tuple):
out_shp = tuple(get_conv_shape_1axis( out_shp = tuple(get_conv_shape_1axis(
imshp[i], kshp[i], border_mode[i], imshp[i], kshp[i], border_mode[i],
...@@ -121,7 +123,16 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode, ...@@ -121,7 +123,16 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
pad = border_mode pad = border_mode
if pad < 0: if pad < 0:
raise ValueError("border_mode must be >= 0") raise ValueError("border_mode must be >= 0")
out_shp = (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
# In case of symbolic shape, we want to build the smallest graph
# (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
if pad == 0:
out_shp = (image_shape - dil_kernel_shape)
else:
out_shp = (image_shape + 2 * pad - dil_kernel_shape)
if subsample != 1:
out_shp = out_shp // subsample
out_shp = out_shp + 1
return out_shp return out_shp
......
...@@ -7003,6 +7003,9 @@ class T_get_scalar_constant_value(unittest.TestCase): ...@@ -7003,6 +7003,9 @@ class T_get_scalar_constant_value(unittest.TestCase):
assert get_scalar_constant_value(s) == 3 assert get_scalar_constant_value(s) == 3
s = opt.Shape_i(1)(c) s = opt.Shape_i(1)(c)
assert get_scalar_constant_value(s) == 4 assert get_scalar_constant_value(s) == 4
d = theano.shared(numpy.random.randn(1,1), broadcastable=(True, True))
f = theano.tensor.basic.ScalarFromTensor()(opt.Shape_i(0)(d))
assert get_scalar_constant_value(f) == 1
def test_elemwise(self): def test_elemwise(self):
# We test only for a few elemwise, the list of all supported # We test only for a few elemwise, the list of all supported
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论